From 30e3af304d96bb2b9fc7b5d053023af2a5db2710 Mon Sep 17 00:00:00 2001 From: dylni <46035563+dylni@users.noreply.github.com> Date: Thu, 28 Nov 2019 17:21:01 -0500 Subject: [PATCH] Initial commit --- .gitignore | 3 + COPYRIGHT | 5 + Cargo.toml | 17 ++ LICENSE-APACHE | 201 +++++++++++++++++++++++ LICENSE-MIT | 21 +++ README.md | 35 ++++ src/lib.rs | 435 +++++++++++++++++++++++++++++++++++++++++++++++++ src/unix.rs | 83 ++++++++++ src/windows.rs | 138 ++++++++++++++++ 9 files changed, 938 insertions(+) create mode 100644 .gitignore create mode 100644 COPYRIGHT create mode 100644 Cargo.toml create mode 100644 LICENSE-APACHE create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 src/lib.rs create mode 100644 src/unix.rs create mode 100644 src/windows.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6936990 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +**/*.rs.bk +Cargo.lock diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 0000000..fb2d62f --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,5 @@ +Copyright (c) 2019 Dylan Iuzzolino + +Licensed under the Apache License, Version 2.0 or the MIT +license , at your option. All files in this project may not be +copied, modified, or distributed except according to those terms. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..fb94412 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "os_str_bytes" +version = "0.1.0" +authors = ["dylni"] +edition = "2018" +exclude = ["/.git*", "/target"] +description = """ +Traits for converting between byte sequences and platform-native strings. 
+""" +repository = "https://github.com/dylni/os_str_bytes" +readme = "README.md" +keywords = ["osstr", "os_str", "osstring", "os_string", "bytes"] +categories = ["command-line-interface", "development-tools::ffi", "os"] +license = "MIT OR Apache-2.0" + +[dev-dependencies] +getrandom = "0.1.13" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..b825ac0 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Dylan Iuzzolino + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..a60c34f --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# OsStr Bytes + +Traits for converting between byte sequences and platform-native strings. + +This crate allows interacting with the bytes stored internally by [`OsStr`] and +[`OsString`], without resorting to panics or data corruption for invalid UTF-8. +Thus, methods can be used that are already defined on [`[u8]`][slice] and +[`Vec`]. + +Typically, the only way to losslessly construct [`OsStr`] or [`OsString`] from +a byte sequence is to use `OsString::from(String::from_utf8(bytes).unwrap())`, which +requires the bytes to be valid in UTF-8. However, since this crate makes +conversions directly between the platform encoding and raw bytes, even some +strings invalid in UTF-8 can be converted. + +## Usage + +Add the following lines to your "Cargo.toml" file: + +```toml +[dependencies] +os_str_bytes = "0.1" +``` + +See the [documentation] for available functionality and examples. + +## Rust version support + +The minimum supported Rust toolchain version is currently Rust 1.32.0. + +[documentation]: https://docs.rs/os_str_bytes +[slice]: https://doc.rust-lang.org/std/primitive.slice.html +[`OsStr`]: https://doc.rust-lang.org/std/ffi/struct.OsStr.html +[`OsString`]: https://doc.rust-lang.org/std/ffi/struct.OsString.html +[`Vec`]: https://doc.rust-lang.org/std/vec/struct.Vec.html diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..29d843e --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,435 @@ +//! Traits for converting between byte sequences and platform-native strings. +//! +//! This crate allows interacting with the bytes stored internally by [`OsStr`] +//! and [`OsString`], without resorting to panics or data corruption for +//! invalid UTF-8. Thus, methods can be used that are already defined on +//! [`[u8]`][slice] and [`Vec`]. +//! +//! Typically, the only way to losslessly construct [`OsStr`] or [`OsString`] +//! 
from a byte sequence is to use `OsString::from(String::from_utf8(bytes)?)`, +//! which requires the bytes to be valid in UTF-8. However, since this crate +//! makes conversions directly between the platform encoding and raw bytes, +//! even some strings invalid in UTF-8 can be converted. +//! +//! # Implementation +//! +//! All traits are [sealed], meaning that they can only be implemented by this +//! crate. Otherwise, backwards compatibility would be more difficult to +//! maintain for new features. +//! +//! # Complexity +//! +//! The time complexities of methods will vary based on what functionality is +//! available for the platform. The most efficient implementation will be used, +//! but it is important to use the most applicable method. For example, +//! [`OsStringBytes::from_vec`] will be at least as efficient as +//! [`OsStringBytes::from_bytes`], but the latter should be used when only a +//! slice is available. +//! +//! # Safety +//! +//! Some unsafe assumptions are made, with the most egregious being that +//! [`str::from_utf8_unchecked`] returns a partially usable string for invalid +//! UTF-8. The alternative would be to encode and decode strings manually, +//! which would be more dangerous, as it would create a reliance on how the +//! standard library encodes invalid UTF-8 strings. +//! +//! To make this implementation less problematic, it is best to not make any +//! assumptions about the representation of invalid UTF-8 bytes. However, given +//! the purpose of this crate, every measure will be taken to ensure that it +//! matches the raw byte sequence, meaning this is usually not a concern. Tests +//! exist to validate that the conversions are sound. +//! +//! # Examples +//! +//! ``` +//! use std::env::temp_dir; +//! use std::ffi::OsStr; +//! use std::fs::read_to_string; +//! use std::fs::write; +//! # use std::io::Result; +//! +//! use os_str_bytes::OsStrBytes; +//! +//! # fn main() -> Result<()> { +//! let string = "hello world"; +//! 
let file_name = b"\xC3\xA9os_str\xED\xA0\xBDbytes\xF0\x9F\x92\xA9.txt"; +//! +//! let mut file = temp_dir(); +//! // In this example, conversion always succeeds, so `unwrap()` can be used. +//! file.push(OsStr::from_bytes(file_name).unwrap()); +//! +//! write(&file, string)?; +//! assert_eq!(string, read_to_string(file)?); +//! # +//! # Ok(()) +//! # } +//! ``` +//! +//! [sealed]: https://rust-lang.github.io/api-guidelines/future-proofing.html#c-sealed +//! [slice]: https://doc.rust-lang.org/std/primitive.slice.html +//! [`OsStr`]: https://doc.rust-lang.org/std/ffi/struct.OsStr.html +//! [`OsString`]: https://doc.rust-lang.org/std/ffi/struct.OsString.html +//! [`OsStringBytes::from_bytes`]: trait.OsStringBytes.html#tymethod.from_bytes +//! [`OsStringBytes::from_vec`]: trait.OsStringBytes.html#tymethod.from_vec +//! [`str::from_utf8_unchecked`]: https://doc.rust-lang.org/std/str/fn.from_utf8_unchecked.html +//! [`Vec`]: https://doc.rust-lang.org/std/vec/struct.Vec.html + +#![doc(html_root_url = "https://docs.rs/os_str_bytes/0.1.0")] + +use std::borrow::Cow; +use std::error::Error; +use std::fmt::Display; +use std::fmt::Formatter; +use std::fmt::Result as FmtResult; + +#[cfg(unix)] +#[path = "unix.rs"] +mod imp; +#[cfg(windows)] +#[path = "windows.rs"] +mod imp; + +/// The error that occurs when a byte sequence is not representable in the +/// platform encoding. +/// +/// On Unix, this error should never occur, but [`OsStrExt`] or [`OsStringExt`] +/// should be used instead if that needs to be guaranteed. 
+/// +/// [`OsStrExt`]: https://doc.rust-lang.org/std/os/unix/ffi/trait.OsStrExt.html +/// [`OsStringExt`]: https://doc.rust-lang.org/std/os/unix/ffi/trait.OsStringExt.html +#[derive(Debug, Eq, PartialEq)] +pub struct EncodingError(()); + +impl Display for EncodingError { + fn fmt(&self, formatter: &mut Formatter<'_>) -> FmtResult { + "byte sequence is not representable in the platform encoding" + .fmt(formatter) + } +} + +impl Error for EncodingError {} + +/// A platform agnostic variant of [`OsStrExt`]. +/// +/// For more information, see [the module-level documentation][module]. +/// +/// [module]: index.html +/// [`OsStrExt`]: https://doc.rust-lang.org/std/os/unix/ffi/trait.OsStrExt.html +pub trait OsStrBytes: private::Sealed + ToOwned { + /// Converts a byte slice into an equivalent platform-native string + /// reference. + /// + /// This method returns [`Cow`] to account for platform differences. + /// However, no guarantee is made that the same variant of that enum will + /// always be returned for the same platform. Whichever can be constructed + /// most efficiently will be returned. + /// + /// # Examples + /// + /// ``` + /// # use std::ffi::OsStr; + /// # + /// # use os_str_bytes::EncodingError; + /// use os_str_bytes::OsStrBytes; + /// + /// # fn main() -> Result<(), EncodingError> { + /// let string = b"foo\xED\xA0\xBDbar"; + /// assert_eq!(string.len(), OsStr::from_bytes(string)?.len()); + /// # Ok(()) + /// # } + /// ``` + /// + /// [`Cow`]: https://doc.rust-lang.org/std/borrow/enum.Cow.html + fn from_bytes(string: &[u8]) -> Result, EncodingError>; + + /// The unsafe equivalent of [`from_bytes`]. + /// + /// More information is given in that method's documentation. + /// + /// # Safety + /// + /// This method is unsafe, because it does not check that the bytes passed + /// are representable in the platform encoding. 
If this constraint is + /// violated, it may cause memory unsafety issues with future uses of this + /// string, as the rest of the standard library assumes that [`OsStr`] and + /// [`OsString`] will be usable for the platform. However, the most likely + /// issue is that the data gets corrupted. + /// + /// [`from_bytes`]: #tymethod.from_bytes + /// [`OsStr`]: https://doc.rust-lang.org/std/ffi/struct.OsStr.html + /// [`OsString`]: https://doc.rust-lang.org/std/ffi/struct.OsString.html + unsafe fn from_bytes_unchecked(string: &[u8]) -> Cow<'_, Self>; + + /// Converts the internal byte representation into a byte slice. + /// + /// For more information, see [`from_bytes`]. + /// + /// # Examples + /// + /// ``` + /// # use std::ffi::OsStr; + /// # + /// # use os_str_bytes::EncodingError; + /// use os_str_bytes::OsStrBytes; + /// + /// # fn main() -> Result<(), EncodingError> { + /// let string = b"foo\xED\xA0\xBDbar"; + /// let os_string = OsStr::from_bytes(string)?.into_owned(); + /// assert_eq!(string, os_string.to_bytes().as_ref()); + /// # Ok(()) + /// # } + /// ``` + /// + /// [`from_bytes`]: #tymethod.from_bytes + fn to_bytes(&self) -> Cow<'_, [u8]>; +} + +/// A platform agnostic variant of [`OsStringExt`]. +/// +/// For more information, see [the module-level documentation][module]. +/// +/// [module]: index.html +/// [`OsStringExt`]: https://doc.rust-lang.org/std/os/unix/ffi/trait.OsStringExt.html +pub trait OsStringBytes: private::Sealed + Sized { + /// Copies a byte slice into a new equivalent platform-native string. 
+ /// + /// # Examples + /// + /// ``` + /// # use std::ffi::OsString; + /// # + /// # use os_str_bytes::EncodingError; + /// use os_str_bytes::OsStringBytes; + /// + /// # fn main() -> Result<(), EncodingError> { + /// let string = b"foo\xED\xA0\xBDbar"; + /// assert_eq!(string.len(), OsString::from_bytes(string)?.len()); + /// # Ok(()) + /// # } + /// ``` + fn from_bytes(string: TString) -> Result + where + TString: AsRef<[u8]>; + + /// The unsafe equivalent of [`from_bytes`]. + /// + /// More information is given in that method's documentation. + /// + /// # Safety + /// + /// This method is unsafe for the same reason as + /// [`OsStrBytes::from_bytes_unchecked`]. + /// + /// [`from_bytes`]: #tymethod.from_bytes + /// [`OsStrBytes::from_bytes_unchecked`]: trait.OsStrBytes.html#tymethod.from_bytes_unchecked + unsafe fn from_bytes_unchecked(string: TString) -> Self + where + TString: AsRef<[u8]>; + + /// Converts a byte vector into an equivalent platform-native string. + /// + /// Whenever possible, the conversion will be performed without copying. + /// + /// # Examples + /// + /// ``` + /// # use std::ffi::OsString; + /// # + /// # use os_str_bytes::EncodingError; + /// use os_str_bytes::OsStringBytes; + /// + /// # fn main() -> Result<(), EncodingError> { + /// let string = b"foo\xED\xA0\xBDbar".to_vec(); + /// assert_eq!(string.len(), OsString::from_vec(string)?.len()); + /// # Ok(()) + /// # } + /// ``` + fn from_vec(string: Vec) -> Result; + + /// The unsafe equivalent of [`from_vec`]. + /// + /// More information is given in that method's documentation. + /// + /// # Safety + /// + /// This method is unsafe for the same reason as + /// [`OsStrBytes::from_bytes_unchecked`]. + /// + /// [`from_vec`]: #tymethod.from_vec + /// [`OsStrBytes::from_bytes_unchecked`]: trait.OsStrBytes.html#tymethod.from_bytes_unchecked + unsafe fn from_vec_unchecked(string: Vec) -> Self; + + /// Converts the internal byte representation into a byte vector. 
+ /// + /// Whenever possible, the conversion will be performed without copying. + /// + /// # Examples + /// + /// ``` + /// # use std::ffi::OsString; + /// # + /// # use os_str_bytes::EncodingError; + /// use os_str_bytes::OsStringBytes; + /// + /// # fn main() -> Result<(), EncodingError> { + /// let string = b"foo\xED\xA0\xBDbar".to_vec(); + /// let os_string = OsString::from_vec(string.clone())?; + /// assert_eq!(string, os_string.into_vec()); + /// # Ok(()) + /// # } + /// ``` + fn into_vec(self) -> Vec; +} + +mod private { + use std::ffi::OsStr; + use std::ffi::OsString; + + pub trait Sealed {} + impl Sealed for OsStr {} + impl Sealed for OsString {} +} + +#[cfg(test)] +mod tests { + use std::ffi::OsStr; + use std::ffi::OsString; + use std::str; + + use getrandom::getrandom; + use getrandom::Error as GetRandomError; + + use crate::EncodingError; + use crate::OsStrBytes; + use crate::OsStringBytes; + + const UTF8_STRING: &str = "string"; + + const WTF8_STRING: &[u8] = b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar"; + + pub(crate) const INVALID_STRING: &[u8] = + b"\xF1foo\xF1\x80bar\xF1\x80\x80baz"; + + const RANDOM_BYTES_LENGTH: usize = 100; + + #[inline] + fn assert_os_eq(left: &OsStr, right: Result) + where + TRight: AsRef, + { + assert_eq!(Ok(left), right.as_ref().map(TRight::as_ref)); + } + + fn random_os_string( + buffer_length: usize, + ) -> Result { + let mut buffer = vec![0; buffer_length]; + #[cfg(unix)] + { + getrandom(&mut buffer)?; + Ok(::std::os::unix::ffi::OsStringExt::from_vec(buffer)) + } + #[cfg(windows)] + { + // SAFETY: These bytes are random, so their values are arbitrary. + getrandom(unsafe { + ::std::mem::transmute::<&mut [u16], &mut [u8]>(&mut buffer) + })?; + Ok(::std::os::windows::ffi::OsStringExt::from_wide(&buffer)) + } + } + + #[test] + fn test_empty_bytes() { + assert_os_eq(&OsString::new(), OsStr::from_bytes(&[])); + assert_os_eq(&OsString::new(), OsString::from_bytes([])); + assert_eq!( + // Assist type inference. 
+ &[b'\0'; 0], + OsString::new().as_os_str().to_bytes().as_ref(), + ); + } + + #[test] + fn test_empty_vec() -> Result<(), EncodingError> { + assert_eq!(0, OsString::from_vec(Vec::new())?.len()); + assert_eq!(Vec::::new(), OsString::new().into_vec()); + Ok(()) + } + + #[test] + fn test_utf8_bytes() { + let os_str = OsString::from(UTF8_STRING); + let os_str = os_str.as_os_str(); + assert_os_eq(&os_str, OsStr::from_bytes(UTF8_STRING.as_bytes())); + assert_os_eq(&os_str, OsString::from_bytes(UTF8_STRING)); + assert_eq!(UTF8_STRING.as_bytes(), os_str.to_bytes().as_ref()); + } + + #[test] + fn test_utf8_vec() { + let os_string = OsString::from(UTF8_STRING); + assert_os_eq( + &os_string, + OsString::from_vec(UTF8_STRING.to_string().into_bytes()), + ); + assert_eq!(UTF8_STRING.to_string().into_bytes(), os_string.into_vec()); + } + + fn test_string_is_invalid_utf8(string: &[u8]) { + assert!(str::from_utf8(string).is_err()); + } + + pub(crate) fn test_bytes(string: &[u8]) -> Result<(), EncodingError> { + let os_string = OsStr::from_bytes(string)?; + assert_eq!(string.len(), os_string.len()); + assert_os_eq(&os_string, OsString::from_bytes(string)); + assert_eq!(string, os_string.to_bytes().as_ref()); + Ok(()) + } + + pub(crate) fn test_vec(string: &[u8]) -> Result<(), EncodingError> { + let os_string = OsString::from_vec(string.to_vec())?; + assert_eq!(string.len(), os_string.len()); + assert_eq!(string, os_string.into_vec().as_slice()); + Ok(()) + } + + #[test] + fn test_invalid_string_is_invalid() { + test_string_is_invalid_utf8(INVALID_STRING); + } + + #[test] + fn test_wtf8_string_is_invalid_utf8() { + test_string_is_invalid_utf8(WTF8_STRING); + } + + #[test] + fn test_wtf8_bytes() -> Result<(), EncodingError> { + test_bytes(WTF8_STRING) + } + + #[test] + fn test_wtf8_vec() -> Result<(), EncodingError> { + test_vec(WTF8_STRING) + } + + #[test] + fn test_random_bytes() { + let os_string = random_os_string(RANDOM_BYTES_LENGTH).unwrap(); + let string = 
os_string.to_bytes(); + assert_eq!(os_string.len(), string.len()); + assert_os_eq(&os_string, OsStr::from_bytes(&string)); + assert_os_eq(&os_string, OsString::from_bytes(string)); + } + + #[test] + fn test_random_vec() { + let os_string = random_os_string(RANDOM_BYTES_LENGTH).unwrap(); + let string = os_string.clone().into_vec(); + assert_eq!(os_string.len(), string.len()); + assert_os_eq(&os_string, OsString::from_vec(string)); + } +} diff --git a/src/unix.rs b/src/unix.rs new file mode 100644 index 0000000..d8e2f37 --- /dev/null +++ b/src/unix.rs @@ -0,0 +1,83 @@ +use std::borrow::Cow; +use std::ffi::OsStr; +use std::ffi::OsString; + +use crate::EncodingError; +use crate::OsStrBytes; +use crate::OsStringBytes; + +#[inline] +fn from_bytes(string: &[u8]) -> Cow<'_, OsStr> { + Cow::Borrowed(::std::os::unix::ffi::OsStrExt::from_bytes(string)) +} + +#[inline] +fn from_vec(string: Vec) -> OsString { + ::std::os::unix::ffi::OsStringExt::from_vec(string) +} + +impl OsStrBytes for OsStr { + #[inline] + fn from_bytes(string: &[u8]) -> Result, EncodingError> { + Ok(from_bytes(string)) + } + + #[inline] + unsafe fn from_bytes_unchecked(string: &[u8]) -> Cow<'_, Self> { + from_bytes(string) + } + + #[inline] + fn to_bytes(&self) -> Cow<'_, [u8]> { + Cow::Borrowed(::std::os::unix::ffi::OsStrExt::as_bytes(self)) + } +} + +impl OsStringBytes for OsString { + #[inline] + fn from_bytes(string: TString) -> Result + where + TString: AsRef<[u8]>, + { + Ok(from_bytes(string.as_ref()).into_owned()) + } + + #[inline] + unsafe fn from_bytes_unchecked(string: TString) -> Self + where + TString: AsRef<[u8]>, + { + from_bytes(string.as_ref()).into_owned() + } + + #[inline] + fn from_vec(string: Vec) -> Result { + Ok(from_vec(string)) + } + + #[inline] + unsafe fn from_vec_unchecked(string: Vec) -> Self { + from_vec(string) + } + + #[inline] + fn into_vec(self) -> Vec { + ::std::os::unix::ffi::OsStringExt::into_vec(self) + } +} + +#[cfg(test)] +mod tests { + use crate::tests::*; + use 
crate::EncodingError; + + #[test] + fn test_invalid_bytes() -> Result<(), EncodingError> { + test_bytes(INVALID_STRING) + } + + #[test] + fn test_invalid_vec() -> Result<(), EncodingError> { + test_vec(INVALID_STRING) + } +} diff --git a/src/windows.rs b/src/windows.rs new file mode 100644 index 0000000..1b4098f --- /dev/null +++ b/src/windows.rs @@ -0,0 +1,138 @@ +// These methods are necessarily inefficient, because they must revert encoding +// conversions performed by the standard library. However, there is currently +// no better alternative. + +use std::borrow::Cow; +use std::char; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::mem::size_of; +use std::str; + +use crate::EncodingError; +use crate::OsStrBytes; +use crate::OsStringBytes; + +fn from_bytes_unchecked(string: &[u8]) -> OsString { + // https://github.com/rust-lang/rust/blob/4560ea788cb760f0a34127156c78e2552949f734/src/libstd/sys_common/wtf8.rs#L813-L831 + + // SAFETY: This conversion technically causes undefined behavior when + // [string] is not representable as UTF-8. However, + // [str::next_code_point()] is not exposed; it is only available + // through [Chars::next()]. This string will be dropped at the end of + // this method. 
+ // https://github.com/rust-lang/rust/blob/4560ea788cb760f0a34127156c78e2552949f734/src/libcore/str/mod.rs#L500-L528 + let unchecked_string = unsafe { + str::from_utf8_unchecked(string) + }; + let mut encoded_chars = Vec::new(); + let mut buffer = [0; 2]; + for unchecked_char in unchecked_string.chars() { + encoded_chars.extend(&*unchecked_char.encode_utf16(&mut buffer)); + } + ::std::os::windows::ffi::OsStringExt::from_wide(&encoded_chars) +} + +impl OsStrBytes for OsStr { + #[inline] + fn from_bytes(string: &[u8]) -> Result, EncodingError> { + Ok(Cow::Owned(OsString::from_bytes(string)?)) + } + + #[inline] + unsafe fn from_bytes_unchecked(string: &[u8]) -> Cow<'_, Self> { + Cow::Owned(OsString::from_bytes_unchecked(string)) + } + + fn to_bytes(&self) -> Cow<'_, [u8]> { + // https://github.com/rust-lang/rust/blob/4560ea788cb760f0a34127156c78e2552949f734/src/libstd/sys_common/wtf8.rs#L183-L201 + + let mut string = Vec::with_capacity(self.len()); + let mut buffer = [0; size_of::()]; + for ch in char::decode_utf16( + ::std::os::windows::ffi::OsStrExt::encode_wide(self), + ) { + let unchecked_char = match ch { + Ok(ch) => ch, + Err(surrogate) => { + let surrogate = surrogate.unpaired_surrogate().into(); + // SAFETY: This conversion creates an invalid [char] value. 
+ // However, there is otherwise no way to encode a [u32] + // value as invalid UTF-8, which is why the standard + // library uses the same approach: + // https://github.com/rust-lang/rust/blob/4560ea788cb760f0a34127156c78e2552949f734/src/libstd/sys_common/wtf8.rs#L206-L208 + unsafe { + char::from_u32_unchecked(surrogate) + } + }, + }; + string.extend_from_slice( + unchecked_char.encode_utf8(&mut buffer).as_bytes(), + ); + } + Cow::Owned(string) + } +} + +impl OsStringBytes for OsString { + fn from_bytes(string: TString) -> Result + where + TString: AsRef<[u8]>, + { + let string = string.as_ref(); + let os_string = from_bytes_unchecked(string); + if os_string.to_bytes() == string { Ok(os_string) } + else { Err(EncodingError(())) } + } + + #[inline] + unsafe fn from_bytes_unchecked(string: TString) -> Self + where + TString: AsRef<[u8]>, + { + from_bytes_unchecked(string.as_ref()) + } + + #[inline] + fn from_vec(string: Vec) -> Result { + Self::from_bytes(string) + } + + #[inline] + unsafe fn from_vec_unchecked(string: Vec) -> Self { + Self::from_bytes_unchecked(string) + } + + #[inline] + fn into_vec(self) -> Vec { + self.as_os_str().to_bytes().into_owned() + } +} + +#[cfg(test)] +mod tests { + use std::ffi::OsStr; + use std::ffi::OsString; + + use crate::tests::*; + use crate::EncodingError; + use crate::OsStrBytes; + use crate::OsStringBytes; + + #[test] + fn test_invalid_bytes() { + assert_eq!(Err(EncodingError(())), OsStr::from_bytes(INVALID_STRING)); + assert_eq!( + Err(EncodingError(())), + OsString::from_bytes(INVALID_STRING), + ); + } + + #[test] + fn test_invalid_vec() { + assert_eq!( + Err(EncodingError(())), + OsString::from_vec(INVALID_STRING.to_vec()), + ); + } +}