Bug 1857742 - patch 1 - Vendor the oxilangtag crate into third_party/rust. r=supply-chain-reviewers,dholbert

Differential Revision: https://phabricator.services.mozilla.com/D193891
This commit is contained in:
Jonathan Kew 2023-11-18 10:36:00 +00:00
parent 819c082c31
commit 94a3c09204
12 changed files with 1871 additions and 0 deletions

7
Cargo.lock generated
View File

@ -2207,6 +2207,7 @@ dependencies = [
"nsstring",
"oblivious_http",
"origin-trials-ffi",
"oxilangtag",
"prefs_parser",
"processtools",
"profiler_helper",
@ -4132,6 +4133,12 @@ dependencies = [
"stable_deref_trait",
]
[[package]]
name = "oxilangtag"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d91edf4fbb970279443471345a4e8c491bf05bb283b3e6c88e4e606fd8c181b"
[[package]]
name = "packed_simd"
version = "0.3.9"

View File

@ -2529,6 +2529,15 @@ who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy"
delta = "6.3.0 -> 6.4.1"
[[audits.oxilangtag]]
who = "Jonathan Kew <jkew@mozilla.com>"
criteria = "safe-to-deploy"
version = "0.1.3"
notes = """
I have reviewed all the code in this (small) crate.
There is no unsafe code present.
"""
[[audits.packed_simd]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"

View File

@ -0,0 +1 @@
{"files":{"CHANGELOG.md":"3d0f3240ed450d19b894dd8715e20bbec50a14eb0d357df8c09a4af1f19fc831","Cargo.toml":"b8414a40b2cdeb5b34dc4b7e79a5e192b56b953d9db1a762dbf3e8728074dd6a","LICENSE":"3fe41c99abc306c2cd34a9365b1810035ae93335ebf4736c0240b469b3f410eb","README.md":"fc98b140225bc0521a136c2c1ed8146f7398349a36d52481f97d8ec2b7679619","benches/lib.rs":"61c94b95e005c0df25ff740ddc7801d65f68bd6e00c0b8aca7eeb66b103f9eea","deny.toml":"fce6beebdde75e3950abfd230b5110d485f2daf5a333cc77b447669593fa7c62","src/lib.rs":"92c85f535a42b8dde8c2f3078c61e4e1580d326ac621eba2f410bdee521be41d","tests/lib.rs":"9927c137f39094cfd8fbcf56069a047818112374148e8950fd73708e9ae0382a"},"package":"8d91edf4fbb970279443471345a4e8c491bf05bb283b3e6c88e4e606fd8c181b"}

View File

@ -0,0 +1,16 @@
# Changelog
## [0.1.3] - 2022-03-26
### Added
- `LanguageTag` now implements the Serde `Serialize` and `Deserialize` traits if the `serde` crate is present.
The serialization is a plain string.
## [0.1.2] - 2021-04-16
### Added
- `LanguageTag` struct with a parser, case normalization and components accessors.
### Changed
- Proper attribution from [`language-tags`](https://github.com/pyfisch/rust-language-tags/).

39
third_party/rust/oxilangtag/Cargo.toml vendored Normal file
View File

@ -0,0 +1,39 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "oxilangtag"
version = "0.1.3"
authors = ["Tpt <thomas@pellissier-tanon.fr>"]
description = "Simple and fast implementation of language tag normalization and validation\n"
readme = "README.md"
keywords = ["language-tag", "BCP47"]
license = "MIT"
repository = "https://github.com/oxigraph/oxilangtag"
[package.metadata.docs.rs]
all-features = true
[[bench]]
name = "lib"
harness = false
[dependencies.serde]
version = "1"
optional = true
[dev-dependencies.criterion]
version = "0.3"
[dev-dependencies.serde_test]
version = "1"
[features]
default = []
serialize = ["serde"]

19
third_party/rust/oxilangtag/LICENSE vendored Normal file
View File

@ -0,0 +1,19 @@
Copyright (c) 2015-2021 Pyfisch Tpt
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

46
third_party/rust/oxilangtag/README.md vendored Normal file
View File

@ -0,0 +1,46 @@
oxilangtag
==========
[![actions status](https://github.com/oxigraph/oxilangtag/workflows/build/badge.svg)](https://github.com/oxigraph/oxilangtag/actions)
[![Latest Version](https://img.shields.io/crates/v/oxilangtag.svg)](https://crates.io/crates/oxilangtag)
[![Released API docs](https://docs.rs/oxilangtag/badge.svg)](https://docs.rs/oxilangtag)
OxiLangTag is a Rust library for validating and normalizing language tags following [RFC 5646](https://tools.ietf.org/html/rfc5646)
([BCP 47](https://tools.ietf.org/html/bcp47)).
It is a fork of the [`language-tags`](https://github.com/pyfisch/rust-language-tags/) focusing on [RDF use cases](https://www.w3.org/TR/rdf11-primer/).
You might find the [`language-tags`](https://github.com/pyfisch/rust-language-tags/) crate more convenient.
It allows language tag validation without any heap allocation.
Getters are also provided to easily retrieve the various language tag components.
If [`serde`](https://serde.rs/) is available, `LanguageTag` implements the `Serialize` and `Deserialize` traits and encodes the language tag as a string.
Example:
```rust
use oxilangtag::LanguageTag;
// Parsing and validation
let language_tag = LanguageTag::parse("zh-cmn-Hans-CN-x-test").unwrap();
assert_eq!(language_tag.as_str(), "zh-cmn-Hans-CN-x-test");
// Language tag components
assert_eq!(language_tag.primary_language(), "zh");
assert_eq!(language_tag.extended_language(), Some("cmn"));
assert_eq!(language_tag.full_language(), "zh-cmn");
assert_eq!(language_tag.script(), Some("Hans"));
assert_eq!(language_tag.region(), Some("CN"));
assert_eq!(language_tag.extension(), None);
assert_eq!(language_tag.private_use_subtags().collect::<Vec<_>>(), vec!["test"]);
```
## License
This project is licensed under the MIT license ([LICENSE](LICENSE) or `<http://opensource.org/licenses/MIT>`).
It is based on the [`language-tags`](https://github.com/pyfisch/rust-language-tags/) crate by [pyfisch](https://github.com/pyfisch) under MIT license.
### Contribution
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in Oxilangtag by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.

View File

@ -0,0 +1,77 @@
use criterion::{criterion_group, criterion_main, Criterion};
use oxilangtag::LanguageTag;
/// Benchmarks `LanguageTag::parse` over a fixed list of example tags.
///
/// Every measured iteration parses each example once and unwraps the result,
/// so an example that is rejected by the parser makes the benchmark panic.
fn bench_language_tag_parse(c: &mut Criterion) {
    let examples = [
        "fr",
        "fr-Latn",
        "fr-fra",
        "fr-Latn-FR",
        "fr-Latn-419",
        "fr-FR",
        "ax-TZ",
        "fr-shadok",
        "fr-y-myext-myext2",
        "fra-Latn",
        "fra",
        "fra-FX",
        "i-klingon",
        "I-kLINgon",
        "no-bok",
        "fr-Lat",
        "mn-Cyrl-MN",
        "mN-cYrL-Mn",
        "fr-Latn-CA",
        "en-US",
        "fr-Latn-CA",
        "i-enochian",
        "x-fr-CH",
        "sr-Latn-CS",
        "es-419",
        "sl-nedis",
        "de-CH-1996",
        "de-Latg-1996",
        "sl-IT-nedis",
        "en-a-bbb-x-a-ccc",
        "de-a-value",
        "en-Latn-GB-boont-r-extended-sequence-x-private",
        "en-x-US",
        "az-Arab-x-AZE-derbend",
        "es-Latn-CO-x-private",
        "en-US-boont",
        "ab-x-abc-x-abc",
        "ab-x-abc-a-a",
        "i-default",
        "i-klingon",
        "abcd-Latn",
        "AaBbCcDd-x-y-any-x",
        "en",
        "de-AT",
        "es-419",
        "de-CH-1901",
        "sr-Cyrl",
        "sr-Cyrl-CS",
        "sl-Latn-IT-rozaj",
        "en-US-x-twain",
        "zh-cmn",
        "zh-cmn-Hant",
        "zh-cmn-Hant-HK",
        "zh-gan",
        "zh-yue-Hant-HK",
        "xr-lxs-qut",
        "xr-lqt-qu",
        "xr-p-lze",
    ];

    c.bench_function("language tag parse tests", |bencher| {
        bencher.iter(|| {
            examples.iter().for_each(|example| {
                LanguageTag::parse(*example).unwrap();
            })
        })
    });
}
criterion_group!(language_tag, bench_language_tag_parse);
criterion_main!(language_tag);

11
third_party/rust/oxilangtag/deny.toml vendored Normal file
View File

@ -0,0 +1,11 @@
[licenses]
unlicensed = "deny"
allow = [
"MIT",
"Apache-2.0"
]
default = "deny"
[bans]
multiple-versions = "warn"
wildcards = "deny"

923
third_party/rust/oxilangtag/src/lib.rs vendored Normal file
View File

@ -0,0 +1,923 @@
#![doc = include_str!("../README.md")]
#![deny(unsafe_code)]
#[cfg(feature = "serde")]
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::borrow::{Borrow, Cow};
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::iter::once;
use std::ops::Deref;
use std::str::{FromStr, Split};
/// A [RFC 5646](https://tools.ietf.org/html/rfc5646) language tag.
///
/// The tag text is kept in a value of type `T`, next to the offsets of its
/// components that were computed when the tag was parsed.
///
/// ```
/// use oxilangtag::LanguageTag;
///
/// let language_tag = LanguageTag::parse("en-us").unwrap();
/// assert_eq!(language_tag.into_inner(), "en-us")
/// ```
#[derive(Copy, Clone)]
pub struct LanguageTag<T> {
// The tag text itself.
tag: T,
// Offsets inside `tag` at which each component ends; used by the accessors to slice `tag`.
positions: TagElementsPositions,
}
impl<T: Deref<Target = str>> LanguageTag<T> {
    /// Parses a language tag according to [RFC 5646](https://tools.ietf.org/html/rfc5646)
    /// and checks if the tag is ["well-formed"](https://tools.ietf.org/html/rfc5646#section-2.2.9).
    ///
    /// This operation keeps internally the `tag` parameter and does not allocate on the heap.
    /// The tag is stored exactly as given: its case is not normalized.
    ///
    /// # Errors
    ///
    /// Returns a [`LanguageTagParseError`] if the parser rejects `tag`.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("en-us").unwrap();
    /// assert_eq!(language_tag.into_inner(), "en-us")
    /// ```
    pub fn parse(tag: T) -> Result<Self, LanguageTagParseError> {
        let positions = parse_language_tag(&tag, &mut VoidOutputBuffer::default())?;
        Ok(Self { tag, positions })
    }

    /// Returns the underlying language tag representation as a string slice.
    #[inline]
    pub fn as_str(&self) -> &str {
        &self.tag
    }

    /// Consumes the language tag and returns the underlying representation.
    #[inline]
    pub fn into_inner(self) -> T {
        self.tag
    }

    /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
    /// assert_eq!(language_tag.primary_language(), "zh");
    /// ```
    #[inline]
    pub fn primary_language(&self) -> &str {
        &self.tag[..self.positions.language_end]
    }

    /// Returns the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
    ///
    /// Valid language tags have at most one extended language.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
    /// assert_eq!(language_tag.extended_language(), Some("cmn"));
    /// ```
    #[inline]
    pub fn extended_language(&self) -> Option<&str> {
        if self.positions.language_end == self.positions.extlang_end {
            None
        } else {
            // `+ 1` skips the dash that separates this component from the previous one.
            Some(&self.tag[self.positions.language_end + 1..self.positions.extlang_end])
        }
    }

    /// Iterates on the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
    ///
    /// Valid language tags have at most one extended language.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
    /// assert_eq!(language_tag.extended_language_subtags().collect::<Vec<_>>(), vec!["cmn"]);
    /// ```
    #[inline]
    pub fn extended_language_subtags(&self) -> impl Iterator<Item = &str> {
        self.extended_language().unwrap_or("").split_terminator('-')
    }

    /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1)
    /// and its [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
    /// assert_eq!(language_tag.full_language(), "zh-cmn");
    /// ```
    #[inline]
    pub fn full_language(&self) -> &str {
        &self.tag[..self.positions.extlang_end]
    }

    /// Returns the [script subtag](https://tools.ietf.org/html/rfc5646#section-2.2.3).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
    /// assert_eq!(language_tag.script(), Some("Hans"));
    /// ```
    #[inline]
    pub fn script(&self) -> Option<&str> {
        if self.positions.extlang_end == self.positions.script_end {
            None
        } else {
            Some(&self.tag[self.positions.extlang_end + 1..self.positions.script_end])
        }
    }

    /// Returns the [region subtag](https://tools.ietf.org/html/rfc5646#section-2.2.4).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
    /// assert_eq!(language_tag.region(), Some("CN"));
    /// ```
    #[inline]
    pub fn region(&self) -> Option<&str> {
        if self.positions.script_end == self.positions.region_end {
            None
        } else {
            Some(&self.tag[self.positions.script_end + 1..self.positions.region_end])
        }
    }

    /// Returns the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap();
    /// assert_eq!(language_tag.variant(), Some("pinyin"));
    /// ```
    #[inline]
    pub fn variant(&self) -> Option<&str> {
        if self.positions.region_end == self.positions.variant_end {
            None
        } else {
            Some(&self.tag[self.positions.region_end + 1..self.positions.variant_end])
        }
    }

    /// Iterates on the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap();
    /// assert_eq!(language_tag.variant_subtags().collect::<Vec<_>>(), vec!["pinyin"]);
    /// ```
    #[inline]
    pub fn variant_subtags(&self) -> impl Iterator<Item = &str> {
        self.variant().unwrap_or("").split_terminator('-')
    }

    /// Returns the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6).
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap();
    /// assert_eq!(language_tag.extension(), Some("u-co-phonebk"));
    /// ```
    #[inline]
    pub fn extension(&self) -> Option<&str> {
        if self.positions.variant_end == self.positions.extension_end {
            None
        } else {
            Some(&self.tag[self.positions.variant_end + 1..self.positions.extension_end])
        }
    }

    /// Iterates on the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6).
    ///
    /// Each item is the extension singleton together with the subtags that follow it.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap();
    /// assert_eq!(language_tag.extension_subtags().collect::<Vec<_>>(), vec![('u', "co-phonebk")]);
    /// ```
    #[inline]
    pub fn extension_subtags(&self) -> impl Iterator<Item = (char, &str)> {
        ExtensionsIterator::new(self.extension().unwrap_or(""))
    }

    /// Returns the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7).
    ///
    /// The returned slice includes the leading `x` singleton. For a tag made only
    /// of private use subtags the whole tag is returned, whatever the case of its
    /// `x-` prefix.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap();
    /// assert_eq!(language_tag.private_use(), Some("x-foo-bar"));
    /// ```
    #[inline]
    pub fn private_use(&self) -> Option<&str> {
        // The parser accepts private-use-only tags with an `x-` or an `X-` prefix and
        // `parse` keeps the original case, so both spellings must be recognised here.
        // No other accepted tag can start this way: a primary language has at least
        // two letters and no grandfathered tag starts with `x`.
        if self.tag.starts_with("x-") || self.tag.starts_with("X-") {
            Some(&self.tag)
        } else if self.positions.extension_end == self.tag.len() {
            None
        } else {
            Some(&self.tag[self.positions.extension_end + 1..])
        }
    }

    /// Iterates on the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7).
    ///
    /// The leading `x` singleton is not part of the iteration.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap();
    /// assert_eq!(language_tag.private_use_subtags().collect::<Vec<_>>(), vec!["foo", "bar"]);
    /// ```
    #[inline]
    pub fn private_use_subtags(&self) -> impl Iterator<Item = &str> {
        self.private_use()
            // Drops the two-byte `x-` prefix.
            .map(|part| &part[2..])
            .unwrap_or("")
            .split_terminator('-')
    }
}
impl LanguageTag<String> {
    /// Parses a language tag according to [RFC 5646](https://tools.ietf.org/html/rfc5646)
    /// and normalizes its case.
    ///
    /// This parser accepts the language tags that are "well-formed" according to
    /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9).
    ///
    /// This operation does heap allocation: the normalized tag is written to a
    /// freshly allocated `String`.
    ///
    /// ```
    /// use oxilangtag::LanguageTag;
    ///
    /// let language_tag = LanguageTag::parse_and_normalize("en-us").unwrap();
    /// assert_eq!(language_tag.into_inner(), "en-US")
    /// ```
    pub fn parse_and_normalize(tag: &str) -> Result<Self, LanguageTagParseError> {
        let mut normalized = String::with_capacity(tag.len());
        parse_language_tag(tag, &mut normalized).map(|positions| Self {
            tag: normalized,
            positions,
        })
    }
}
/// Language tags are compared through their underlying representations only.
impl<Lft, Rhs> PartialEq<LanguageTag<Rhs>> for LanguageTag<Lft>
where
    Lft: PartialEq<Rhs>,
{
    #[inline]
    fn eq(&self, other: &LanguageTag<Rhs>) -> bool {
        self.tag == other.tag
    }
}
// Comparisons of a language tag (left-hand side) with plain string types.
// Each one delegates to the comparison provided by the underlying representation.

impl<T> PartialEq<str> for LanguageTag<T>
where
    T: PartialEq<str>,
{
    #[inline]
    fn eq(&self, other: &str) -> bool {
        self.tag == *other
    }
}

impl<'a, T> PartialEq<&'a str> for LanguageTag<T>
where
    T: PartialEq<&'a str>,
{
    #[inline]
    fn eq(&self, other: &&'a str) -> bool {
        self.tag == *other
    }
}

impl<T> PartialEq<String> for LanguageTag<T>
where
    T: PartialEq<String>,
{
    #[inline]
    fn eq(&self, other: &String) -> bool {
        self.tag == *other
    }
}

impl<'a, T> PartialEq<Cow<'a, str>> for LanguageTag<T>
where
    T: PartialEq<Cow<'a, str>>,
{
    #[inline]
    fn eq(&self, other: &Cow<'a, str>) -> bool {
        self.tag == *other
    }
}
// Comparisons of plain string types (left-hand side) with a language tag.
// The comparison is still evaluated by the tag's underlying representation.

impl<T> PartialEq<LanguageTag<T>> for str
where
    T: PartialEq<str>,
{
    #[inline]
    fn eq(&self, other: &LanguageTag<T>) -> bool {
        other.tag == *self
    }
}

impl<'a, T> PartialEq<LanguageTag<T>> for &'a str
where
    T: PartialEq<&'a str>,
{
    #[inline]
    fn eq(&self, other: &LanguageTag<T>) -> bool {
        other.tag == *self
    }
}

impl<T> PartialEq<LanguageTag<T>> for String
where
    T: PartialEq<String>,
{
    #[inline]
    fn eq(&self, other: &LanguageTag<T>) -> bool {
        other.tag == *self
    }
}

impl<'a, T> PartialEq<LanguageTag<T>> for Cow<'a, str>
where
    T: PartialEq<Cow<'a, str>>,
{
    #[inline]
    fn eq(&self, other: &LanguageTag<T>) -> bool {
        other.tag == *self
    }
}
// Equality, hashing and ordering only look at the underlying tag representation.

impl<T> Eq for LanguageTag<T> where T: Eq {}

impl<T> Hash for LanguageTag<T>
where
    T: Hash,
{
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(&self.tag, state)
    }
}

impl<T> PartialOrd for LanguageTag<T>
where
    T: PartialOrd,
{
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        PartialOrd::partial_cmp(&self.tag, &other.tag)
    }
}

impl<T> Ord for LanguageTag<T>
where
    T: Ord,
{
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        Ord::cmp(&self.tag, &other.tag)
    }
}
// Views of a language tag as a string slice, all forwarded to the underlying representation.

impl<T> Deref for LanguageTag<T>
where
    T: Deref<Target = str>,
{
    type Target = str;

    #[inline]
    fn deref(&self) -> &str {
        &*self.tag
    }
}

impl<T> AsRef<str> for LanguageTag<T>
where
    T: AsRef<str>,
{
    #[inline]
    fn as_ref(&self) -> &str {
        AsRef::<str>::as_ref(&self.tag)
    }
}

impl<T> Borrow<str> for LanguageTag<T>
where
    T: Borrow<str>,
{
    #[inline]
    fn borrow(&self) -> &str {
        Borrow::<str>::borrow(&self.tag)
    }
}
// Formatting is forwarded as-is to the underlying tag representation.

impl<T> fmt::Debug for LanguageTag<T>
where
    T: fmt::Debug,
{
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(&self.tag, f)
    }
}

impl<T> fmt::Display for LanguageTag<T>
where
    T: fmt::Display,
{
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.tag, f)
    }
}
impl FromStr for LanguageTag<String> {
type Err = LanguageTagParseError;
#[inline]
fn from_str(tag: &str) -> Result<Self, LanguageTagParseError> {
Self::parse_and_normalize(tag)
}
}
impl<'a> From<LanguageTag<&'a str>> for LanguageTag<String> {
#[inline]
fn from(tag: LanguageTag<&'a str>) -> Self {
Self {
tag: tag.tag.into(),
positions: tag.positions,
}
}
}
impl<'a> From<LanguageTag<Cow<'a, str>>> for LanguageTag<String> {
#[inline]
fn from(tag: LanguageTag<Cow<'a, str>>) -> Self {
Self {
tag: tag.tag.into(),
positions: tag.positions,
}
}
}
impl From<LanguageTag<Box<str>>> for LanguageTag<String> {
#[inline]
fn from(tag: LanguageTag<Box<str>>) -> Self {
Self {
tag: tag.tag.into(),
positions: tag.positions,
}
}
}
impl<'a> From<LanguageTag<&'a str>> for LanguageTag<Cow<'a, str>> {
#[inline]
fn from(tag: LanguageTag<&'a str>) -> Self {
Self {
tag: tag.tag.into(),
positions: tag.positions,
}
}
}
impl<'a> From<LanguageTag<String>> for LanguageTag<Cow<'a, str>> {
#[inline]
fn from(tag: LanguageTag<String>) -> Self {
Self {
tag: tag.tag.into(),
positions: tag.positions,
}
}
}
/// Serializes the language tag exactly like its underlying representation.
#[cfg(feature = "serde")]
impl<T> Serialize for LanguageTag<T>
where
    T: Serialize,
{
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        Serialize::serialize(&self.tag, serializer)
    }
}

/// Deserializes the underlying representation, then validates it with `LanguageTag::parse`.
///
/// A parsing failure is reported as a custom deserialization error.
#[cfg(feature = "serde")]
impl<'de, T> Deserialize<'de> for LanguageTag<T>
where
    T: Deref<Target = str> + Deserialize<'de>,
{
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<LanguageTag<T>, D::Error> {
        use serde::de::Error;

        let raw = T::deserialize(deserializer)?;
        Self::parse(raw).map_err(D::Error::custom)
    }
}
/// An error raised during [`LanguageTag`](struct.LanguageTag.html) validation.
///
/// The precise reason is only exposed through the `Display` message.
#[derive(Debug)]
pub struct LanguageTagParseError {
    kind: TagParseErrorKind,
}

impl fmt::Display for LanguageTagParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Every message is a fixed text, so pick it first and write it once.
        let message = match self.kind {
            TagParseErrorKind::EmptyExtension => {
                "If an extension subtag is present, it must not be empty"
            }
            TagParseErrorKind::EmptyPrivateUse => {
                "If the `x` subtag is present, it must not be empty"
            }
            TagParseErrorKind::ForbiddenChar => "The langtag contains a char not allowed",
            TagParseErrorKind::InvalidSubtag => {
                "A subtag fails to parse, it does not match any other subtags"
            }
            TagParseErrorKind::InvalidLanguage => "The given language subtag is invalid",
            TagParseErrorKind::SubtagTooLong => {
                "A subtag may be eight characters in length at maximum"
            }
            TagParseErrorKind::EmptySubtag => "A subtag should not be empty",
            TagParseErrorKind::TooManyExtlangs => "At maximum three extlangs are allowed",
        };
        f.write_str(message)
    }
}

impl Error for LanguageTagParseError {}

/// The reasons for which a language tag can be rejected.
#[derive(Debug)]
enum TagParseErrorKind {
    /// If an extension subtag is present, it must not be empty.
    EmptyExtension,
    /// If the `x` subtag is present, it must not be empty.
    EmptyPrivateUse,
    /// The langtag contains a char that is not A-Z, a-z, 0-9 or the dash.
    ForbiddenChar,
    /// A subtag fails to parse, it does not match any other subtags.
    InvalidSubtag,
    /// The given language subtag is invalid.
    InvalidLanguage,
    /// A subtag may be eight characters in length at maximum.
    SubtagTooLong,
    /// A subtag should not be empty.
    EmptySubtag,
    /// At maximum three extlangs are allowed, but zero to one extlangs are preferred.
    TooManyExtlangs,
}
// Offsets inside the tag text at which each component ends.
//
// The accessors of `LanguageTag` slice the tag between two consecutive offsets,
// and treat two equal consecutive offsets as "this component is absent".
#[derive(Copy, Clone, Debug)]
struct TagElementsPositions {
// End of the primary language subtag.
language_end: usize,
// End of the extended language subtags.
extlang_end: usize,
// End of the script subtag.
script_end: usize,
// End of the region subtag.
region_end: usize,
// End of the variant subtags.
variant_end: usize,
// End of the extension subtags; anything after it is read as private use.
extension_end: usize,
}
/// Sink into which the parser writes the normalized form of a tag.
trait OutputBuffer: Extend<char> {
    /// Appends a single character.
    fn push(&mut self, c: char);
    /// Appends a whole string slice.
    fn push_str(&mut self, s: &str);
}

/// An [`OutputBuffer`] that throws away everything written to it.
///
/// It is used when a tag only has to be validated, not normalized.
#[derive(Default)]
struct VoidOutputBuffer {}

impl Extend<char> for VoidOutputBuffer {
    #[inline]
    fn extend<T: IntoIterator<Item = char>>(&mut self, _: T) {}
}

impl OutputBuffer for VoidOutputBuffer {
    #[inline]
    fn push(&mut self, _: char) {}

    #[inline]
    fn push_str(&mut self, _: &str) {}
}

/// A `String` simply accumulates what is written to it.
impl OutputBuffer for String {
    #[inline]
    fn push(&mut self, c: char) {
        // Explicitly targets the inherent method, not this trait method.
        String::push(self, c)
    }

    #[inline]
    fn push_str(&mut self, s: &str) {
        String::push_str(self, s)
    }
}
/// Parses language tag following [the RFC5646 grammar](https://tools.ietf.org/html/rfc5646#section-2.1)
///
/// Three kinds of tags are handled, in this order:
/// 1. grandfathered tags, matched case-insensitively against `GRANDFATHEREDS`
///    and written to `output` with the case used in that list;
/// 2. tags made only of private use subtags (`x-...`), written lowercased;
/// 3. regular tags, delegated to `parse_langtag`.
///
/// For the first two kinds every returned position is the end of the tag.
///
/// # Errors
///
/// For private-use-only tags: `ForbiddenChar` if a character is neither ASCII
/// alphanumeric nor a dash, `EmptyPrivateUse` if nothing follows `x-`,
/// `EmptySubtag` if a subtag is empty and `SubtagTooLong` if a subtag is longer
/// than eight characters. Regular tags fail with the errors of `parse_langtag`.
fn parse_language_tag(
    input: &str,
    output: &mut impl OutputBuffer,
) -> Result<TagElementsPositions, LanguageTagParseError> {
    // Positions of a tag that has no inner structure: everything ends at `len`.
    let unstructured = |len: usize| TagElementsPositions {
        language_end: len,
        extlang_end: len,
        script_end: len,
        region_end: len,
        variant_end: len,
        extension_end: len,
    };

    if let Some(tag) = GRANDFATHEREDS
        .iter()
        .find(|record| record.eq_ignore_ascii_case(input))
    {
        // Grandfathered tag
        output.push_str(tag);
        return Ok(unstructured(tag.len()));
    }

    if input.starts_with("x-") || input.starts_with("X-") {
        // Private use only tag
        if !is_alphanumeric_or_dash(input) {
            return Err(LanguageTagParseError {
                kind: TagParseErrorKind::ForbiddenChar,
            });
        }
        if input.len() == 2 {
            return Err(LanguageTagParseError {
                kind: TagParseErrorKind::EmptyPrivateUse,
            });
        }
        // The grammar requires each private use subtag to be 1 to 8 alphanumeric
        // characters: reject `x--`, `x-a-` or `x-waytoolongsubtag` like the regular
        // tag parser does. Slicing at 2 is safe: the prefix is two ASCII bytes.
        for subtag in input[2..].split('-') {
            if subtag.is_empty() {
                return Err(LanguageTagParseError {
                    kind: TagParseErrorKind::EmptySubtag,
                });
            }
            if subtag.len() > 8 {
                return Err(LanguageTagParseError {
                    kind: TagParseErrorKind::SubtagTooLong,
                });
            }
        }
        output.extend(input.chars().map(|c| c.to_ascii_lowercase()));
        return Ok(unstructured(input.len()));
    }

    parse_langtag(input, output)
}
/// Handles normal tags.
///
/// The input is split on dashes and each subtag is classified by a small state
/// machine according to its length, its characters and what has already been seen.
/// While doing so the case-normalized subtag is written to `output`: lowercase by
/// default, first letter uppercased for scripts, fully uppercased for regions.
///
/// On success, returns the offset at which each component ends in `input`.
/// A component that is absent gets the same end offset as the component before it.
///
/// Fails on the first subtag that is empty, longer than eight bytes or that does
/// not fit any component allowed at that point, and at the end of the input if an
/// extension singleton or the `x` singleton is not followed by any subtag.
fn parse_langtag(
input: &str,
output: &mut impl OutputBuffer,
) -> Result<TagElementsPositions, LanguageTagParseError> {
// What has been read so far. `expected` is `true` right after a singleton,
// i.e. while at least one more subtag is still required.
#[derive(PartialEq, Eq)]
enum State {
Start,
AfterLanguage,
AfterExtLang,
AfterScript,
AfterRegion,
InExtension { expected: bool },
InPrivateUse { expected: bool },
}
let mut state = State::Start;
// End offsets of the components; they stay at 0 until the component is seen.
let mut language_end = 0;
let mut extlang_end = 0;
let mut script_end = 0;
let mut region_end = 0;
let mut variant_end = 0;
let mut extension_end = 0;
let mut extlangs_count = 0;
for (subtag, end) in SubTagIterator::new(input) {
// Checks shared by all the subtags
if subtag.is_empty() {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::EmptySubtag,
});
}
if subtag.len() > 8 {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::SubtagTooLong,
});
}
if state == State::Start {
// Primary language
if subtag.len() < 2 || !is_alphabetic(subtag) {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::InvalidLanguage,
});
}
language_end = end;
output.extend(to_lowercase(subtag));
if subtag.len() < 4 {
// extlangs are only allowed for short language tags
state = State::AfterLanguage;
} else {
state = State::AfterExtLang;
}
} else if let State::InPrivateUse { .. } = state {
// Once in private use, every remaining subtag belongs to it
if !is_alphanumeric(subtag) {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::InvalidSubtag,
});
}
output.push('-');
output.extend(to_lowercase(subtag));
state = State::InPrivateUse { expected: false };
} else if subtag == "x" || subtag == "X" {
// Start of the private use part.
// We make sure the previous extension, if any, is not empty
if let State::InExtension { expected: true } = state {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::EmptyExtension,
});
}
output.push('-');
output.push('x');
state = State::InPrivateUse { expected: true };
} else if subtag.len() == 1 && is_alphanumeric(subtag) {
// Any other singleton starts a new extension.
// We make sure the previous extension, if any, is not empty
if let State::InExtension { expected: true } = state {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::EmptyExtension,
});
}
let extension_tag = subtag.chars().next().unwrap().to_ascii_lowercase();
output.push('-');
output.push(extension_tag);
state = State::InExtension { expected: true };
} else if let State::InExtension { .. } = state {
// Subtag of the current extension
if !is_alphanumeric(subtag) {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::InvalidSubtag,
});
}
extension_end = end;
output.push('-');
output.extend(to_lowercase(subtag));
state = State::InExtension { expected: false };
} else if state == State::AfterLanguage && subtag.len() == 3 && is_alphabetic(subtag) {
// Extended language; the state is left unchanged so that more of them may follow
extlangs_count += 1;
if extlangs_count > 3 {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::TooManyExtlangs,
});
}
// valid extlangs
extlang_end = end;
output.push('-');
output.extend(to_lowercase(subtag));
} else if (state == State::AfterLanguage || state == State::AfterExtLang)
&& subtag.len() == 4
&& is_alphabetic(subtag)
{
// Script
script_end = end;
output.push('-');
output.extend(to_uppercase_first(subtag));
state = State::AfterScript;
} else if (state == State::AfterLanguage
|| state == State::AfterExtLang
|| state == State::AfterScript)
&& (subtag.len() == 2 && is_alphabetic(subtag)
|| subtag.len() == 3 && is_numeric(subtag))
{
// Region
region_end = end;
output.push('-');
output.extend(to_uppercase(subtag));
state = State::AfterRegion;
} else if (state == State::AfterLanguage
|| state == State::AfterExtLang
|| state == State::AfterScript
|| state == State::AfterRegion)
&& is_alphanumeric(subtag)
&& (subtag.len() >= 5 && is_alphabetic(&subtag[0..1])
|| subtag.len() >= 4 && is_numeric(&subtag[0..1]))
{
// Variant; the state stays `AfterRegion` so that several variants may follow
variant_end = end;
output.push('-');
output.extend(to_lowercase(subtag));
state = State::AfterRegion;
} else {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::InvalidSubtag,
});
}
}
// We make sure we are in a correct final state:
// a trailing singleton without any subtag after it is an error
if let State::InExtension { expected: true } = state {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::EmptyExtension,
});
}
if let State::InPrivateUse { expected: true } = state {
return Err(LanguageTagParseError {
kind: TagParseErrorKind::EmptyPrivateUse,
});
}
// We make sure we have not skipped any component:
// an absent component ends where the previous one ends
if extlang_end < language_end {
extlang_end = language_end;
}
if script_end < extlang_end {
script_end = extlang_end;
}
if region_end < script_end {
region_end = script_end;
}
if variant_end < region_end {
variant_end = region_end;
}
if extension_end < variant_end {
extension_end = variant_end;
}
Ok(TagElementsPositions {
language_end,
extlang_end,
script_end,
region_end,
variant_end,
extension_end,
})
}
/// Iterator over the extensions of a tag, as `(singleton, subtags)` pairs.
///
/// The input is expected to be the extension part of a tag, for example
/// `a-bbb-c-ddd-eee`, which yields `('a', "bbb")` then `('c', "ddd-eee")`.
struct ExtensionsIterator<'a> {
    /// What is left to iterate on.
    input: &'a str,
}

impl<'a> ExtensionsIterator<'a> {
    fn new(input: &'a str) -> Self {
        Self { input }
    }
}

impl<'a> Iterator for ExtensionsIterator<'a> {
    type Item = (char, &'a str);

    fn next(&mut self) -> Option<(char, &'a str)> {
        let remaining = self.input;
        let mut subtags = remaining.split_terminator('-');
        let singleton = subtags.next()?.chars().next().unwrap();

        // Offset of the subtag under examination: the content starts at 2,
        // right after the singleton and its dash.
        let mut offset: usize = 2;
        for subtag in subtags {
            if subtag.len() == 1 {
                // A one character subtag is the singleton of the next extension:
                // the current content stops just before the dash preceding it.
                let content = &remaining[2..offset - 1];
                self.input = &remaining[offset..];
                return Some((singleton, content));
            }
            offset += subtag.len() + 1;
        }

        // Last extension: everything after the singleton belongs to it.
        self.input = "";
        remaining.get(2..).map(|content| (singleton, content))
    }
}
/// Iterator over the dash-separated subtags of a tag.
///
/// Each item is the subtag together with the byte offset of its end in the input.
struct SubTagIterator<'a> {
    split: Split<'a, char>,
    /// Byte offset at which the next subtag starts.
    position: usize,
}

impl<'a> SubTagIterator<'a> {
    #[inline]
    fn new(input: &'a str) -> Self {
        let split = input.split('-');
        Self { split, position: 0 }
    }
}

impl<'a> Iterator for SubTagIterator<'a> {
    type Item = (&'a str, usize);

    #[inline]
    fn next(&mut self) -> Option<(&'a str, usize)> {
        self.split.next().map(|subtag| {
            let end = self.position + subtag.len();
            // The next subtag starts after the dash that follows this one.
            self.position = end + 1;
            (subtag, end)
        })
    }
}
/// Returns `true` if `s` only contains ASCII letters (vacuously true for an empty string).
#[inline]
fn is_alphabetic(s: &str) -> bool {
    // A byte of a multi-byte character is never an ASCII letter,
    // so checking bytes gives the same answer as checking characters.
    s.bytes().all(|byte| byte.is_ascii_alphabetic())
}
/// Returns `true` if `s` only contains ASCII digits (vacuously true for an empty string).
#[inline]
fn is_numeric(s: &str) -> bool {
    // A byte of a multi-byte character is never an ASCII digit,
    // so checking bytes gives the same answer as checking characters.
    s.bytes().all(|byte| byte.is_ascii_digit())
}
/// Returns `true` if `s` only contains ASCII letters and digits (vacuously true for an empty string).
#[inline]
fn is_alphanumeric(s: &str) -> bool {
    // A byte of a multi-byte character is never ASCII alphanumeric,
    // so checking bytes gives the same answer as checking characters.
    s.bytes().all(|byte| byte.is_ascii_alphanumeric())
}
/// Returns `true` if `s` only contains ASCII letters, ASCII digits and dashes
/// (vacuously true for an empty string).
#[inline]
fn is_alphanumeric_or_dash(s: &str) -> bool {
    // A byte of a multi-byte character is neither a dash nor ASCII alphanumeric,
    // so checking bytes gives the same answer as checking characters.
    s.bytes()
        .all(|byte| byte == b'-' || byte.is_ascii_alphanumeric())
}
/// Lazily maps the ASCII letters of `s` to uppercase; other characters are kept as they are.
#[inline]
fn to_uppercase(s: &str) -> impl Iterator<Item = char> + '_ {
    let uppercase = |c: char| c.to_ascii_uppercase();
    s.chars().map(uppercase)
}
// Lazily maps the first character of `s` to ASCII uppercase and the others to ASCII lowercase.
// Beware: panics if s.len() == 0 (should never happen in our code)
#[inline]
fn to_uppercase_first(s: &str) -> impl Iterator<Item = char> + '_ {
    let mut tail = s.chars();
    let head = tail.next().unwrap().to_ascii_uppercase();
    once(head).chain(tail.map(|c| c.to_ascii_lowercase()))
}
/// Lazily maps the ASCII letters of `s` to lowercase; other characters are kept as they are.
#[inline]
fn to_lowercase(s: &str) -> impl Iterator<Item = char> + '_ {
    let lowercase = |c: char| c.to_ascii_lowercase();
    s.chars().map(lowercase)
}
// Tags accepted as a whole, without being parsed into subtags.
//
// `parse_language_tag` compares its input to these entries ignoring ASCII case
// and, on a match, outputs the entry as spelled here: this list therefore also
// defines the normalized case of these tags.
const GRANDFATHEREDS: [&str; 26] = [
"art-lojban",
"cel-gaulish",
"en-GB-oed",
"i-ami",
"i-bnn",
"i-default",
"i-enochian",
"i-hak",
"i-klingon",
"i-lux",
"i-mingo",
"i-navajo",
"i-pwn",
"i-tao",
"i-tay",
"i-tsu",
"no-bok",
"no-nyn",
"sgn-BE-FR",
"sgn-BE-NL",
"sgn-CH-DE",
"zh-guoyu",
"zh-hakka",
"zh-min",
"zh-min-nan",
"zh-xiang",
];

722
third_party/rust/oxilangtag/tests/lib.rs vendored Normal file
View File

@ -0,0 +1,722 @@
use oxilangtag::LanguageTag;
use serde_test::{assert_de_tokens, assert_de_tokens_error};
#[cfg(feature = "serde")]
use serde_test::{assert_tokens, Token};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// Tests from RFC 5646 2.1.1
// Each case is (expected normalized tag, input tag).
#[test]
fn test_formatting() {
    let cases = [
        ("mn-Cyrl-MN", "mn-Cyrl-MN"),
        ("mn-Cyrl-MN", "MN-cYRL-mn"),
        ("mn-Cyrl-MN", "mN-cYrL-Mn"),
        ("en-CA-x-ca", "en-CA-x-ca"),
        ("sgn-BE-FR", "sgn-BE-FR"),
        ("az-Latn-x-latn", "az-Latn-x-latn"),
        ("i-ami", "i-ami"),
        ("i-ami", "I-AMI"),
        ("sl-afb-Latn-005-nedis", "SL-AFB-lATN-005-nEdis"),
    ];
    for &(expected, input) in cases.iter() {
        let language_tag = LanguageTag::parse_and_normalize(input).unwrap();
        assert_eq!(expected, language_tag.as_str());
    }
}
// Tests from RFC 5646 2.2.1
// Private-use-only and grandfathered tags report the whole tag as their
// primary language.
#[test]
fn test_primary_language() {
    // (expected primary language, input)
    let cases = vec![
        ("fr", "fr"),
        ("de", "de"),
        ("x-fr-ch", "x-fr-CH"),
        ("i-klingon", "i-klingon"),
        ("i-bnn", "i-bnn"),
        ("zh-hakka", "zh-hakka"),
    ];
    for (expected, input) in cases {
        let tag = LanguageTag::parse_and_normalize(input).unwrap();
        assert_eq!(expected, tag.primary_language());
    }
}
// Tests from RFC 5646 2.2.2
#[test]
fn test_extended_language() {
    // (input, (full language, primary language, extended language, extlang subtags))
    let cases: Vec<(&str, (&str, &str, Option<&str>, Vec<&str>))> = vec![
        ("zh", ("zh", "zh", None, vec![])),
        ("zh-gan", ("zh-gan", "zh", Some("gan"), vec!["gan"])),
        (
            "zh-gan-foo",
            ("zh-gan-foo", "zh", Some("gan-foo"), vec!["gan", "foo"]),
        ),
        ("zh-min-nan", ("zh-min-nan", "zh-min-nan", None, vec![])),
        ("i-tsu", ("i-tsu", "i-tsu", None, vec![])),
        ("zh-CN", ("zh", "zh", None, vec![])),
        ("zh-gan-CN", ("zh-gan", "zh", Some("gan"), vec!["gan"])),
        ("ar-afb", ("ar-afb", "ar", Some("afb"), vec!["afb"])),
    ];
    for (input, expected) in cases {
        let tag: LanguageTag<String> = input.parse().unwrap();
        let actual = (
            tag.full_language(),
            tag.primary_language(),
            tag.extended_language(),
            tag.extended_language_subtags().collect::<Vec<_>>(),
        );
        assert_eq!(expected, actual);
    }
}
// Tests from RFC 5646 2.2.3
#[test]
fn test_script() {
    // (input, (primary language, script))
    let cases: Vec<(&str, (&str, Option<&str>))> = vec![
        ("sr-Latn", ("sr", Some("Latn"))),
        ("ar-afb-Latn", ("ar", Some("Latn"))),
    ];
    for (input, expected) in cases {
        let tag: LanguageTag<String> = input.parse().unwrap();
        assert_eq!(expected, (tag.primary_language(), tag.script()));
    }
}
// Tests from RFC 5646 2.2.4
#[test]
fn test_region() {
    // (input, (primary language, script, region))
    let cases: Vec<(&str, (&str, Option<&str>, Option<&str>))> = vec![
        ("de-AT", ("de", None, Some("AT"))),
        ("sr-Latn-RS", ("sr", Some("Latn"), Some("RS"))),
        ("es-419", ("es", None, Some("419"))),
        ("ar-DE", ("ar", None, Some("DE"))),
        ("ar-005", ("ar", None, Some("005"))),
    ];
    for (input, expected) in cases {
        let tag: LanguageTag<String> = input.parse().unwrap();
        let actual = (tag.primary_language(), tag.script(), tag.region());
        assert_eq!(expected, actual);
    }
}
// Tests from RFC 5646 2.2.5
#[test]
fn test_variant() {
    // (input, (primary language, variant, variant subtags))
    let cases: Vec<(&str, (&str, Option<&str>, Vec<&str>))> = vec![
        ("sl", ("sl", None, vec![])),
        ("sl-nedis", ("sl", Some("nedis"), vec!["nedis"])),
        ("de-CH-1996", ("de", Some("1996"), vec!["1996"])),
        ("art-lojban", ("art-lojban", None, vec![])),
    ];
    for (input, expected) in cases {
        let tag: LanguageTag<String> = input.parse().unwrap();
        let actual = (
            tag.primary_language(),
            tag.variant(),
            tag.variant_subtags().collect::<Vec<_>>(),
        );
        assert_eq!(expected, actual);
    }
}
// Tests from RFC 5646 2.2.6
#[test]
fn test_extension() {
    // (input, (primary language, extension, (singleton, value) pairs))
    let cases: Vec<(&str, (&str, Option<&str>, Vec<(char, &str)>))> = vec![
        ("en", ("en", None, vec![])),
        (
            "en-a-bbb-x-a-ccc",
            ("en", Some("a-bbb"), vec![('a', "bbb")]),
        ),
        (
            "en-a-babble-b-warble",
            (
                "en",
                Some("a-babble-b-warble"),
                vec![('a', "babble"), ('b', "warble")],
            ),
        ),
        ("fr-a-Latn", ("fr", Some("a-latn"), vec![('a', "latn")])),
        (
            "en-Latn-GB-boont-r-extended-sequence-x-private",
            (
                "en",
                Some("r-extended-sequence"),
                vec![('r', "extended-sequence")],
            ),
        ),
        (
            "en-r-az-r-qt",
            ("en", Some("r-az-r-qt"), vec![('r', "az"), ('r', "qt")]),
        ),
        ("i-tsu", ("i-tsu", None, vec![])),
    ];
    for (input, expected) in cases {
        let tag: LanguageTag<String> = input.parse().unwrap();
        let actual = (
            tag.primary_language(),
            tag.extension(),
            tag.extension_subtags().collect::<Vec<_>>(),
        );
        assert_eq!(expected, actual);
    }
}
// Tests from RFC 5646 2.2.7
#[test]
fn test_privateuse() {
    // (input, (primary language, private use, private-use subtags))
    let cases: Vec<(&str, (&str, Option<&str>, Vec<&str>))> = vec![
        ("en", ("en", None, vec![])),
        ("en-x-US", ("en", Some("x-us"), vec!["us"])),
        ("el-x-koine", ("el", Some("x-koine"), vec!["koine"])),
        ("x-fr-ch", ("x-fr-ch", Some("x-fr-ch"), vec!["fr", "ch"])),
        (
            "es-x-foobar-AT-007",
            ("es", Some("x-foobar-at-007"), vec!["foobar", "at", "007"]),
        ),
    ];
    for (input, expected) in cases {
        let tag: LanguageTag<String> = input.parse().unwrap();
        let actual = (
            tag.primary_language(),
            tag.private_use(),
            tag.private_use_subtags().collect::<Vec<_>>(),
        );
        assert_eq!(expected, actual);
    }
}
// Normalized output is checked through `as_str()` and, for one case, through
// the `to_string()` rendering.
#[test]
fn test_fmt() {
    let normalize = |input: &str| LanguageTag::parse_and_normalize(input).unwrap();

    // Three spellings of the same tag must all normalize identically.
    let spellings = vec![
        "ar-arb-Latn-DE-nedis-foobar",
        "ar-arb-latn-de-nedis-foobar",
        "AR-ARB-LATN-DE-NEDIS-FOOBAR",
    ];
    for input in spellings {
        assert_eq!("ar-arb-Latn-DE-nedis-foobar", normalize(input).as_str());
    }

    assert_eq!(
        "xx-z-foo-a-bar-f-spam-b-eggs",
        normalize("xx-z-foo-a-bar-F-spam-b-eggs").as_str()
    );
    assert_eq!(
        "hkgnmerm-x-e5-zf-vddjcpz-1v6",
        normalize("HkgnmerM-x-e5-zf-VdDjcpz-1V6").to_string()
    );
    assert_eq!(
        "mgxqa-Ywep-8lcw-7bvt-h-dp1md-0h7-0z3ir",
        normalize("MgxQa-ywEp-8lcW-7bvT-h-dP1Md-0h7-0Z3ir").as_str()
    );
}
// Non-ASCII characters are rejected, even inside a private-use sequence.
#[test]
fn test_unicode() {
    let outcome = LanguageTag::parse("zh-x-Üńìcødê");
    assert!(outcome.is_err());
}
// Equality is evaluated on the normalized form.
#[test]
fn test_cmp() {
    // Same tag, different input casing.
    let first = LanguageTag::parse_and_normalize("dE-AraB-lY").unwrap();
    let second = LanguageTag::parse_and_normalize("DE-aRaB-LY").unwrap();
    assert_eq!(first, second);

    // Genuinely different tags stay different.
    let bare = LanguageTag::parse_and_normalize("zh").unwrap();
    let with_script = LanguageTag::parse_and_normalize("zh-Latn").unwrap();
    assert_ne!(bare, with_script);
}
// http://www.langtag.net/test-suites/well-formed-tags.txt
// Every tag below must be accepted by `LanguageTag::parse`.
#[test]
fn test_wellformed_tags() {
    let tags = vec![
        "fr",
        "fr-Latn",
        "fr-fra", // Extended tag
        "fr-Latn-FR",
        "fr-Latn-419",
        "fr-FR",
        "ax-TZ",     // Not in the registry, but well-formed
        "fr-shadok", // Variant
        "fr-y-myext-myext2",
        "fra-Latn", // ISO 639 can be 3-letters
        "fra",
        "fra-FX",
        "i-klingon", // grandfathered with singleton
        "I-kLINgon", // tags are case-insensitive...
        "no-bok",    // grandfathered without singleton
        "fr-Lat",    // Extended
        "mn-Cyrl-MN",
        "mN-cYrL-Mn",
        "fr-Latn-CA",
        "en-US",
        "fr-Latn-CA",
        "i-enochian", // Grandfathered
        "x-fr-CH",
        "sr-Latn-CS",
        "es-419",
        "sl-nedis",
        "de-CH-1996",
        "de-Latg-1996",
        "sl-IT-nedis",
        "en-a-bbb-x-a-ccc",
        "de-a-value",
        "en-Latn-GB-boont-r-extended-sequence-x-private",
        "en-x-US",
        "az-Arab-x-AZE-derbend",
        "es-Latn-CO-x-private",
        "en-US-boont",
        "ab-x-abc-x-abc",     // anything goes after x
        "ab-x-abc-a-a",       // ditto
        "i-default",          // grandfathered
        "i-klingon",          // grandfathered
        "abcd-Latn",          // Language of 4 chars reserved for future use
        "AaBbCcDd-x-y-any-x", // Language of 5-8 chars, registered
        "en",
        "de-AT",
        "es-419",
        "de-CH-1901",
        "sr-Cyrl",
        "sr-Cyrl-CS",
        "sl-Latn-IT-rozaj",
        "en-US-x-twain",
        "zh-cmn",
        "zh-cmn-Hant",
        "zh-cmn-Hant-HK",
        "zh-gan",
        "zh-yue-Hant-HK",
        "xr-lxs-qut", // extlangS
        "xr-lqt-qu",  // extlang + region
        "xr-p-lze",   // Extension
    ];
    for tag in tags {
        if let Err(error) = LanguageTag::parse(tag) {
            panic!(
                "{} should be considered well-formed but returned error {}",
                tag, error
            );
        }
    }
}
// http://www.langtag.net/test-suites/broken-tags.txt
// Every tag below must be rejected by `LanguageTag::parse`.
#[test]
fn test_broken_tags() {
    let tags = vec![
        "",
        "f",
        "f-Latn",
        "fr-Latn-F",
        "a-value",
        "tlh-a-b-foo",
        "i-notexist", // grandfathered but not registered: always invalid
        "abcdefghi-012345678",
        "ab-abc-abc-abc-abc",
        "ab-abcd-abc",
        "ab-ab-abc",
        "ab-123-abc",
        "a-Hant-ZH",
        "a1-Hant-ZH",
        "ab-abcde-abc",
        "ab-1abc-abc",
        "ab-ab-abcd",
        "ab-123-abcd",
        "ab-abcde-abcd",
        "ab-1abc-abcd",
        "ab-a-b",
        "ab-a-x",
        "ab--ab",
        "ab-abc-",
        "-ab-abc",
        "abcd-efg",
        "aabbccddE",
    ];
    for tag in tags {
        if let Ok(parsed) = LanguageTag::parse(tag) {
            panic!(
                "{} should be considered not well-formed but returned result {:?}",
                tag, parsed
            );
        }
    }
}
// Randomly generated tags that are syntactically well-formed: each one must
// be accepted by `LanguageTag::parse`.
#[test]
fn test_random_good_tags() {
// http://unicode.org/repos/cldr/trunk/tools/java/org/unicode/cldr/util/data/langtagTest.txt
// Long entries are split with a trailing `\`, which joins the literal with
// the next source line (leading whitespace on that line is skipped).
let tags = vec![
"zszLDm-sCVS-es-x-gn762vG-83-S-mlL",
"IIJdFI-cfZv",
"kbAxSgJ-685",
"tbutP",
"hDL-595",
"dUf-iUjq-0hJ4P-5YkF-WD8fk",
"FZAABA-FH",
"xZ-lh-4QfM5z9J-1eG4-x-K-R6VPr2z",
"Fyi",
"SeI-DbaG",
"ch-xwFn",
"OeC-GPVI",
"JLzvUSi",
"Fxh-hLAs",
"pKHzCP-sgaO-554",
"eytqeW-hfgH-uQ",
"ydn-zeOP-PR",
"uoWmBM-yHCf-JE",
"xwYem",
"zie",
"Re-wjSv-Ey-i-XE-E-JjWTEB8-f-DLSH-NVzLH-AtnFGWoH-SIDE",
"Ri-063-c-u6v-ZfhkToTB-C-IFfmv-XT-j-rdyYFMhK-h-pY-D5-Oh6FqBhL-hcXt-v-WdpNx71-\
K-c74m4-eBTT7-JdH7Q1Z",
"ji",
"IM-487",
"EPZ-zwcB",
"GauwEcwo",
"kDEP",
"FwDYt-TNvo",
"ottqP-KLES-x-9-i9",
"fcflR-grQQ",
"TvFwdu-kYhs",
"WE-336",
"MgxQa-ywEp-8lcW-7bvT-h-dP1Md-0h7-0Z3ir-K-Srkm-kA-7LXM-Z-whb2MiO-2mNsvbLm-W3O\
-4r-U-KceIxHdI-gvMVgUBV-2uRUni-J0-7C8yTK2",
"Hyr-B-evMtVoB1-mtsVZf-vQMV-gM-I-rr-kvLzg-f-lAUK-Qb36Ne-Z-7eFzOD-mv6kKf-l-miZ\
7U3-k-XDGtNQG",
"ybrlCpzy",
"PTow-w-cAQ51-8Xd6E-cumicgt-WpkZv3NY-q-ORYPRy-v-A4jL4A-iNEqQZZ-sjKn-W-N1F-pzy\
c-xP5eWz-LmsCiCcZ",
"ih-DlPR-PE",
"Krf-362",
"WzaD",
"EPaOnB-gHHn",
"XYta",
"NZ-RgOO-tR",
"at-FE",
"Tpc-693",
"YFp",
"gRQrQULo",
"pVomZ-585",
"laSu-ZcAq-338",
"gCW",
"PydSwHRI-TYfF",
"zKmWDD",
"X-bCrL5RL",
"HK",
"YMKGcLY",
"GDJ-nHYa-bw-X-ke-rohH5GfS-LdJKsGVe",
"tfOxdau-yjge-489-a-oB-I8Csb-1ESaK1v-VFNz-N-FT-ZQyn-On2-I-hu-vaW3-jIQb-vg0U-h\
Ul-h-dO6KuJqB-U-tde2L-P3gHUY-vnl5c-RyO-H-gK1-zDPu-VF1oeh8W-kGzzvBbW-yuAJZ",
"LwDux",
"Zl-072",
"Ri-Ar",
"vocMSwo-cJnr-288",
"kUWq-gWfQ-794",
"YyzqKL-273",
"Xrw-ZHwH-841-9foT-ESSZF-6OqO-0knk-991U-9p3m-b-JhiV-0Kq7Y-h-cxphLb-cDlXUBOQ-X\
-4Ti-jty94yPp",
"en-GB-oed",
"LEuZl-so",
"HyvBvFi-cCAl-X-irMQA-Pzt-H",
"uDbsrAA-304",
"wTS",
"IWXS",
"XvDqNkSn-jRDR",
"gX-Ycbb-iLphEks-AQ1aJ5",
"FbSBz-VLcR-VL",
"JYoVQOP-Iytp",
"gDSoDGD-lq-v-7aFec-ag-k-Z4-0kgNxXC-7h",
"Bjvoayy-029",
"qSDJd",
"qpbQov",
"fYIll-516",
"GfgLyfWE-EHtB",
"Wc-ZMtk",
"cgh-VEYK",
"WRZs-AaFd-yQ",
"eSb-CpsZ-788",
"YVwFU",
"JSsHiQhr-MpjT-381",
"LuhtJIQi-JKYt",
"vVTvS-RHcP",
"SY",
"fSf-EgvQfI-ktWoG-8X5z-63PW",
"NOKcy",
"OjJb-550",
"KB",
"qzKBv-zDKk-589",
"Jr",
"Acw-GPXf-088",
"WAFSbos",
"HkgnmerM-x-e5-zf-VdDjcpz-1V6",
"UAfYflJU-uXDc-YV",
"x-CHsHx-VDcOUAur-FqagDTx-H-V0e74R",
"uZIAZ-Xmbh-pd",
];
for tag in tags {
let result = LanguageTag::parse(tag);
// The message arguments are only evaluated when the assertion fails, so
// `err().unwrap()` is never reached for an `Ok` result.
assert!(
result.is_ok(),
"{} should be considered well-formed but returned error {}",
tag,
result.err().unwrap()
);
}
}
// Randomly generated tags that are not well-formed: each one must be rejected
// by `LanguageTag::parse`.
#[test]
fn test_random_bad_tags() {
    // http://unicode.org/repos/cldr/trunk/tools/java/org/unicode/cldr/util/data/langtagTest.txt
    let tags = vec![
        "EdY-z_H791Xx6_m_kj",
        "qWt85_8S0-L_rbBDq0gl_m_O_zsAx_nRS",
        "VzyL2",
        "T_VFJq-L-0JWuH_u2_VW-hK-kbE",
        "u-t",
        "Q-f_ZVJXyc-doj_k-i",
        "JWB7gNa_K-5GB-25t_W-s-ZbGVwDu1-H3E",
        "b-2T-Qob_L-C9v_2CZxK86",
        "fQTpX_0_4Vg_L3L_g7VtALh2",
        "S-Z-E_J",
        "f6wsq-02_i-F",
        "9_GcUPq_G",
        "QjsIy_9-0-7_Dv2yPV09_D-JXWXM",
        "D_se-f-k",
        "ON47Wv1_2_W",
        "f-z-R_s-ha",
        "N3APeiw_195_Bx2-mM-pf-Z-Ip5lXWa-5r",
        "IRjxU-E_6kS_D_b1b_H",
        "NB-3-5-AyW_FQ-9hB-TrRJg3JV_3C",
        "yF-3a_V_FoJQAHeL_Z-Mc-u",
        "n_w_bbunOG_1-s-tJMT5je",
        "Q-AEWE_X",
        "57b1O_k_R6MU_sb",
        "hK_65J_i-o_SI-Y",
        "wB4B7u_5I2_I_NZPI",
        "J24Nb_q_d-zE",
        "v6-dHjJmvPS_IEb-x_A-O-i",
        "8_8_dl-ZgBr84u-P-E",
        "nIn-xD7EVhe_C",
        "5_N-6P_x7Of_Lo_6_YX_R",
        "0_46Oo0sZ-YNwiU8Wr_d-M-pg1OriV",
        "laiY-5",
        "K-8Mdd-j_ila0sSpo_aO8_J",
        "wNATtSL-Cp4_gPa_fD41_9z",
        "H_FGz5V8_n6rrcoz0_1O6d-kH-7-N",
        "wDOrnHU-odqJ_vWl",
        "gP_qO-I-jH",
        "h",
        "dJ0hX-o_csBykEhU-F",
        "L-Vf7_BV_eRJ5goSF_Kp",
        "y-oF-chnavU-H",
        "9FkG-8Q-8_v",
        "W_l_NDQqI-O_SFSAOVq",
        "kDG3fzXw",
        "t-nsSp-7-t-mUK2",
        "Yw-F",
        "1-S_3_l",
        "u-v_brn-Y",
        "4_ft_3ZPZC5lA_D",
        "n_dR-QodsqJnh_e",
        "Hwvt-bSwZwj_KL-hxg0m-3_hUG",
        "mQHzvcV-UL-o2O_1KhUJQo_G2_uryk3-a",
        "b-UTn33HF",
        "r-Ep-jY-aFM_N_H",
        "K-k-krEZ0gwD_k_ua-9dm3Oy-s_v",
        "XS_oS-p",
        "EIx_h-zf5",
        "p_z-0_i-omQCo3B",
        "1_q0N_jo_9",
        "0Ai-6-S",
        "L-LZEp_HtW",
        "Zj-A4JD_2A5Aj7_b-m3",
        "x",
        "p-qPuXQpp_d-jeKifB-c-7_G-X",
        "X94cvJ_A",
        "F2D25R_qk_W-w_Okf_kx",
        "rc-f",
        "D",
        "gD_WrDfxmF-wu-E-U4t",
        "Z_BN9O4_D9-D_0E_KnCwZF-84b-19",
        "T-8_g-u-0_E",
        "lXTtys9j_X_A_m-vtNiNMw_X_b-C6Nr",
        "V_Ps-4Y-S",
        "X5wGEA",
        "mIbHFf_ALu4_Jo1Z1",
        "ET-TacYx_c",
        "Z-Lm5cAP_ri88-d_q_fi8-x",
        "rTi2ah-4j_j_4AlxTs6m_8-g9zqncIf-N5",
        "FBaLB85_u-0NxhAy-ZU_9c",
        "x_j_l-5_aV95_s_tY_jp4",
        "PL768_D-m7jNWjfD-Nl_7qvb_bs_8_Vg",
        "9-yOc-gbh",
        "6DYxZ_SL-S_Ye",
        "ZCa-U-muib-6-d-f_oEh_O",
        "Qt-S-o8340F_f_aGax-c-jbV0gfK_p",
        "WE_SzOI_OGuoBDk-gDp",
        "cs-Y_9",
        "m1_uj",
        "Y-ob_PT",
        "li-B",
        "f-2-7-9m_f8den_J_T_d",
        "p-Os0dua-H_o-u",
        "L",
        "rby-w",
    ];
    for tag in tags {
        if let Ok(parsed) = LanguageTag::parse(tag) {
            panic!(
                "{} should be considered not well-formed but returned result {:?}",
                tag, parsed
            );
        }
    }
}
// A tag produced by `parse` (no normalization) compares and hashes like the
// plain string it wraps, including case sensitivity.
#[test]
fn test_eq() {
    let tag = LanguageTag::parse("en-fr").unwrap();

    // Equality against string slices, in both operand orders.
    assert_eq!(tag, "en-fr");
    assert_eq!("en-fr", tag);
    assert_ne!(tag, "en-FR");

    // Hashing agrees with hashing the underlying string.
    let tag_hash = hash(&tag);
    assert_eq!(tag_hash, hash("en-fr"));
    assert_ne!(tag_hash, hash("en-FR"));
}
/// Hashes `value` with a freshly created `DefaultHasher` and returns the
/// resulting 64-bit digest.
fn hash(value: impl Hash) -> u64 {
    let mut state = DefaultHasher::default();
    Hash::hash(&value, &mut state);
    state.finish()
}
// `str` methods such as `starts_with` can be called directly on a tag.
#[test]
fn test_str() {
    let parsed = LanguageTag::parse("en-fr").unwrap();
    let has_language_prefix = parsed.starts_with("en-");
    assert!(has_language_prefix);
}
// Serde support; only compiled when the `serde` feature is enabled.
// NOTE(review): `assert_de_tokens` and `assert_de_tokens_error` are imported
// at the top of this file without the `serde` cfg gate, so that import is
// unused when the feature is off.
#[cfg(feature = "serde")]
#[test]
fn test_serd_impl() {
// A tag backed by a borrowed `&str` maps to a borrowed-string token.
assert_tokens(
&LanguageTag::parse("en-us").unwrap(),
&[Token::BorrowedStr("en-us")],
);
// A tag backed by an owned `String` maps to an owned-string token.
assert_tokens(
&LanguageTag::parse("en-US".to_string()).unwrap(),
&[Token::String("en-US")],
);
// Deserialization only: a borrowed-string token can fill a `String`-backed tag.
assert_de_tokens(
&LanguageTag::parse("en-US".to_string()).unwrap(),
&[Token::BorrowedStr("en-US")],
);
// An ill-formed value (a 12-character subtag) fails with the parser's message.
assert_de_tokens_error::<LanguageTag<String>>(
&[Token::String("verybadvalue")],
"A subtag may be eight characters in length at maximum",
);
}

View File

@ -56,6 +56,7 @@ unic-langid = { version = "0.9", features = ["likelysubtags"] }
unic-langid-ffi = { path = "../../../../intl/locale/rust/unic-langid-ffi" }
fluent-langneg = { version = "0.13", features = ["cldr"] }
fluent-langneg-ffi = { path = "../../../../intl/locale/rust/fluent-langneg-ffi" }
oxilangtag = "0.1.3"
rure = "0.2.2"
rust_minidump_writer_linux = { path = "../../../crashreporter/rust_minidump_writer_linux", optional = true }
mozannotation_client = { path = "../../../crashreporter/mozannotation_client", optional = true }