diff --git a/.gitignore b/.gitignore index 7cbe84a..0284c25 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -/target -/Cargo.lock +target +Cargo.lock /.cargo/config diff --git a/Cargo.toml b/Cargo.toml index b2458c0..9d96538 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,10 +16,6 @@ name = "format" [[test]] name = "form_urlencoded" [[test]] -name = "idna" -[[test]] -name = "punycode" -[[test]] name = "tests" [[test]] name = "wpt" @@ -50,8 +46,7 @@ version = ">=0.6.1, <0.8" optional = true [dependencies] +idna = { version = "0.1.0", path = "./idna" } uuid = { version = "0.2", features = ["v4"] } rustc-serialize = "0.3" -unicode-bidi = "0.2.3" -unicode-normalization = "0.1.2" matches = "0.1" diff --git a/idna/Cargo.toml b/idna/Cargo.toml new file mode 100644 index 0000000..0454650 --- /dev/null +++ b/idna/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "idna" +version = "0.1.0" +authors = ["Simon Sapin "] +description = "IDNA (Internationalizing Domain Names in Applications) and Punycode." +repository = "https://github.com/servo/rust-url/" +license = "MIT/Apache-2.0" + +[dependencies] +unicode-bidi = "0.2.3" +unicode-normalization = "0.1.2" +matches = "0.1" + +[dev-dependencies] +rustc-serialize = "0.3" + +[[test]] +name = "tests" diff --git a/IdnaMappingTable.txt b/idna/src/IdnaMappingTable.txt similarity index 100% rename from IdnaMappingTable.txt rename to idna/src/IdnaMappingTable.txt diff --git a/idna/src/lib.rs b/idna/src/lib.rs new file mode 100644 index 0000000..d53874f --- /dev/null +++ b/idna/src/lib.rs @@ -0,0 +1,73 @@ +// Copyright 2016 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This Rust crate implements IDNA +//! [per the WHATWG URL Standard](https://url.spec.whatwg.org/#idna). +//! +//! It also exposes the underlying algorithms from [*Unicode IDNA Compatibility Processing* +//! 
(Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) +//! and [Punycode (RFC 3492)](https://tools.ietf.org/html/rfc3492). +//! +//! Quoting from [UTS #46’s introduction](http://www.unicode.org/reports/tr46/#Introduction): +//! +//! > Initially, domain names were restricted to ASCII characters. +//! > A system was introduced in 2003 for internationalized domain names (IDN). +//! > This system is called Internationalizing Domain Names for Applications, +//! > or IDNA2003 for short. +//! > This mechanism supports IDNs by means of a client software transformation +//! > into a format known as Punycode. +//! > A revision of IDNA was approved in 2010 (IDNA2008). +//! > This revision has a number of incompatibilities with IDNA2003. +//! > +//! > The incompatibilities force implementers of client software, +//! > such as browsers and emailers, +//! > to face difficult choices during the transition period +//! > as registries shift from IDNA2003 to IDNA2008. +//! > This document specifies a mechanism +//! > that minimizes the impact of this transition for client software, +//! > allowing client software to access domains that are valid under either system. + +#[macro_use] extern crate matches; +extern crate unicode_bidi; +extern crate unicode_normalization; + +pub mod punycode; +pub mod uts46; + +/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm. +/// +/// Return the ASCII representation of a domain name, +/// normalizing characters (upper-case to lower-case and other kinds of equivalence) +/// and using Punycode as necessary. +/// +/// This process may fail. +pub fn domain_to_ascii(domain: &str) -> Result { + uts46::to_ascii(domain, uts46::Flags { + use_std3_ascii_rules: false, + transitional_processing: true, // XXX: switch when Firefox does + verify_dns_length: false, + }) +} + +/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm. 
+/// +/// Return the Unicode representation of a domain name, +/// normalizing characters (upper-case to lower-case and other kinds of equivalence) +/// and decoding Punycode as necessary. +/// +/// This may indicate [syntax violations](https://url.spec.whatwg.org/#syntax-violation) +/// but always returns a string for the mapped domain. +pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) { + uts46::to_unicode(domain, uts46::Flags { + use_std3_ascii_rules: false, + + // Unused: + transitional_processing: true, + verify_dns_length: false, + }) +} diff --git a/make_idna_table.py b/idna/src/make_uts46_mapping_table.py similarity index 95% rename from make_idna_table.py rename to idna/src/make_uts46_mapping_table.py index 5700d68..8e090dc 100644 --- a/make_idna_table.py +++ b/idna/src/make_uts46_mapping_table.py @@ -6,8 +6,7 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. - -# Run as: python make_idna_table.py idna_table.txt > src/idna_table.rs +# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs # You can get the latest idna table from # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt diff --git a/src/punycode.rs b/idna/src/punycode.rs similarity index 100% rename from src/punycode.rs rename to idna/src/punycode.rs diff --git a/src/idna.rs b/idna/src/uts46.rs similarity index 84% rename from src/idna.rs rename to idna/src/uts46.rs index e0efdb3..5f230e0 100644 --- a/src/idna.rs +++ b/idna/src/uts46.rs @@ -1,6 +1,13 @@ -//! International domain names -//! -//! https://url.spec.whatwg.org/#idna +// Copyright 2013-2014 Valentin Gosu. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! [*Unicode IDNA Compatibility Processing* +//! 
(Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) use self::Mapping::*; use punycode; @@ -9,7 +16,7 @@ use unicode_normalization::UnicodeNormalization; use unicode_normalization::char::is_combining_mark; use unicode_bidi::{BidiClass, bidi_class}; -include!("idna_mapping.rs"); +include!("uts46_mapping_table.rs"); #[derive(Debug)] enum Mapping { @@ -23,9 +30,9 @@ enum Mapping { } struct Range { - pub from: char, - pub to: char, - pub mapping: Mapping, + from: char, + to: char, + mapping: Mapping, } fn find_char(codepoint: char) -> &'static Mapping { @@ -45,7 +52,7 @@ fn find_char(codepoint: char) -> &'static Mapping { &TABLE[min].mapping } -fn map_char(codepoint: char, flags: Uts46Flags, output: &mut String, errors: &mut Vec) { +fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec) { match *find_char(codepoint) { Mapping::Valid => output.push(codepoint), Mapping::Ignored => {}, @@ -185,7 +192,7 @@ fn passes_bidi(label: &str, transitional_processing: bool) -> bool { } /// http://www.unicode.org/reports/tr46/#Validity_Criteria -fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec) { +fn validate(label: &str, flags: Flags, errors: &mut Vec) { if label.nfc().ne(label.chars()) { errors.push(Error::ValidityCriteria); } @@ -212,7 +219,7 @@ fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec) { } /// http://www.unicode.org/reports/tr46/#Processing -fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> String { +fn processing(domain: &str, flags: Flags, errors: &mut Vec) -> String { let mut mapped = String::new(); for c in domain.chars() { map_char(c, flags, &mut mapped, errors) @@ -226,7 +233,7 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> if label.starts_with("xn--") { match punycode::decode_to_string(&label["xn--".len()..]) { Some(decoded_label) => { - let flags = Uts46Flags { transitional_processing: false, ..flags }; + let flags = Flags { 
transitional_processing: false, ..flags }; validate(&decoded_label, flags, errors); validated.push_str(&decoded_label) } @@ -241,14 +248,14 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> } #[derive(Copy, Clone)] -pub struct Uts46Flags { +pub struct Flags { pub use_std3_ascii_rules: bool, pub transitional_processing: bool, pub verify_dns_length: bool, } #[derive(PartialEq, Eq, Clone, Copy, Debug)] -pub enum Error { +enum Error { PunycodeError, ValidityCriteria, DissallowedByStd3AsciiRules, @@ -257,11 +264,18 @@ pub enum Error { TooLongForDns, } +/// Errors recorded during UTS #46 processing. +/// +/// This is opaque for now, only indicating the presence of at least one error. +/// More details may be exposed in the future. +#[derive(Debug)] +pub struct Errors(Vec); + /// http://www.unicode.org/reports/tr46/#ToASCII -pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result> { +pub fn to_ascii(domain: &str, flags: Flags) -> Result { let mut errors = Vec::new(); let mut result = String::new(); - for label in uts46_processing(domain, flags, &mut errors).split('.') { + for label in processing(domain, flags, &mut errors).split('.') { if result.len() > 0 { result.push('.'); } @@ -288,36 +302,21 @@ pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result Result> { - uts46_to_ascii(domain, Uts46Flags { - use_std3_ascii_rules: false, - transitional_processing: true, // XXX: switch when Firefox does - verify_dns_length: false, - }) -} - /// http://www.unicode.org/reports/tr46/#ToUnicode /// /// Only `use_std3_ascii_rules` is used in `flags`. 
-pub fn uts46_to_unicode(domain: &str, mut flags: Uts46Flags) -> (String, Vec) { +pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) { flags.transitional_processing = false; let mut errors = Vec::new(); - let domain = uts46_processing(domain, flags, &mut errors); + let domain = processing(domain, flags, &mut errors); + let errors = if errors.is_empty() { + Ok(()) + } else { + Err(Errors(errors)) + }; (domain, errors) } - -/// https://url.spec.whatwg.org/#concept-domain-to-unicode -pub fn domain_to_unicode(domain: &str) -> (String, Vec) { - uts46_to_unicode(domain, Uts46Flags { - use_std3_ascii_rules: false, - - // Unused: - transitional_processing: true, - verify_dns_length: false, - }) -} diff --git a/src/idna_mapping.rs b/idna/src/uts46_mapping_table.rs similarity index 100% rename from src/idna_mapping.rs rename to idna/src/uts46_mapping_table.rs diff --git a/tests/IdnaTest.txt b/idna/tests/IdnaTest.txt similarity index 100% rename from tests/IdnaTest.txt rename to idna/tests/IdnaTest.txt diff --git a/tests/punycode.rs b/idna/tests/punycode.rs similarity index 79% rename from tests/punycode.rs rename to idna/tests/punycode.rs index ae42b34..0f660fe 100644 --- a/tests/punycode.rs +++ b/idna/tests/punycode.rs @@ -1,7 +1,12 @@ -extern crate url; -extern crate rustc_serialize; +// Copyright 2013 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
-use url::punycode::{decode, encode_str}; +use idna::punycode::{decode, encode_str}; use rustc_serialize::json::{Json, Object}; fn one_test(description: &str, decoded: &str, encoded: &str) { diff --git a/tests/punycode_tests.json b/idna/tests/punycode_tests.json similarity index 100% rename from tests/punycode_tests.json rename to idna/tests/punycode_tests.json diff --git a/idna/tests/tests.rs b/idna/tests/tests.rs new file mode 100644 index 0000000..087fcd3 --- /dev/null +++ b/idna/tests/tests.rs @@ -0,0 +1,5 @@ +extern crate idna; +extern crate rustc_serialize; + +mod punycode; +mod uts46; diff --git a/tests/idna.rs b/idna/tests/uts46.rs similarity index 89% rename from tests/idna.rs rename to idna/tests/uts46.rs index bb03f39..4328e33 100644 --- a/tests/idna.rs +++ b/idna/tests/uts46.rs @@ -1,7 +1,13 @@ -extern crate url; +// Copyright 2013-2014 Valentin Gosu. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. use std::char; -use url::idna; +use idna::uts46; #[test] fn test_uts46() { @@ -35,7 +41,7 @@ fn test_uts46() { continue; } - let result = idna::uts46_to_ascii(&source, idna::Uts46Flags { + let result = uts46::to_ascii(&source, uts46::Flags { use_std3_ascii_rules: true, transitional_processing: test_type == "T", verify_dns_length: true, diff --git a/src/lib.rs b/src/lib.rs index 9caffad..dcc3e9e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -141,8 +141,7 @@ extern crate serde; #[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; -extern crate unicode_normalization; -extern crate unicode_bidi; +extern crate idna; use std::fmt::{self, Formatter}; use std::str; @@ -170,9 +169,7 @@ mod parser; pub mod urlutils; pub mod percent_encoding; pub mod form_urlencoded; -pub mod punycode; pub mod format; -pub mod idna; /// The parsed representation of an absolute URL. 
#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)]