Split IDNA into a separate crate.

This commit is contained in:
Simon Sapin
2016-03-30 17:11:10 +02:00
parent be00f8f007
commit 83f4b9e954
15 changed files with 155 additions and 58 deletions
+2 -2
View File
@@ -1,3 +1,3 @@
/target
/Cargo.lock
target
Cargo.lock
/.cargo/config
+1 -6
View File
@@ -16,10 +16,6 @@ name = "format"
[[test]]
name = "form_urlencoded"
[[test]]
name = "idna"
[[test]]
name = "punycode"
[[test]]
name = "tests"
[[test]]
name = "wpt"
@@ -50,8 +46,7 @@ version = ">=0.6.1, <0.8"
optional = true
[dependencies]
idna = { version = "0.1.0", path = "./idna" }
uuid = { version = "0.2", features = ["v4"] }
rustc-serialize = "0.3"
unicode-bidi = "0.2.3"
unicode-normalization = "0.1.2"
matches = "0.1"
+18
View File
@@ -0,0 +1,18 @@
[package]
name = "idna"
version = "0.1.0"
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
repository = "https://github.com/servo/rust-url/"
license = "MIT/Apache-2.0"
[dependencies]
unicode-bidi = "0.2.3"
unicode-normalization = "0.1.2"
matches = "0.1"
[dev-dependencies]
rustc-serialize = "0.3"
[[test]]
name = "tests"
+73
View File
@@ -0,0 +1,73 @@
// Copyright 2016 Simon Sapin.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! This Rust crate implements IDNA
//! [per the WHATWG URL Standard](https://url.spec.whatwg.org/#idna).
//!
//! It also exposes the underlying algorithms from [*Unicode IDNA Compatibility Processing*
//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
//! and [Punycode (RFC 3492)](https://tools.ietf.org/html/rfc3492).
//!
//! Quoting from [UTS #46's introduction](http://www.unicode.org/reports/tr46/#Introduction):
//!
//! > Initially, domain names were restricted to ASCII characters.
//! > A system was introduced in 2003 for internationalized domain names (IDN).
//! > This system is called Internationalizing Domain Names for Applications,
//! > or IDNA2003 for short.
//! > This mechanism supports IDNs by means of a client software transformation
//! > into a format known as Punycode.
//! > A revision of IDNA was approved in 2010 (IDNA2008).
//! > This revision has a number of incompatibilities with IDNA2003.
//! >
//! > The incompatibilities force implementers of client software,
//! > such as browsers and emailers,
//! > to face difficult choices during the transition period
//! > as registries shift from IDNA2003 to IDNA2008.
//! > This document specifies a mechanism
//! > that minimizes the impact of this transition for client software,
//! > allowing client software to access domains that are valid under either system.
#[macro_use] extern crate matches;
extern crate unicode_bidi;
extern crate unicode_normalization;
pub mod punycode;
pub mod uts46;
/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm.
///
/// Returns the ASCII representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and using Punycode as necessary.
///
/// # Errors
///
/// This process may fail, in which case the `uts46::Errors` value
/// produced by `uts46::to_ascii` is returned.
pub fn domain_to_ascii(domain: &str) -> Result<String, uts46::Errors> {
    let flags = uts46::Flags {
        use_std3_ascii_rules: false,
        transitional_processing: true, // XXX: switch when Firefox does
        verify_dns_length: false,
    };
    uts46::to_ascii(domain, flags)
}
/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm.
///
/// Returns the Unicode representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and decoding Punycode as necessary.
///
/// The first element of the returned pair is always the mapped domain.
/// The second element may indicate
/// [syntax violations](https://url.spec.whatwg.org/#syntax-violation):
/// it is `Ok(())` when none were recorded and `Err(uts46::Errors)` otherwise.
pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) {
    let flags = uts46::Flags {
        use_std3_ascii_rules: false,
        // Unused: `uts46::to_unicode` overwrites `transitional_processing` itself,
        // and documents that only `use_std3_ascii_rules` is read from the flags.
        transitional_processing: true,
        verify_dns_length: false,
    };
    uts46::to_unicode(domain, flags)
}
@@ -6,8 +6,7 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# Run as: python make_idna_table.py idna_table.txt > src/idna_table.rs
# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
+37 -38
View File
@@ -1,6 +1,13 @@
//! International domain names
//!
//! https://url.spec.whatwg.org/#idna
// Copyright 2013-2014 Valentin Gosu.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! [*Unicode IDNA Compatibility Processing*
//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
use self::Mapping::*;
use punycode;
@@ -9,7 +16,7 @@ use unicode_normalization::UnicodeNormalization;
use unicode_normalization::char::is_combining_mark;
use unicode_bidi::{BidiClass, bidi_class};
include!("idna_mapping.rs");
include!("uts46_mapping_table.rs");
#[derive(Debug)]
enum Mapping {
@@ -23,9 +30,9 @@ enum Mapping {
}
struct Range {
pub from: char,
pub to: char,
pub mapping: Mapping,
from: char,
to: char,
mapping: Mapping,
}
fn find_char(codepoint: char) -> &'static Mapping {
@@ -45,7 +52,7 @@ fn find_char(codepoint: char) -> &'static Mapping {
&TABLE[min].mapping
}
fn map_char(codepoint: char, flags: Uts46Flags, output: &mut String, errors: &mut Vec<Error>) {
fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
match *find_char(codepoint) {
Mapping::Valid => output.push(codepoint),
Mapping::Ignored => {},
@@ -185,7 +192,7 @@ fn passes_bidi(label: &str, transitional_processing: bool) -> bool {
}
/// http://www.unicode.org/reports/tr46/#Validity_Criteria
fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec<Error>) {
fn validate(label: &str, flags: Flags, errors: &mut Vec<Error>) {
if label.nfc().ne(label.chars()) {
errors.push(Error::ValidityCriteria);
}
@@ -212,7 +219,7 @@ fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec<Error>) {
}
/// http://www.unicode.org/reports/tr46/#Processing
fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec<Error>) -> String {
fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
let mut mapped = String::new();
for c in domain.chars() {
map_char(c, flags, &mut mapped, errors)
@@ -226,7 +233,7 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec<Error>) ->
if label.starts_with("xn--") {
match punycode::decode_to_string(&label["xn--".len()..]) {
Some(decoded_label) => {
let flags = Uts46Flags { transitional_processing: false, ..flags };
let flags = Flags { transitional_processing: false, ..flags };
validate(&decoded_label, flags, errors);
validated.push_str(&decoded_label)
}
@@ -241,14 +248,14 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec<Error>) ->
}
#[derive(Copy, Clone)]
pub struct Uts46Flags {
pub struct Flags {
pub use_std3_ascii_rules: bool,
pub transitional_processing: bool,
pub verify_dns_length: bool,
}
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
pub enum Error {
enum Error {
PunycodeError,
ValidityCriteria,
DissallowedByStd3AsciiRules,
@@ -257,11 +264,18 @@ pub enum Error {
TooLongForDns,
}
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, only indicating the presence of at least one error.
/// More details may be exposed in the future.
#[derive(Debug)]
pub struct Errors(Vec<Error>);
/// http://www.unicode.org/reports/tr46/#ToASCII
pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result<String, Vec<Error>> {
pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> {
let mut errors = Vec::new();
let mut result = String::new();
for label in uts46_processing(domain, flags, &mut errors).split('.') {
for label in processing(domain, flags, &mut errors).split('.') {
if result.len() > 0 {
result.push('.');
}
@@ -288,36 +302,21 @@ pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result<String, Vec<Err
if errors.is_empty() {
Ok(result)
} else {
Err(errors)
Err(Errors(errors))
}
}
/// https://url.spec.whatwg.org/#concept-domain-to-ascii
pub fn domain_to_ascii(domain: &str) -> Result<String, Vec<Error>> {
uts46_to_ascii(domain, Uts46Flags {
use_std3_ascii_rules: false,
transitional_processing: true, // XXX: switch when Firefox does
verify_dns_length: false,
})
}
/// http://www.unicode.org/reports/tr46/#ToUnicode
///
/// Only `use_std3_ascii_rules` is used in `flags`.
pub fn uts46_to_unicode(domain: &str, mut flags: Uts46Flags) -> (String, Vec<Error>) {
pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) {
flags.transitional_processing = false;
let mut errors = Vec::new();
let domain = uts46_processing(domain, flags, &mut errors);
let domain = processing(domain, flags, &mut errors);
let errors = if errors.is_empty() {
Ok(())
} else {
Err(Errors(errors))
};
(domain, errors)
}
/// https://url.spec.whatwg.org/#concept-domain-to-unicode
pub fn domain_to_unicode(domain: &str) -> (String, Vec<Error>) {
uts46_to_unicode(domain, Uts46Flags {
use_std3_ascii_rules: false,
// Unused:
transitional_processing: true,
verify_dns_length: false,
})
}
+8 -3
View File
@@ -1,7 +1,12 @@
extern crate url;
extern crate rustc_serialize;
// Copyright 2013 Simon Sapin.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use url::punycode::{decode, encode_str};
use idna::punycode::{decode, encode_str};
use rustc_serialize::json::{Json, Object};
fn one_test(description: &str, decoded: &str, encoded: &str) {
+5
View File
@@ -0,0 +1,5 @@
extern crate idna;
extern crate rustc_serialize;
mod punycode;
mod uts46;
+9 -3
View File
@@ -1,7 +1,13 @@
extern crate url;
// Copyright 2013-2014 Valentin Gosu.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::char;
use url::idna;
use idna::uts46;
#[test]
fn test_uts46() {
@@ -35,7 +41,7 @@ fn test_uts46() {
continue;
}
let result = idna::uts46_to_ascii(&source, idna::Uts46Flags {
let result = uts46::to_ascii(&source, uts46::Flags {
use_std3_ascii_rules: true,
transitional_processing: test_type == "T",
verify_dns_length: true,
+1 -4
View File
@@ -141,8 +141,7 @@ extern crate serde;
#[cfg(feature="heap_size")]
#[macro_use] extern crate heapsize;
extern crate unicode_normalization;
extern crate unicode_bidi;
extern crate idna;
use std::fmt::{self, Formatter};
use std::str;
@@ -170,9 +169,7 @@ mod parser;
pub mod urlutils;
pub mod percent_encoding;
pub mod form_urlencoded;
pub mod punycode;
pub mod format;
pub mod idna;
/// The parsed representation of an absolute URL.
#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)]