servo: Merge #7449 - Issue #7393: Properly sniff mislabeled feeds (from simartin:issue_7393); r=metajack

Hi,

This patch is an attempt to fix https://github.com/servo/servo/issues/7393, where the code detecting mislabeled feeds (see https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-feed) had spurious space in the URLs we need to match.

Note that my testing (in particular rdf_rss_ko_2.xml) highlighted a flaw in "matches", that failed to check that there were more bytes in the string being checked than in the string we're checking against, which completely broke the whole step 5.2.7.

Thanks in advance for your review.

Cheers,
  Simon

Source-Repo: https://github.com/servo/servo
Source-Revision: 9f85370885c84ebb58cd7f4a72a6e78948f468dc
This commit is contained in:
Simon Martin 2015-09-01 11:22:52 -06:00
parent 9d89ba65ed
commit b1a37d6095
7 changed files with 52 additions and 6 deletions

View File

@ -125,6 +125,10 @@ impl <'a, T: Iterator<Item=&'a u8> + Clone> Matches for T {
// Side effects
// moves the iterator when match is found
fn matches(&mut self, matches: &[u8]) -> bool {
if self.clone().nth(matches.len()).is_none() {
// there are less than matches.len() elements in self
return false
}
let result = self.clone().zip(matches).all(|(s, m)| *s == *m);
if result {
self.nth(matches.len());
@ -381,9 +385,10 @@ where T: Iterator<Item=&'a u8> + Clone {
struct FeedsClassifier;
impl FeedsClassifier {
// Implements sniffing for mislabeled feeds (https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-feed)
fn classify_impl(&self, data: &[u8]) -> Option<(&'static str, &'static str)> {
// can not be feed unless length is > 3
// Step 4: can not be feed unless length is > 3
if data.len() < 3 {
return None;
}
@ -403,6 +408,7 @@ impl FeedsClassifier {
return None;
}
// Steps 5.2.1 to 5.2.4
match eats_until(&mut matcher, b"?", b"?>")
.chain(|| eats_until(&mut matcher, b"!--", b"-->"))
.chain(|| eats_until(&mut matcher, b"!", b">")) {
@ -411,20 +417,23 @@ impl FeedsClassifier {
Match::Start => return None
}
// Step 5.2.5
if matcher.matches(b"rss") {
return Some(("application", "rss+xml"));
}
// Step 5.2.6
if matcher.matches(b"feed") {
return Some(("application", "atom+xml"));
}
if matcher.matches(b"rdf: RDF") {
// Step 5.2.7
if matcher.matches(b"rdf:RDF") {
while matcher.next().is_some() {
match eats_until(&mut matcher,
b"http: //purl.org/rss/1.0/",
b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#")
b"http://purl.org/rss/1.0/",
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#")
.chain(|| eats_until(&mut matcher,
b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#",
b"http: //purl.org/rss/1.0/")) {
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
b"http://purl.org/rss/1.0/")) {
Match::StartAndEnd => return Some(("application", "rss+xml")),
Match::DidNotMatch => {},
Match::Start => return None

View File

@ -434,7 +434,14 @@ fn test_sniff_utf_8_bom() {
#[test]
fn test_sniff_rss_feed() {
// RSS feeds
test_sniff_full(&PathBuf::from("text/xml/feed.rss"), "application", "rss+xml", Some(("text", "html")));
test_sniff_full(&PathBuf::from("text/xml/rdf_rss.xml"), "application", "rss+xml", Some(("text", "html")));
// Not RSS feeds
test_sniff_full(&PathBuf::from("text/xml/rdf_rss_ko_1.xml"), "text", "html", Some(("text", "html")));
test_sniff_full(&PathBuf::from("text/xml/rdf_rss_ko_2.xml"), "text", "html", Some(("text", "html")));
test_sniff_full(&PathBuf::from("text/xml/rdf_rss_ko_3.xml"), "text", "html", Some(("text", "html")));
test_sniff_full(&PathBuf::from("text/xml/rdf_rss_ko_4.xml"), "text", "html", Some(("text", "html")));
}
#[test]

View File

@ -0,0 +1,7 @@
<!-- Good format for a "RDF feed" -->
<?xml version="1.0"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
</rdf:RDF>

View File

@ -0,0 +1,7 @@
<!-- Bad format for a "RDF feed" (space between "rdf:" and "RDF") -->
<?xml version="1.0"?>
<rdf: RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
</rdf:RDF>

View File

@ -0,0 +1,3 @@
<!-- Bad format for a "RDF feed" (2 missing URLs) -->
<?xml version="1.0"?>
<rdf:RDF/>

View File

@ -0,0 +1,6 @@
<!-- Bad format for a "RDF feed" (one missing URL) -->
<?xml version="1.0"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
</rdf:RDF>

View File

@ -0,0 +1,7 @@
<!-- Bad format for a "RDF feed" (unexpected space in first URL) -->
<?xml version="1.0"?>
<rdf:RDF
xmlns:rdf="http: //www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
</rdf:RDF>