servo: Merge #17037 - Added Async HTML Tokenizer (from cynicaldevil:impl-Sink); r=nox

Design: I realized having two different parsers for sync and async was wrong, because the API for both was fundamentally the same. All I needed to do was create another Tokenizer, because `ParseNode` (a representation for nodes which are yet to be created) is used by the TreeBuilder and the Sink, and the `Tokenizer` is the 'lowermost' type concerned with these two types.

Therefore, I created one and placed it in `async_html.rs`, and also created a new Sink which deals with `ParseNode`s. I changed the methods in ServoParser to take an `async` argument too, which decides which Tokenizer will be used. The Tokenizer isn't exactly *async* for now, but this PR separates action creation from execution, which allows the async behaviour to be implemented later. Right now, all actions are executed soon after they are created.

The Sink consists of two HashMaps: `nodes`, which contains the actual nodes, keyed by their corresponding `ParseNode`'s id, and `parse_node_data`, which contains metadata about the nodes.

It's still a bit rough (I can't figure out how to deal with `complete_script` and `is_mathml_annotation_xml_integration_point`, along with some other parts I wrote in a hurry), but I believe the overall design is sound. I'd like to hear what you think about it.

Source-Repo: https://github.com/servo/servo
Source-Revision: 9c2dffdf72efe4274bb514407edc552b14fc0a4d

--HG--
extra : subtree_source : https%3A//hg.mozilla.org/projects/converted-servo-linear
extra : subtree_revision : 706e41f53fd00305d695faa3c78757c6771cf6c9
This commit is contained in:
Nikhil Shagrithaya 2017-06-17 02:21:19 -07:00
parent a679f93781
commit a03ae92655
3 changed files with 515 additions and 5 deletions

View File

@ -0,0 +1,493 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#![allow(unrooted_must_root)]
use dom::bindings::codegen::Bindings::HTMLTemplateElementBinding::HTMLTemplateElementMethods;
use dom::bindings::codegen::Bindings::NodeBinding::NodeMethods;
use dom::bindings::inheritance::Castable;
use dom::bindings::js::{JS, MutNullableJS, Root};
use dom::bindings::str::DOMString;
use dom::bindings::trace::JSTraceable;
use dom::comment::Comment;
use dom::document::Document;
use dom::documenttype::DocumentType;
use dom::element::{Element, ElementCreator};
use dom::htmlformelement::{FormControlElementHelpers, HTMLFormElement};
use dom::htmlscriptelement::HTMLScriptElement;
use dom::htmltemplateelement::HTMLTemplateElement;
use dom::node::Node;
use dom::processinginstruction::ProcessingInstruction;
use dom::virtualmethods::vtable_for;
use html5ever::{Attribute, QualName, ExpandedName};
use html5ever::buffer_queue::BufferQueue;
use html5ever::tendril::StrTendril;
use html5ever::tokenizer::{Tokenizer as HtmlTokenizer, TokenizerOpts, TokenizerResult};
use html5ever::tree_builder::{NodeOrText, TreeSink, NextParserState, QuirksMode, ElementFlags};
use html5ever::tree_builder::{Tracer as HtmlTracer, TreeBuilder, TreeBuilderOpts};
use js::jsapi::JSTracer;
use servo_url::ServoUrl;
use std::ascii::AsciiExt;
use std::borrow::Cow;
use std::cell::Cell;
use std::collections::HashMap;
use style::context::QuirksMode as ServoQuirksMode;
/// HTML tokenizer whose tree builder works on `ParseNode` handles and reports
/// tree mutations to a `Sink`.
#[derive(HeapSizeOf, JSTraceable)]
#[must_root]
pub struct Tokenizer {
/// The html5ever tokenizer; its sink is a `TreeBuilder` over `ParseNode`
/// handles, whose own sink is the `Sink` defined in this file.
#[ignore_heap_size_of = "Defined in html5ever"]
inner: HtmlTokenizer<TreeBuilder<ParseNode, Sink>>,
}
impl Tokenizer {
    /// Creates a tokenizer that parses into `document`.
    ///
    /// `url` becomes the base URL stored in the sink. When `fragment_context`
    /// is `Some`, the tree builder is set up for fragment parsing: the context
    /// element (and the form element, if any) are registered in the sink under
    /// freshly allocated parse node ids, and the tokenizer's initial state is
    /// derived from the context element.
    pub fn new(
            document: &Document,
            url: ServoUrl,
            fragment_context: Option<super::FragmentContext>)
            -> Self {
        let mut sink = Sink::new(url, document);

        let options = TreeBuilderOpts {
            ignore_missing_rules: true,
            .. Default::default()
        };

        let inner = if let Some(fc) = fragment_context {
            // NOTE(review): parse nodes returned by `new_parse_node` have
            // `qual_name: None`, and `Sink::elem_name` panics on a handle
            // without a qualified name. If the tree builder asks for the
            // context element's name, this path would panic -- confirm before
            // routing fragment parsing through this tokenizer.
            let ctxt_parse_node = sink.new_parse_node();
            // Go through `insert_node` so the "id registered only once"
            // assertion also covers the fragment context nodes.
            sink.insert_node(ctxt_parse_node.id, JS::from_ref(fc.context_elem));

            let form_parse_node = fc.form_elem.map(|form_elem| {
                let node = sink.new_parse_node();
                sink.insert_node(node.id, JS::from_ref(form_elem));
                node
            });

            let tb = TreeBuilder::new_for_fragment(
                sink,
                ctxt_parse_node,
                form_parse_node,
                options);

            let tok_options = TokenizerOpts {
                initial_state: Some(tb.tokenizer_state_for_context_elem()),
                .. Default::default()
            };

            HtmlTokenizer::new(tb, tok_options)
        } else {
            HtmlTokenizer::new(TreeBuilder::new(sink, options), Default::default())
        };

        Tokenizer {
            inner: inner,
        }
    }

    /// Feeds `input` to the underlying tokenizer.
    ///
    /// Returns `Ok(())` when the tokenizer reports `TokenizerResult::Done`.
    /// When it reports `TokenizerResult::Script`, the DOM node registered for
    /// that handle is looked up and returned as `Err(script_element)`.
    ///
    /// # Panics
    ///
    /// Panics if the script handle has no registered node, or if that node is
    /// not an `HTMLScriptElement`.
    pub fn feed(&mut self, input: &mut BufferQueue) -> Result<(), Root<HTMLScriptElement>> {
        match self.inner.feed(input) {
            TokenizerResult::Done => Ok(()),
            TokenizerResult::Script(script) => {
                let node = self.inner.sink.sink.get_node(&script.id);
                Err(Root::from_ref(node.downcast().expect(
                    "Tokenizer yielded a script handle that is not a script element")))
            },
        }
    }

    /// Signals the end of input to the underlying tokenizer.
    pub fn end(&mut self) {
        self.inner.end();
    }

    /// Returns the base URL stored in the sink.
    pub fn url(&self) -> &ServoUrl {
        &self.inner.sink.sink.base_url
    }

    /// Forwards to the underlying tokenizer's `set_plaintext_state`.
    pub fn set_plaintext_state(&mut self) {
        self.inner.set_plaintext_state();
    }
}
#[allow(unsafe_code)]
unsafe impl JSTraceable for HtmlTokenizer<TreeBuilder<ParseNode, Sink>> {
    /// Traces every `ParseNode` handle held by the tree builder, then traces
    /// the `Sink` itself.
    unsafe fn trace(&self, trc: *mut JSTracer) {
        // Adapter that lets html5ever's handle-tracing hook forward each
        // handle to the JS tracer it wraps.
        struct Tracer(*mut JSTracer);

        impl HtmlTracer for Tracer {
            type Handle = ParseNode;

            #[allow(unrooted_must_root)]
            fn trace_handle(&self, handle: &ParseNode) {
                // SAFETY: `self.0` is the tracer pointer passed to the
                // enclosing `trace` call, which is still executing.
                unsafe { handle.trace(self.0); }
            }
        }

        let builder = &self.sink;
        builder.trace_handles(&Tracer(trc));
        builder.sink.trace(trc);
    }
}
/// Identifier of a parse node; used as the key of both `Sink::nodes` and
/// `Sink::parse_node_data`.
type ParseNodeID = usize;
/// Handle type given to the html5ever tree builder in place of a DOM node.
/// The DOM node registered for a handle is looked up through its `id`.
#[derive(JSTraceable, Clone, HeapSizeOf)]
pub struct ParseNode {
/// Key into the sink's `nodes` and `parse_node_data` maps.
id: ParseNodeID,
/// Qualified name, set by `create_element`; `None` for every other kind of
/// parse node created in this file.
qual_name: Option<QualName>,
}
/// Per-parse-node metadata kept by the `Sink`, keyed by `ParseNodeID`.
///
/// The derived `Default` yields `None` for every optional field and `false`
/// for `is_integration_point`, which is what a freshly created parse node
/// starts with.
#[derive(Default, HeapSizeOf, JSTraceable)]
struct ParseNodeData {
    /// Processing-instruction target, stored by `create_pi`.
    target: Option<String>,
    /// Processing-instruction data, stored by `create_pi`.
    data: Option<String>,
    /// Handle of the template contents, cached by `get_template_contents` so
    /// repeated calls return the same handle.
    contents: Option<ParseNode>,
    /// Computed in `create_element` from the element's `encoding` attribute
    /// and returned by `is_mathml_annotation_xml_integration_point`.
    is_integration_point: bool,
}
/// A tree mutation requested by the tree builder. Operations are built by the
/// `TreeSink` methods on `Sink` and carried out by `Sink::process_operation`.
enum ParseOperation {
/// (template element id, id under which its contents node is registered)
GetTemplateContents(ParseNodeID, ParseNodeID),
/// (id to register the new element under, element name, attributes)
CreateElement(ParseNodeID, QualName, Vec<Attribute>),
/// (comment text, id to register the new comment under)
CreateComment(StrTendril, ParseNodeID),
/// (sibling id, node or text to insert before that sibling)
AppendBeforeSibling(ParseNodeID, NodeOrText<ParseNode>),
/// (parent id, node or text to append to that parent)
Append(ParseNodeID, NodeOrText<ParseNode>),
/// (name, public id, system id) of the doctype appended to the document
AppendDoctypeToDocument(StrTendril, StrTendril, StrTendril),
/// (target element id, attributes to set on it)
AddAttrsIfMissing(ParseNodeID, Vec<Attribute>),
/// Id of the node to detach from its parent, if it has one.
RemoveFromParent(ParseNodeID),
/// Id of the node to flag as already started, if it is a script element.
MarkScriptAlreadyStarted(ParseNodeID),
/// (current parent id, new parent id): moves every child across.
ReparentChildren(ParseNodeID, ParseNodeID),
/// (target node id, form element id)
AssociateWithForm(ParseNodeID, ParseNodeID),
/// Id of the processing instruction to create; its target and data are read
/// from the node's `ParseNodeData`.
CreatePI(ParseNodeID),
/// Id of the node whose virtual `pop` hook is invoked.
Pop(ParseNodeID),
}
/// `TreeSink` implementation whose handles are `ParseNode`s. It maps parse
/// node ids to DOM nodes and applies `ParseOperation`s to the document.
#[derive(JSTraceable, HeapSizeOf)]
#[must_root]
pub struct Sink {
/// Base URL, exposed through `Tokenizer::url`.
base_url: ServoUrl,
/// The document being parsed into.
document: JS<Document>,
/// Last line number reported via `set_current_line`; passed to
/// `ElementCreator::ParserCreated` when elements are created.
current_line: u64,
/// Initialized to its default in `Sink::new`.
/// NOTE(review): not read or written anywhere else in this file -- confirm
/// whether it is still needed.
script: MutNullableJS<HTMLScriptElement>,
/// Metadata for every parse node, keyed by id.
parse_node_data: HashMap<ParseNodeID, ParseNodeData>,
/// Id that the next call to `new_parse_node` will hand out.
next_parse_node_id: Cell<ParseNodeID>,
/// DOM nodes that have been created or registered, keyed by parse node id.
nodes: HashMap<ParseNodeID, JS<Node>>,
/// Handle for the document itself (id 0), returned by `get_document`.
document_node: ParseNode,
}
impl Sink {
fn new(base_url: ServoUrl, document: &Document) -> Sink {
let mut sink = Sink {
base_url: base_url,
document: JS::from_ref(document),
current_line: 1,
script: Default::default(),
parse_node_data: HashMap::new(),
next_parse_node_id: Cell::new(1),
nodes: HashMap::new(),
document_node: ParseNode {
id: 0,
qual_name: None,
}
};
let data = ParseNodeData::default();
sink.insert_parse_node_data(0, data);
sink.insert_node(0, JS::from_ref(document.upcast()));
sink
}
fn new_parse_node(&mut self) -> ParseNode {
let id = self.next_parse_node_id.get();
let data = ParseNodeData::default();
self.insert_parse_node_data(id, data);
self.next_parse_node_id.set(id + 1);
ParseNode {
id: id,
qual_name: None,
}
}
fn insert_node(&mut self, id: ParseNodeID, node: JS<Node>) {
assert!(self.nodes.insert(id, node).is_none());
}
fn get_node<'a>(&'a self, id: &ParseNodeID) -> &'a JS<Node> {
self.nodes.get(id).expect("Node not found!")
}
fn insert_parse_node_data(&mut self, id: ParseNodeID, data: ParseNodeData) {
assert!(self.parse_node_data.insert(id, data).is_none());
}
fn get_parse_node_data<'a>(&'a self, id: &'a ParseNodeID) -> &'a ParseNodeData {
self.parse_node_data.get(id).expect("Parse Node data not found!")
}
fn get_parse_node_data_mut<'a>(&'a mut self, id: &'a ParseNodeID) -> &'a mut ParseNodeData {
self.parse_node_data.get_mut(id).expect("Parse Node data not found!")
}
fn process_operation(&mut self, op: ParseOperation) {
let document = Root::from_ref(&**self.get_node(&0));
let document = document.downcast::<Document>().expect("Document node should be downcasted!");
match op {
ParseOperation::GetTemplateContents(target, contents) => {
let target = Root::from_ref(&**self.get_node(&target));
let template = target.downcast::<HTMLTemplateElement>().expect(
"Tried to extract contents from non-template element while parsing");
self.insert_node(contents, JS::from_ref(template.Content().upcast()));
}
ParseOperation::CreateElement(id, name, attrs) => {
let elem = Element::create(name, &*self.document,
ElementCreator::ParserCreated(self.current_line));
for attr in attrs {
elem.set_attribute_from_parser(attr.name, DOMString::from(String::from(attr.value)), None);
}
self.insert_node(id, JS::from_ref(elem.upcast()));
}
ParseOperation::CreateComment(text, id) => {
let comment = Comment::new(DOMString::from(String::from(text)), document);
self.insert_node(id, JS::from_ref(&comment.upcast()));
}
ParseOperation::AppendBeforeSibling(sibling, node) => {
let node = match node {
NodeOrText::AppendNode(n) => NodeOrText::AppendNode(JS::from_ref(&**self.get_node(&n.id))),
NodeOrText::AppendText(text) => NodeOrText::AppendText(text)
};
let sibling = &**self.get_node(&sibling);
let parent = &*sibling.GetParentNode().expect("append_before_sibling called on node without parent");
super::insert(parent, Some(sibling), node);
}
ParseOperation::Append(parent, node) => {
let node = match node {
NodeOrText::AppendNode(n) => NodeOrText::AppendNode(JS::from_ref(&**self.get_node(&n.id))),
NodeOrText::AppendText(text) => NodeOrText::AppendText(text)
};
let parent = &**self.get_node(&parent);
super::insert(parent, None, node);
}
ParseOperation::AppendDoctypeToDocument(name, public_id, system_id) => {
let doctype = DocumentType::new(
DOMString::from(String::from(name)), Some(DOMString::from(String::from(public_id))),
Some(DOMString::from(String::from(system_id))), document);
document.upcast::<Node>().AppendChild(doctype.upcast()).expect("Appending failed");
}
ParseOperation::AddAttrsIfMissing(target_id, attrs) => {
let elem = self.get_node(&target_id).downcast::<Element>()
.expect("tried to set attrs on non-Element in HTML parsing");
for attr in attrs {
elem.set_attribute_from_parser(attr.name, DOMString::from(String::from(attr.value)), None);
}
}
ParseOperation::RemoveFromParent(target) => {
if let Some(ref parent) = self.get_node(&target).GetParentNode() {
parent.RemoveChild(&**self.get_node(&target)).unwrap();
}
}
ParseOperation::MarkScriptAlreadyStarted(node) => {
let script = self.get_node(&node).downcast::<HTMLScriptElement>();
script.map(|script| script.set_already_started(true));
}
ParseOperation::ReparentChildren(parent, new_parent) => {
let parent = self.get_node(&parent);
let new_parent = self.get_node(&new_parent);
while let Some(child) = parent.GetFirstChild() {
new_parent.AppendChild(&child).unwrap();
}
}
ParseOperation::AssociateWithForm(target, form) => {
let form = self.get_node(&form);
let form = Root::downcast::<HTMLFormElement>(Root::from_ref(&**form))
.expect("Owner must be a form element");
let node = self.get_node(&target);
let elem = node.downcast::<Element>();
let control = elem.and_then(|e| e.as_maybe_form_control());
if let Some(control) = control {
control.set_form_owner_from_parser(&form);
} else {
// TODO remove this code when keygen is implemented.
assert!(node.NodeName() == "KEYGEN", "Unknown form-associatable element");
}
}
ParseOperation::Pop(node) => {
vtable_for(self.get_node(&node)).pop();
}
ParseOperation::CreatePI(node) => {
let pi;
{
let data = self.get_parse_node_data(&node);
pi = ProcessingInstruction::new(
DOMString::from(data.target.clone().unwrap()),
DOMString::from(data.data.clone().unwrap()),
document);
}
self.insert_node(node, JS::from_ref(pi.upcast()));
}
}
}
}
/// `TreeSink` implementation over `ParseNode` handles.
///
/// Methods that mutate the tree build a `ParseOperation` and pass it to
/// `Sink::process_operation`, which currently runs it immediately.
#[allow(unrooted_must_root)]
impl TreeSink for Sink {
    type Output = Self;
    fn finish(self) -> Self { self }

    type Handle = ParseNode;

    /// Returns the handle of the document (parse node id 0).
    fn get_document(&mut self) -> Self::Handle {
        self.document_node.clone()
    }

    /// Returns the handle for the contents of the template `target`,
    /// creating and caching it on first use.
    fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle {
        if let Some(ref contents) = self.get_parse_node_data(&target.id).contents {
            return contents.clone();
        }
        let node = self.new_parse_node();
        self.get_parse_node_data_mut(&target.id).contents = Some(node.clone());
        self.process_operation(ParseOperation::GetTemplateContents(target.id, node.id));
        node
    }

    /// Two handles denote the same node when their ids are equal.
    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
        x.id == y.id
    }

    /// Returns the expanded name stored on the handle.
    ///
    /// Panics if the handle has no qualified name, i.e. it was not produced
    /// by `create_element`.
    fn elem_name<'a>(&self, target: &'a Self::Handle) -> ExpandedName<'a> {
        target.qual_name.as_ref().expect("Expected qual name of node!").expanded()
    }

    /// Whether the two elements are in the same home subtree. Panics if
    /// either handle does not refer to an `Element`.
    fn same_tree(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
        let x = self.get_node(&x.id);
        let y = self.get_node(&y.id);

        let x = x.downcast::<Element>().expect("Element node expected");
        let y = y.downcast::<Element>().expect("Element node expected");
        x.is_in_same_home_subtree(y)
    }

    /// Allocates a handle carrying `name`, records whether its attributes
    /// make it an integration point, and creates the element.
    fn create_element(&mut self, name: QualName, attrs: Vec<Attribute>, _flags: ElementFlags)
        -> Self::Handle {
        let mut node = self.new_parse_node();
        node.qual_name = Some(name.clone());

        // Check the attribute name first and compare the value in place, so
        // no string is allocated per attribute.
        let is_integration_point = attrs.iter().any(|attr| {
            let value: &str = &attr.value;
            (attr.name.local == local_name!("encoding") && attr.name.ns == ns!()) &&
            (value.eq_ignore_ascii_case("text/html") ||
             value.eq_ignore_ascii_case("application/xhtml+xml"))
        });
        self.get_parse_node_data_mut(&node.id).is_integration_point = is_integration_point;

        self.process_operation(ParseOperation::CreateElement(node.id, name, attrs));
        node
    }

    /// Allocates a handle and creates a comment node with `text` for it.
    fn create_comment(&mut self, text: StrTendril) -> Self::Handle {
        let node = self.new_parse_node();
        self.process_operation(ParseOperation::CreateComment(text, node.id));
        node
    }

    /// Allocates a handle, stores the processing instruction's target and
    /// data in its metadata, and creates the node.
    fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> ParseNode {
        let node = self.new_parse_node();
        {
            let node_data = self.get_parse_node_data_mut(&node.id);
            node_data.target = Some(String::from(target));
            node_data.data = Some(String::from(data));
        }
        self.process_operation(ParseOperation::CreatePI(node.id));
        node
    }

    /// Whether the DOM node behind `node` currently has a parent.
    fn has_parent_node(&self, node: &Self::Handle) -> bool {
        self.get_node(&node.id).GetParentNode().is_some()
    }

    fn associate_with_form(&mut self, target: &Self::Handle, form: &Self::Handle) {
        self.process_operation(ParseOperation::AssociateWithForm(target.id, form.id));
    }

    fn append_before_sibling(&mut self,
                             sibling: &Self::Handle,
                             new_node: NodeOrText<Self::Handle>) {
        self.process_operation(ParseOperation::AppendBeforeSibling(sibling.id, new_node));
    }

    /// Parse errors are only logged at debug level.
    fn parse_error(&mut self, msg: Cow<'static, str>) {
        debug!("Parse error: {}", msg);
    }

    /// Translates html5ever's quirks mode into Servo's and sets it on the
    /// document.
    fn set_quirks_mode(&mut self, mode: QuirksMode) {
        let mode = match mode {
            QuirksMode::Quirks => ServoQuirksMode::Quirks,
            QuirksMode::LimitedQuirks => ServoQuirksMode::LimitedQuirks,
            QuirksMode::NoQuirks => ServoQuirksMode::NoQuirks,
        };
        self.document.set_quirks_mode(mode);
    }

    fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
        self.process_operation(ParseOperation::Append(parent.id, child));
    }

    fn append_doctype_to_document(&mut self, name: StrTendril, public_id: StrTendril,
                                  system_id: StrTendril) {
        self.process_operation(ParseOperation::AppendDoctypeToDocument(name, public_id, system_id));
    }

    fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>) {
        self.process_operation(ParseOperation::AddAttrsIfMissing(target.id, attrs));
    }

    fn remove_from_parent(&mut self, target: &Self::Handle) {
        self.process_operation(ParseOperation::RemoveFromParent(target.id));
    }

    fn mark_script_already_started(&mut self, node: &Self::Handle) {
        self.process_operation(ParseOperation::MarkScriptAlreadyStarted(node.id));
    }

    /// Always panics: this sink does not support `complete_script`.
    fn complete_script(&mut self, _: &Self::Handle) -> NextParserState {
        panic!("complete_script should not be called here!");
    }

    fn reparent_children(&mut self, parent: &Self::Handle, new_parent: &Self::Handle) {
        self.process_operation(ParseOperation::ReparentChildren(parent.id, new_parent.id));
    }

    /// https://html.spec.whatwg.org/multipage/#html-integration-point
    /// Specifically, the <annotation-xml> cases.
    ///
    /// Returns the flag computed from the attributes in `create_element`.
    fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool {
        self.get_parse_node_data(&handle.id).is_integration_point
    }

    /// Records the line number used for elements created afterwards.
    fn set_current_line(&mut self, line_number: u64) {
        self.current_line = line_number;
    }

    fn pop(&mut self, node: &Self::Handle) {
        self.process_operation(ParseOperation::Pop(node.id));
    }
}

View File

@ -43,6 +43,7 @@ use profile_traits::time::{TimerMetadata, TimerMetadataFrameType};
use profile_traits::time::{TimerMetadataReflowType, ProfilerCategory, profile};
use script_thread::ScriptThread;
use script_traits::DocumentActivity;
use servo_config::prefs::PREFS;
use servo_config::resource_files::read_resource_file;
use servo_url::ServoUrl;
use std::ascii::AsciiExt;
@ -51,6 +52,7 @@ use std::cell::Cell;
use std::mem;
use style::context::QuirksMode as ServoQuirksMode;
mod async_html;
mod html;
mod xml;
@ -102,10 +104,17 @@ enum LastChunkState {
impl ServoParser {
pub fn parse_html_document(document: &Document, input: DOMString, url: ServoUrl) {
let parser = ServoParser::new(document,
Tokenizer::Html(self::html::Tokenizer::new(document, url, None)),
LastChunkState::NotReceived,
ParserKind::Normal);
let parser = if PREFS.get("dom.servoparser.async_html_tokenizer.enabled").as_boolean().unwrap() {
ServoParser::new(document,
Tokenizer::AsyncHtml(self::async_html::Tokenizer::new(document, url, None)),
LastChunkState::NotReceived,
ParserKind::Normal)
} else {
ServoParser::new(document,
Tokenizer::Html(self::html::Tokenizer::new(document, url, None)),
LastChunkState::NotReceived,
ParserKind::Normal)
};
parser.parse_string_chunk(String::from(input));
}
@ -138,6 +147,7 @@ impl ServoParser {
// Step 11.
let form = context_node.inclusive_ancestors()
.find(|element| element.is::<HTMLFormElement>());
let fragment_context = FragmentContext {
context_elem: context_node,
form_elem: form.r(),
@ -145,7 +155,7 @@ impl ServoParser {
let parser = ServoParser::new(&document,
Tokenizer::Html(self::html::Tokenizer::new(&document,
url.clone(),
url,
Some(fragment_context))),
LastChunkState::Received,
ParserKind::Normal);
@ -485,6 +495,7 @@ enum ParserKind {
#[must_root]
enum Tokenizer {
Html(self::html::Tokenizer),
AsyncHtml(self::async_html::Tokenizer),
Xml(self::xml::Tokenizer),
}
@ -492,6 +503,7 @@ impl Tokenizer {
fn feed(&mut self, input: &mut BufferQueue) -> Result<(), Root<HTMLScriptElement>> {
match *self {
Tokenizer::Html(ref mut tokenizer) => tokenizer.feed(input),
Tokenizer::AsyncHtml(ref mut tokenizer) => tokenizer.feed(input),
Tokenizer::Xml(ref mut tokenizer) => tokenizer.feed(input),
}
}
@ -499,6 +511,7 @@ impl Tokenizer {
fn end(&mut self) {
match *self {
Tokenizer::Html(ref mut tokenizer) => tokenizer.end(),
Tokenizer::AsyncHtml(ref mut tokenizer) => tokenizer.end(),
Tokenizer::Xml(ref mut tokenizer) => tokenizer.end(),
}
}
@ -506,6 +519,7 @@ impl Tokenizer {
fn url(&self) -> &ServoUrl {
match *self {
Tokenizer::Html(ref tokenizer) => tokenizer.url(),
Tokenizer::AsyncHtml(ref tokenizer) => tokenizer.url(),
Tokenizer::Xml(ref tokenizer) => tokenizer.url(),
}
}
@ -513,6 +527,7 @@ impl Tokenizer {
fn set_plaintext_state(&mut self) {
match *self {
Tokenizer::Html(ref mut tokenizer) => tokenizer.set_plaintext_state(),
Tokenizer::AsyncHtml(ref mut tokenizer) => tokenizer.set_plaintext_state(),
Tokenizer::Xml(_) => unimplemented!(),
}
}
@ -520,6 +535,7 @@ impl Tokenizer {
fn profiler_category(&self) -> ProfilerCategory {
match *self {
Tokenizer::Html(_) => ProfilerCategory::ScriptParseHTML,
Tokenizer::AsyncHtml(_) => ProfilerCategory::ScriptParseHTML,
Tokenizer::Xml(_) => ProfilerCategory::ScriptParseXML,
}
}

View File

@ -10,6 +10,7 @@
"dom.permissions.enabled": false,
"dom.permissions.testing.allowed_in_nonsecure_contexts": false,
"dom.serviceworker.timeout_seconds": 60,
"dom.servoparser.async_html_tokenizer.enabled": false,
"dom.testable_crash.enabled": false,
"dom.testbinding.enabled": false,
"dom.webvr.enabled": false,