From 21b830a978743768386edde1d872dc69cc51ee77 Mon Sep 17 00:00:00 2001 From: Margaret Meyerhofer Date: Tue, 29 May 2012 17:32:07 -0700 Subject: [PATCH] servo: Merge #9 - Added css datastructures, pretty-printing, and a lexer (from mmeyerho:cssmatching) Source-Repo: https://github.com/servo/servo Source-Revision: a2174ba81845a51e0d12aa187b1757e87cc5ab4b --- servo/src/servo/content.rs | 2 +- servo/src/servo/dom/base.rs | 7 +- servo/src/servo/dom/style.rs | 125 ++++++ servo/src/servo/parser/html.rs | 268 ------------- servo/src/servo/parser/html_builder.rs | 4 +- servo/src/servo/parser/lexer.rs | 501 +++++++++++++++++++++++++ servo/src/servo/servo.rc | 7 +- servo/src/servo/servo.rs | 4 +- 8 files changed, 639 insertions(+), 279 deletions(-) create mode 100644 servo/src/servo/dom/style.rs delete mode 100644 servo/src/servo/parser/html.rs create mode 100644 servo/src/servo/parser/lexer.rs diff --git a/servo/src/servo/content.rs b/servo/src/servo/content.rs index 4f69ee499fd4..3841a27d1a8b 100644 --- a/servo/src/servo/content.rs +++ b/servo/src/servo/content.rs @@ -40,7 +40,7 @@ fn content(to_layout: chan) -> chan { // Note: we can parse the next document in parallel // with any previous documents. - let stream = html::spawn_parser_task(filename); + let stream = lexer::spawn_html_parser_task(filename); let root = parser::html_builder::build_dom(scope, stream); // Now, join the layout so that they will see the latest diff --git a/servo/src/servo/dom/base.rs b/servo/src/servo/dom/base.rs index 1228b3edbdfe..ec95bd5120ca 100644 --- a/servo/src/servo/dom/base.rs +++ b/servo/src/servo/dom/base.rs @@ -54,9 +54,10 @@ enum element_subclass { es_head } -#[doc="The rd_aux data is a (weak) pointer to the layout data, which contains - the CSS info as well as the primary box. Note that there may be multiple - boxes per DOM node."] +#[doc="The rd_aux data is a (weak) pointer to the layout data, which + contains the CSS info as well as the primary box. 
Note that + there may be multiple boxes per DOM node."] + type node = rcu::handle; type node_scope = rcu::scope; diff --git a/servo/src/servo/dom/style.rs b/servo/src/servo/dom/style.rs new file mode 100644 index 000000000000..3095e5aebc9a --- /dev/null +++ b/servo/src/servo/dom/style.rs @@ -0,0 +1,125 @@ +import io::println; + + +enum display_type{ + block, + inline +} + +enum style_decl{ + font_size(uint), + display(display_type), + text_color(uint), + background_color(uint) +} + +enum attr{ + exists(str), + exact(str, str), + includes(str, str), + starts_with(str, str) +} + +enum selector{ + element(str, [attr]), + child(~selector, ~selector), + descendant(~selector, ~selector), + sibling(~selector, ~selector) +} + +type rule = (selector, [style_decl]); + +type stylesheet = [rule]; + +/* Renders every item with `print` and joins the results with ", "; an empty list yields "". */ +fn print_list<T>(list : [T], print : fn(T) -> str) -> str { + let l = vec::len(list); + if l == 0u { ret "" } + + let mut res = print(list[0]); + let mut i = 1u; + + while i < l { + res += ", "; + res += print(list[i]); + i += 1u; + } + + ret res; +} + +fn print_display(dis_ty : display_type) -> str { + alt dis_ty { + block { "block" } + inline { "inline" } + } +} + +fn print_style(decl : style_decl) -> str{ + alt decl { + font_size(s) { #fmt("Font size = %u px", s) } + display(dis_ty) { #fmt("Display style = %s", print_display(dis_ty)) } + text_color(c) { #fmt("Text color = 0x%06x", c) } + background_color(c) { #fmt("Background color = 0x%06x", c) } + } +} + +fn print_attr(attribute : attr) -> str { + alt attribute { + exists(att) { #fmt("[%s]", att) } + exact(att, val) { #fmt("[%s = %s]", att, val) } + includes(att, val) { #fmt("[%s ~= %s]", att, val) } + starts_with(att, val) { #fmt("[%s |= %s]", att, val) } + } +} + +fn print_selector(select : ~selector) -> str { + alt *select { + element(s, attrs) { #fmt("Element %s with attributes: %s", s, + print_list(attrs, print_attr)) } + child(sel1, sel2) { #fmt("(%s) > (%s)", print_selector(sel1), + print_selector(sel2)) } + 
descendant(sel1, sel2) { #fmt("(%s) (%s)", print_selector(sel1), + print_selector(sel2)) } + sibling(sel1, sel2) { #fmt("(%s) + (%s)", print_selector(sel1), + print_selector(sel2)) } + } +} + +fn print_rule(rule : rule) -> str { + alt rule { + (sel, styles) { + let sel_str = print_selector(~(copy sel)); + let sty_str = print_list(styles, print_style); + + #fmt("Selector: %s, Style: {%s}", sel_str, sty_str) + } + } +} + +fn print_sheet(sheet : stylesheet) -> str { + #fmt("CSS Rules: %s", print_list(sheet, print_rule)) +} + +#[test] +fn test_pretty_print() { + let test1 = [(element("p", []), [font_size(32u)])]; + let actual1 = print_sheet(test1); + let expected1 = "CSS Rules: Selector: Element p with attributes: ," + + " Style: {Font size = 32 px}"; + + assert(actual1 == expected1); + + let elmt1 = ~element("*", []); + let elmt2 = ~element("body", [exact("class", "2")]); + + let test2 = [(descendant(elmt1, elmt2), + [display(block), text_color(0u)])]; + + let actual2 = print_sheet(test2); + let expected2 = "CSS Rules: Selector: (Element * with attributes: ) " + + "(Element body with attributes: [class = 2]), " + + "Style: {Display style = block, Text color = 0x000000}"; + + assert(actual2 == expected2); +} diff --git a/servo/src/servo/parser/html.rs b/servo/src/servo/parser/html.rs deleted file mode 100644 index 13c9273dc92d..000000000000 --- a/servo/src/servo/parser/html.rs +++ /dev/null @@ -1,268 +0,0 @@ -import comm::{port, chan}; - -enum parse_state { - ps_normal, - ps_tag -} - -type parser = { - mut lookahead: option, - mut state: parse_state, - reader: io::reader -}; - -enum token { - to_start_opening_tag(str), - to_end_opening_tag, - to_end_tag(str), - to_self_close_tag, - to_text(str), - to_attr(str, str), - to_doctype, - to_eof -} - -enum char_or_eof { - coe_char(u8), - coe_eof -} - -impl u8_methods for u8 { - fn is_alpha() -> bool { - ret (self >= ('A' as u8) && self <= ('Z' as u8)) || - (self >= ('a' as u8) && self <= ('z' as u8)); - } -} - -impl 
u8_vec_methods for [u8] { - fn to_str() -> str { ret str::from_bytes(self); } - fn to_str_token() -> token { ret to_text(self.to_str()); } -} - -impl methods for parser { - fn get() -> char_or_eof { - alt self.lookahead { - some(coe) { - let rv = coe; - self.lookahead = none; - ret rv; - } - none { - /* fall through */ - } - } - - if self.reader.eof() { ret coe_eof; } - ret coe_char(self.reader.read_byte() as u8); - } - - fn unget(ch: u8) { - assert self.lookahead.is_none(); - self.lookahead = some(coe_char(ch)); - } - - fn parse_err(err: str) -> ! { - fail err - } - - fn expect(ch: u8) { - alt self.get() { - coe_char(c) { - if c != ch { - self.parse_err(#fmt("expected '%c'", ch as char)); - } - } - coe_eof { - self.parse_err(#fmt("expected '%c' at eof", ch as char)); - } - } - } - - fn parse_ident() -> str { - let mut result: [u8] = []; - loop { - alt self.get() { - coe_char(c) { - if (c.is_alpha()) { - result += [c]; - } else if result.len() == 0u { - self.parse_err("expected ident"); - } else { - self.unget(c); - break; - } - } - coe_eof { - self.parse_err("expected ident"); - } - } - } - ret str::from_bytes(result); - } - - fn expect_ident(expected: str) { - let actual = self.parse_ident(); - if expected != actual { - self.parse_err(#fmt("expected '%s' but found '%s'", - expected, actual)); - } - } - - fn eat_whitespace() { - loop { - alt self.get() { - coe_char(c) { - if c != (' ' as u8) && c != ('\n' as u8) && - c != ('\t' as u8) { - self.unget(c); - ret; - } - } - coe_eof { - ret; - } - } - } - } - - fn parse() -> token { - let mut ch: u8; - alt self.get() { - coe_char(c) { ch = c; } - coe_eof { ret to_eof; } - } - - let token = alt self.state { - ps_normal { self.parse_in_normal_state(ch) } - ps_tag { self.parse_in_tag_state(ch) } - }; - - #debug["token=%?", token]; - ret token; - } - - fn parse_in_normal_state(c: u8) -> token { - let mut ch = c; - if ch == ('<' as u8) { - alt self.get() { - coe_char(c) { ch = c; } - coe_eof { self.parse_err("eof after 
'<'") } - } - - if ch == ('!' as u8) { - self.eat_whitespace(); - self.expect_ident("DOCTYPE"); - self.eat_whitespace(); - self.expect_ident("html"); - self.eat_whitespace(); - self.expect('>' as u8); - ret to_doctype; - } - - if ch == ('/' as u8) { - let ident = self.parse_ident(); - self.expect('>' as u8); - ret to_end_tag(ident); - } - - self.unget(ch); - - self.eat_whitespace(); - let ident = self.parse_ident(); - self.eat_whitespace(); - - self.state = ps_tag; - ret to_start_opening_tag(ident); - } - - // Make a text node. - let mut s: [u8] = [ch]; - loop { - alt self.get() { - coe_char(c) { - if c == ('<' as u8) { - self.unget(c); - ret s.to_str_token(); - } - s += [c]; - } - coe_eof { ret s.to_str_token(); } - } - } - } - - fn parse_in_tag_state(c: u8) -> token { - let mut ch = c; - - if ch == ('>' as u8) { - self.state = ps_normal; - ret to_end_opening_tag; - } - - if ch == ('/' as u8) { - self.state = ps_normal; - ret to_self_close_tag; - } - - if !ch.is_alpha() { - fail #fmt("expected alphabetical in tag but found %c", ch as char); - } - - // Parse an attribute. - let mut attribute_name = [ch]; - loop { - alt self.get() { - coe_char(c) { - if c == ('=' as u8) { break; } - attribute_name += [c]; - } - coe_eof { - ret to_attr(attribute_name.to_str(), - attribute_name.to_str()); } - } - } - - // Parse the attribute value. - self.expect('"' as u8); - let mut attribute_value = []; - loop { - alt self.get() { - coe_char(c) { - if c == ('"' as u8) { break; } - attribute_value += [c]; - } - coe_eof { - ret to_attr(attribute_name.to_str(), - attribute_value.to_str()); - } - } - } - - // Eat whitespace. 
- self.eat_whitespace(); - - ret to_attr(attribute_name.to_str(), attribute_value.to_str()); - } -} - -fn parser(reader: io::reader) -> parser { - ret { mut lookahead: none, mut state: ps_normal, reader: reader }; -} - -fn spawn_parser_task(filename: str) -> port { - let result_port = port(); - let result_chan = chan(result_port); - task::spawn {|| - let file_data = io::read_whole_file(filename).get(); - let reader = io::bytes_reader(file_data); - let parser = parser(reader); - - loop { - let token = parser.parse(); - result_chan.send(token); - if token == to_eof { break; } - } - }; - ret result_port; -} - diff --git a/servo/src/servo/parser/html_builder.rs b/servo/src/servo/parser/html_builder.rs index 8ea396b040fc..babe0f1dd78f 100644 --- a/servo/src/servo/parser/html_builder.rs +++ b/servo/src/servo/parser/html_builder.rs @@ -5,8 +5,8 @@ import dom::base::{attr, element, element_subclass, es_div, es_head, es_img}; import dom::base::{es_unknown, methods, nk_element, nk_text, rd_tree_ops}; import dom::base::{wr_tree_ops}; import dom = dom::base; -import parser = parser::html; -import html::token; +import parser = parser::lexer::html; +import parser::token; import gfx::geom; import dvec::extensions; diff --git a/servo/src/servo/parser/lexer.rs b/servo/src/servo/parser/lexer.rs new file mode 100644 index 000000000000..4e53ef2e0977 --- /dev/null +++ b/servo/src/servo/parser/lexer.rs @@ -0,0 +1,501 @@ +import comm::{port, chan}; +import html::html_methods; +import css::css_methods; +import dom::style; + +enum parse_state { + ps_html_normal, + ps_html_tag, + ps_css_elmt, + ps_css_relation, + ps_css_desc, + ps_css_attribute +} + +type parser = { + mut lookahead: option<char_or_eof>, /* one-byte pushback slot: filled by unget(), drained by get() */ + mut state: parse_state, + reader: io::reader +}; + +enum char_or_eof { + coe_char(u8), + coe_eof +} + +impl u8_methods for u8 { + fn is_whitespace() -> bool { + ret self == ' ' as u8 || self == '\n' as u8 + || self == '\t' as u8; + } + + fn is_alpha() -> bool { + ret (self >= ('A' as u8) && self <= 
('Z' as u8)) || + (self >= ('a' as u8) && self <= ('z' as u8)); + } +} + +impl u8_vec_methods for [u8] { + fn to_str() -> str { ret str::from_bytes(self); } + fn to_html_token() -> html::token { ret html::to_text(self.to_str()); } + fn to_css_token() -> html::token { ret html::to_text(self.to_str()); } +} + +impl util_methods for parser { + fn get() -> char_or_eof { + alt self.lookahead { + some(coe) { + let rv = coe; + self.lookahead = none; + ret rv; + } + none { + /* fall through */ + } + } + + if self.reader.eof() { ret coe_eof; } + ret coe_char(self.reader.read_byte() as u8); + } + + fn unget(ch: u8) { + assert self.lookahead.is_none(); + self.lookahead = some(coe_char(ch)); + } + + fn parse_err(err: str) -> ! { + fail err + } + + fn expect(ch: u8) { + alt self.get() { + coe_char(c) { + if c != ch { + self.parse_err(#fmt("expected '%c'", ch as char)); + } + } + coe_eof { + self.parse_err(#fmt("expected '%c' at eof", ch as char)); + } + } + } + + fn parse_ident() -> str { + let mut result: [u8] = []; + loop { + alt self.get() { + coe_char(c) { + if (c.is_alpha()) { + result += [c]; + } else if result.len() == 0u { + self.parse_err("expected ident"); + } else { + self.unget(c); + break; + } + } + coe_eof { + self.parse_err("expected ident"); + } + } + } + ret str::from_bytes(result); + } + + fn expect_ident(expected: str) { + let actual = self.parse_ident(); + if expected != actual { + self.parse_err(#fmt("expected '%s' but found '%s'", + expected, actual)); + } + } + + fn eat_whitespace() { /* Discards blanks; the first non-blank byte is pushed back for the next get(). Stops quietly at eof. */ + loop { + alt self.get() { + coe_char(c) { + if !c.is_whitespace() { + self.unget(c); + ret; + } + } + coe_eof { + ret; + } + } + } + } + + fn parse_html() -> html::token { + let mut ch: u8; + alt self.get() { + coe_char(c) { ch = c; } + coe_eof { ret html::to_eof; } + } + + let token = alt self.state { + ps_html_normal { self.parse_in_normal_state(ch) } + ps_html_tag { self.parse_in_tag_state(ch) } + _ { fail "Parsing in html mode when not in " + + "an html state" } + }; + 
+ #debug["token=%?", token]; + ret token; + } + + fn parse_css() -> css::token { + let mut ch: u8; + alt self.get() { + coe_char(c) { ch = c; } + coe_eof { ret css::to_eof; } + } + + let token = alt self.state { + ps_css_desc { self.parse_css_description(ch) } + ps_css_attribute { self.parse_css_attribute(ch) } + ps_css_elmt { self.parse_css_element(ch) } + ps_css_relation { self.parse_css_relation(ch) } + _ { fail "Parsing in css mode when not in " + + "a css state" } + }; + + #debug["token=%?", token]; + ret token; + } +} + +mod html { + enum token { + to_start_opening_tag(str), + to_end_opening_tag, + to_end_tag(str), + to_self_close_tag, + to_text(str), + to_attr(str, str), + to_doctype, + to_eof + } + + impl html_methods for parser { + fn parse_in_normal_state(c: u8) -> token { + let mut ch = c; + if ch == ('<' as u8) { + alt self.get() { + coe_char(c) { ch = c; } + coe_eof { self.parse_err("eof after '<'") } + } + + if ch == ('!' as u8) { + self.eat_whitespace(); + self.expect_ident("DOCTYPE"); + self.eat_whitespace(); + self.expect_ident("html"); + self.eat_whitespace(); + self.expect('>' as u8); + ret to_doctype; + } + + if ch == ('/' as u8) { + let ident = self.parse_ident(); + self.expect('>' as u8); + ret to_end_tag(ident); + } + + self.unget(ch); + + self.eat_whitespace(); + let ident = self.parse_ident(); + self.eat_whitespace(); + + self.state = ps_html_tag; + ret to_start_opening_tag(ident); + } + + // Make a text node. 
+ let mut s: [u8] = [ch]; + loop { + alt self.get() { + coe_char(c) { + if c == ('<' as u8) { + self.unget(c); + ret s.to_html_token(); + } + s += [c]; + } + coe_eof { ret s.to_html_token(); } + } + } + } + + fn parse_in_tag_state(c: u8) -> token { + let mut ch = c; + + if ch == ('>' as u8) { + self.state = ps_html_normal; + ret to_end_opening_tag; + } + + if ch == ('/' as u8) { + self.state = ps_html_normal; + ret to_self_close_tag; + } + + if !ch.is_alpha() { + fail #fmt("expected alphabetical in tag but found %c", + ch as char); + } + + // Parse an attribute. + let mut attribute_name = [ch]; + loop { + alt self.get() { + coe_char(c) { + if c == ('=' as u8) { break; } + attribute_name += [c]; + } + coe_eof { + ret to_attr(attribute_name.to_str(), + attribute_name.to_str()); } + } + } + + // Parse the attribute value. + self.expect('"' as u8); + let mut attribute_value = []; + loop { + alt self.get() { + coe_char(c) { + if c == ('"' as u8) { break; } + attribute_value += [c]; + } + coe_eof { + ret to_attr(attribute_name.to_str(), + attribute_value.to_str()); + } + } + } + + // Eat whitespace. + self.eat_whitespace(); + + ret to_attr(attribute_name.to_str(), attribute_value.to_str()); + } + } +} + +mod css { + enum token { + to_start_desc, + to_end_desc, + to_descendant, + to_child, + to_sibling, + to_comma, + to_elmt(str), + to_attr(style::attr), + to_desc(str, str), + to_eof + } + + impl css_methods for parser { + fn parse_css_relation(c : u8) -> token { /* Maps a combinator byte to its token; any other byte already belongs to the next element, so it is pushed back and reported as a descendant relation. */ + self.state = ps_css_elmt; + + let token = alt c { + '{' as u8 { self.state = ps_css_desc; to_start_desc } + '>' as u8 { to_child } + '+' as u8 { to_sibling } + ',' as u8 { to_comma } + _ { self.unget(c); to_descendant } + }; + + self.eat_whitespace(); + + ret token; + } + + fn parse_css_element(c : u8) -> token { + /* Check for special attributes with an implied element.*/ + if c == '.' 
as u8 || c == '#' as u8 { + self.state = ps_css_attribute; + self.unget(c); + ret to_elmt("*"); + } + /* c is the first byte of the element name; push it back so parse_ident reads the whole name. */ self.unget(c); + let element = self.parse_ident(); + self.state = ps_css_attribute; + + ret to_elmt(element); + } + + fn parse_css_attribute(c : u8) -> token { + let mut ch = c; + + /* If we've reached the end of this list of attributes, + look for the relation to the next element.*/ + if c.is_whitespace() { + self.state = ps_css_relation; + self.eat_whitespace(); + + alt self.get() { + coe_char(c) { ch = c } + coe_eof { fail "File ended before description " + + "of style" } + } + + ret self.parse_css_relation(ch); + } + + alt ch { + '.' as u8 { ret to_attr( + style::includes("class", self.parse_ident())); } + '#' as u8 { ret to_attr( + style::includes("id", self.parse_ident())); } + '[' as u8 { + let attr_name = self.parse_ident(); + + alt self.get() { + coe_char(c) { ch = c; } + coe_eof { fail "File ended before " + + "description finished"; } + } + + if ch == ']' as u8 { + ret to_attr(style::exists(attr_name)); + } else if ch == '=' as u8 { + let attr_val = self.parse_ident(); + self.expect(']' as u8); + ret to_attr(style::exact(attr_name, attr_val)); + } else if ch == '~' as u8 { + self.expect('=' as u8); + let attr_val = self.parse_ident(); + self.expect(']' as u8); + ret to_attr(style::includes(attr_name, attr_val)); + } else if ch == '|' as u8 { + self.expect('=' as u8); + let attr_val = self.parse_ident(); + self.expect(']' as u8); + ret to_attr(style::starts_with(attr_name, attr_val)); + } + + fail #fmt("Unexpected symbol %c in attribute", ch as char); + } + _ { fail #fmt("Unexpected symbol %c in attribute", + ch as char); } + } + } + + fn parse_css_description(c: u8) -> token { + let mut ch = c; + + if ch.is_whitespace() { + self.eat_whitespace(); + + alt self.get() { + coe_char(c) { ch = c } + coe_eof { fail "Reached end of file " + + "in CSS description" } + } + } + + let mut desc_name = []; + + // Get the name of the descriptor + loop { + if ch.is_whitespace() { + 
self.eat_whitespace(); + } else if ch == ':' as u8 { + if desc_name.len() == 0u { + fail "Expected descriptor name"; + } else { + break; + } + } else { + desc_name += [ch]; + } + + alt self.get() { + coe_char(c) { ch = c } + coe_eof { fail "Reached end of file " + + "in CSS description" } + } + } + + self.eat_whitespace(); + let mut desc_val = []; + + // Get the value of the descriptor + loop { + alt self.get() { + coe_char(c) { ch = c } + coe_eof { fail "Reached end of file " + + "in CSS description" } + } + + if ch.is_whitespace() { + self.eat_whitespace(); + } else if ch == '}' as u8 { + if desc_val.len() == 0u { + fail "Expected descriptor value"; + } else { + self.state = ps_css_elmt; + break; + } + } else if ch == ';' as u8 { + if desc_val.len() == 0u { + fail "Expected descriptor value"; + } else { + break; + } + } else { + desc_val += [ch]; + } + } + + ret to_desc(desc_name.to_str(), desc_val.to_str()); + } + } +} + +fn parser(reader: io::reader, state : parse_state) -> parser { + ret { mut lookahead: none, mut state: state, reader: reader }; +} + +fn spawn_html_parser_task(filename: str) -> port<html::token> { /* Lexes the file on a new task and sends every token, html::to_eof last, over the returned port. */ + let result_port = port(); + let result_chan = chan(result_port); + task::spawn {|| + let file_data = io::read_whole_file(filename).get(); + let reader = io::bytes_reader(file_data); + + assert filename.ends_with(".html"); + let parser = parser(reader, ps_html_normal); + + loop { + let token = parser.parse_html(); + result_chan.send(token); + if token == html::to_eof { break; } + } + }; + ret result_port; +} + +fn spawn_css_parser_task(filename: str) -> port<css::token> { /* Same as the html variant, but starts in ps_css_elmt and ends on css::to_eof. */ + let result_port = port(); + let result_chan = chan(result_port); + task::spawn {|| + let file_data = io::read_whole_file(filename).get(); + let reader = io::bytes_reader(file_data); + + assert filename.ends_with(".css"); + let parser : parser = parser(reader, ps_css_elmt); + + loop { + let token = parser.parse_css(); + result_chan.send(token); + if token == css::to_eof { break; } + } + }; + ret 
result_port; +} diff --git a/servo/src/servo/servo.rc b/servo/src/servo/servo.rc index ddb3764d6893..7cba2bddf350 100755 --- a/servo/src/servo/servo.rc +++ b/servo/src/servo/servo.rc @@ -16,6 +16,7 @@ use stb_image; mod dom { mod base; mod rcu; + mod style; } mod gfx { @@ -26,7 +27,7 @@ mod gfx { } mod image { - mod base; + mod base; mod encode { mod tga; } @@ -34,7 +35,7 @@ mod image { mod layout { mod style { - mod apply; + mod apply; mod style; } @@ -48,7 +49,7 @@ mod layout { } mod parser { - mod html; + mod lexer; mod html_builder; } diff --git a/servo/src/servo/servo.rs b/servo/src/servo/servo.rs index c20d3208afbd..e7607e55f831 100644 --- a/servo/src/servo/servo.rs +++ b/servo/src/servo/servo.rs @@ -1,6 +1,6 @@ import comm::*; -import parser::html; -import parser::html::methods; +import parser::lexer; +//import parser::lexer::util_methods; import result::extensions; import gfx::renderer; import platform::osmain;