From f71bc2de38c04f08a08368fcfa443b587e37dd76 Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Tue, 15 Dec 2015 00:21:37 -0500 Subject: [PATCH] start implementing tree construction algorithm --- parse-html.coffee | 246 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 197 insertions(+), 49 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index db1837b..6344839 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -28,11 +28,11 @@ TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" -TYPE_WHITESPACE = 2 -TYPE_COMMENT = 3 +TYPE_COMMENT = 2 +TYPE_DOCTYPE = 3 # the following types are emited by the tokenizer, but shouldn't end up in the tree: TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] -TYPE_CLOSE_TAG = 5 # name +TYPE_END_TAG = 5 # name TYPE_EOF = 6 lc_alpha = "abcdefghijklmnopqrstuvwxqz" @@ -40,6 +40,11 @@ uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" +scopers = { # FIXME these are supposed to be namespace specific + 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object', + 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml', + 'foreignObject', 'desc', 'title' +} # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -129,6 +134,38 @@ mathml_elements = [ # foreign_elements = [svg_elements..., mathml_elements...] #normal_elements = All other allowed HTML elements are normal elements. +special_elements = { + # from HTML: + address: true, applet: true, area: true, article: true, aside: true, + base: true, basefont: true, bgsound: true, blockquote: true, body: true, + br: true, button: true, caption: true, center: true, col: true, + colgroup: true, dd: true, details: true, dir: true, div: true, dl: true, + dt: true, embed: true, fieldset: true, figcaption: true, figure: true, + footer: true, form: true, frame: true, frameset: true, h1: true, h2: true, + h3: true, h4: true, h5: true, h6: true, head: true, header: true, + hgroup: true, hr: true, html: true, iframe: true, img: true, input: true, + isindex: true, li: true, link: true, listing: true, main: true, + marquee: true, meta: true, nav: true, noembed: true, noframes: true, + noscript: true, object: true, ol: true, p: true, param: true, + plaintext: true, pre: true, script: true, section: true, select: true, + source: true, style: true, summary: true, table: true, tbody: true, + td: true, template: true, textarea: true, tfoot: true, th: true, + thead: true, title: true, tr: true, track: true, ul: true, wbr: true, + xmp: true, + + # from MathML: + mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true, + + # from SVG: + foreignObject: true, desc: true, title: true +} + +formatting_elements = { + a: true, b: true, big: true, code: true, em: true, font: true, i: true, + nobr: true, s: true, small: true, strike: true, strong: true, tt: true, + u: true +} + # decode_named_char_ref() # @@ -160,10 +197,159 @@ parse_html = (txt) -> tree_state = null tok_state = null tok_cur_tag = null # partially parsed tag + flag_frameset_ok = null + flag_parsing = null parse_error = -> console.log "Parse error at character #{cur} of #{txt.length}" + + # the functions below impliment the Tree Contstruction algorithm + # http://www.w3.org/TR/html5/syntax.html#tree-construction + + # But first... the helpers + template_tag_is_open = -> + for t of open_tags + if t[0] is TYPE_TAG and t[1] is 'template' + return true + return false + is_in_scope = (tag_name) -> + for t of open_tags + if t[0] is TYPE_TAG and t[1] is tag_name + return true + # FIXME bail if in scopers + return false + + reconstruct_active_formatting_elements = -> + # FIXME implement this + + # http://www.w3.org/TR/html5/syntax.html#close-a-p-element + # FIXME implement this + close_p_if_in_button_scope = -> + if open_tags[0][1] is 'p' # FIXME + open_tags.pop() + return + #p = find_button_scope 'p' + #if p? + # TODO generate_implied_end_tags except for p tags + # TODO parse_error unless open_tags[0][1] is 'p' + # TODO pop stack until 'p' popped + + + + # http://www.w3.org/TR/html5/syntax.html#insert-a-character + tree_insert_a_character = (t) -> + # FIXME read spec for "adjusted insertion location, etc, this might be wrong + if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT + open_tags[0][3][open_tags[0][3].length - 1][1] += t[1] + else + open_tags[0][3].push t + + # FIXME read spec, do this right + # note: this assumes it's an open tag + tree_insert_tag = (t) -> + t[0] = TYPE_TAG # not TYPE_OPEN_TAG + # convert attributes into a hash + attrs = {} + while t[2].length + a = t[2].pop() + attrs[a[0]] = a[1] + t[2] = attrs + open_tags[0][3].push t + open_tags.unshift t + + # http://www.w3.org/TR/html5/syntax.html#insert-a-comment + tree_insert_a_comment = (t) -> + # FIXME read spec for "adjusted insertion location, etc, this might be wrong + open_tags[0][3].push t + + # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody + tree_in_body = (t) -> + switch t[0] + when TYPE_TEXT + switch t[1] + when "\u0000" + parse_error() + when "\t", "\u000a", "\u000c", "\u000d", ' ' + reconstruct_active_formatting_elements() + tree_insert_a_character t + else + reconstruct_active_formatting_elements() + tree_insert_a_character t + flag_frameset_ok = false + when TYPE_COMMENT + tree_insert_a_comment t + when TYPE_DOCTYPE + parse_error() + when TYPE_OPEN_TAG + switch t[1] + when 'html' + parse_error() + return if template_tag_is_open() + root_attrs = open_tags[open_tags.length - 1][3] + for k, v of t[2] + root_attrs[k] = v unless root_attrs[k]? + when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' + # FIXME also do this for (end tag) + return tree_in_head t + when 'body' + parse_error() + # TODO + when 'frameset' + parse_error() + # TODO + when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul' + close_p_if_in_button_scope() + tree_insert_tag t + when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' + close_p_if_in_button_scope() + if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + parse_error() + open_tags.shift() + tree_insert_tag t + # TODO lots more to implement here + else # any other start tag + reconstruct_active_formatting_elements() + tree_insert_tag t + when TYPE_EOF + ok_tags = { + dd: true, dt: true, li: true, p: true, tbody: true, td: true, + tfoot: true, th: true, thead: true, tr: true, body: true, html: true, + } + for t in open_tags + unless ok_tags[t[1]]? + parse_error() + break + # TODO stack of template insertion modes thing + flag_parsing = false # stop parsing + when TYPE_END_TAG + switch t[1] + when 'body' + unless is_in_scope 'body' + parse_error() + return + # TODO implement parse error and move to tree_after_body + when 'html' + unless is_in_scope 'body' # weird, but it's what the spec says + parse_error() + return + # TODO implement parse error and move to tree_after_body, reprocess + # TODO lots more close tags to implement here + else + for node, i in open_tags + if node[1] is t[1] + # FIXME generate implied end tags except those with name==t[1] + parse_error() unless i is 0 + while i > 0 + open_tags.shift() + i -= 1 + open_tags.shift() + return + if special_elements[node[1]]? + parse_error() + return + + # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization @@ -223,10 +409,10 @@ parse_html = (txt) -> return [TYPE_TEXT, ' -1 - tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []] + tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []] tok_state = tok_state_tag_name else if lc_alpha.indexOf(c) > -1 - tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []] + tok_cur_tag = [TYPE_END_TAG, c, [], []] tok_state = tok_state_tag_name else parse_error() @@ -504,62 +690,23 @@ parse_html = (txt) -> return '&' return # never reached - # the functions below impliment the Tree Contstruction algorithm here: - # http://www.w3.org/TR/html5/syntax.html#tree-construction - # FIXME this is just a bit of a hack that makes sense... read spec and do it that way - tree_append = (t) -> - switch t[0] - when TYPE_TEXT - if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT - open_tags[0][3][open_tags[0][3].length - 1][1] += t[1] - else - open_tags[0][3].push t - when TYPE_OPEN_TAG - t[0] = TYPE_TAG - # convert attributes into a hash - attrs = {} - while t[2].length - a = t[2].pop() - attrs[a[0]] = a[1] - t[2] = attrs - # FIXME probs have to auto-close things first - open_tags[0][3].push t - open_tags.unshift t - # TODO implement formatting elements thing - when TYPE_CLOSE_TAG - # FIXME this is just a hack for now - if open_tags.length < 2 - parse_error() - return - if open_tags[0][1] isnt t[1] - parse_error() - # fall through and close something anyway - open_tags.shift() - when TYPE_EOF - return - # TODO implement close tags - # TODO implement self-closing tags - else - console.log "UNIMPLEMENTED tag type: #{t[0]}" - return - # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data tree = [TYPE_TAG, 'html', {}, []] open_tags = [tree] - tree_state = tree_append + tree_state = tree_in_body + flag_frameset_ok = true + flag_parsing = true # tokenizer initialization tok_state = tok_state_data # proccess input - loop + while flag_parsing t = tok_state() if t? tree_state t - if t[0] is TYPE_EOF - return tree[3] - return # never reached + return tree[3] # everything below is tests on the above test_equals = (description, fn, args..., expected_output) -> @@ -584,3 +731,4 @@ test_equals "attribute entity exceptions dq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' test_equals "attribute entity exceptions uq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' test_equals "matching closing tags", html_to_json, "foohi
1
foo
2
bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]' +test_equals "mis-matched closing tags", html_to_json, "foo
barbaz
qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]' -- 1.7.10.4