JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
start implementing tree construction algorithm
authorJason Woofenden <jason@jasonwoof.com>
Tue, 15 Dec 2015 05:21:37 +0000 (00:21 -0500)
committerJason Woofenden <jason@jasonwoof.com>
Tue, 15 Dec 2015 05:21:37 +0000 (00:21 -0500)
parse-html.coffee

index db1837b..6344839 100644 (file)
 
 TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
-TYPE_WHITESPACE = 2
-TYPE_COMMENT = 3
+TYPE_COMMENT = 2
+TYPE_DOCTYPE = 3
 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_CLOSE_TAG = 5 # name
+TYPE_END_TAG = 5 # name
 TYPE_EOF = 6
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
@@ -40,6 +40,11 @@ uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
 digits = "0123456789"
 alnum = lc_alpha + uc_alpha + digits
 hex_chars = digits + "abcdefABCDEF"
+scopers = { # FIXME these are supposed to be namespace specific
+       'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
+       'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
+       'foreignObject', 'desc', 'title'
+}
 
 # some SVG elements have dashes in them
 tag_name_chars = alnum + "-"
@@ -129,6 +134,38 @@ mathml_elements = [
 # foreign_elements = [svg_elements..., mathml_elements...]
 #normal_elements = All other allowed HTML elements are normal elements.
 
+special_elements = {
+       # from HTML:
+       address: true, applet: true, area: true, article: true, aside: true,
+       base: true, basefont: true, bgsound: true, blockquote: true, body: true,
+       br: true, button: true, caption: true, center: true, col: true,
+       colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
+       dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
+       footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
+       h3: true, h4: true, h5: true, h6: true, head: true, header: true,
+       hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
+       isindex: true, li: true, link: true, listing: true, main: true,
+       marquee: true, meta: true, nav: true, noembed: true, noframes: true,
+       noscript: true, object: true, ol: true, p: true, param: true,
+       plaintext: true, pre: true, script: true, section: true, select: true,
+       source: true, style: true, summary: true, table: true, tbody: true,
+       td: true, template: true, textarea: true, tfoot: true, th: true,
+       thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
+       xmp: true,
+
+       # from MathML:
+       mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
+
+       # from SVG:
+       foreignObject: true, desc: true, title: true
+}
+
+formatting_elements = {
+        a: true, b: true, big: true, code: true, em: true, font: true, i: true,
+        nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
+        u: true
+}
+
 
 # decode_named_char_ref()
 #
@@ -160,10 +197,159 @@ parse_html = (txt) ->
        tree_state = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag
+       flag_frameset_ok = null
+       flag_parsing = null
 
        parse_error = ->
                console.log "Parse error at character #{cur} of #{txt.length}"
 
+
+       # the functions below impliment the Tree Contstruction algorithm
+       # http://www.w3.org/TR/html5/syntax.html#tree-construction
+
+       # But first... the helpers
+       template_tag_is_open = ->
+               for t of open_tags
+                       if t[0] is TYPE_TAG and t[1] is 'template'
+                               return true
+               return false
+       is_in_scope = (tag_name) ->
+               for t of open_tags
+                       if t[0] is TYPE_TAG and t[1] is tag_name
+                               return true
+                       # FIXME bail if in scopers
+               return false
+
+       reconstruct_active_formatting_elements = ->
+               # FIXME implement this
+
+       # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
+       # FIXME implement this
+       close_p_if_in_button_scope = ->
+               if open_tags[0][1] is 'p' # FIXME
+                       open_tags.pop()
+               return
+               #p = find_button_scope 'p'
+               #if p?
+                       # TODO generate_implied_end_tags except for p tags
+                       # TODO parse_error unless open_tags[0][1] is 'p'
+                       # TODO pop stack until 'p' popped
+
+
+
+       # http://www.w3.org/TR/html5/syntax.html#insert-a-character
+       tree_insert_a_character = (t) ->
+               # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+               if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+                       open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+               else
+                       open_tags[0][3].push t
+
+       # FIXME read spec, do this right
+       # note: this assumes it's an open tag
+       tree_insert_tag = (t) ->
+               t[0] = TYPE_TAG # not TYPE_OPEN_TAG
+               # convert attributes into a hash
+               attrs = {}
+               while t[2].length
+                       a = t[2].pop()
+                       attrs[a[0]] = a[1]
+               t[2] = attrs
+               open_tags[0][3].push t
+               open_tags.unshift t
+
+       # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
+       tree_insert_a_comment = (t) ->
+               # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+               open_tags[0][3].push t
+
+       # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+       tree_in_body = (t) ->
+               switch t[0]
+                       when TYPE_TEXT
+                               switch t[1]
+                                       when "\u0000"
+                                               parse_error()
+                                       when "\t", "\u000a", "\u000c", "\u000d", ' '
+                                               reconstruct_active_formatting_elements()
+                                               tree_insert_a_character t
+                                       else
+                                               reconstruct_active_formatting_elements()
+                                               tree_insert_a_character t
+                                               flag_frameset_ok = false
+                       when TYPE_COMMENT
+                               tree_insert_a_comment t
+                       when TYPE_DOCTYPE
+                               parse_error()
+                       when TYPE_OPEN_TAG
+                               switch t[1]
+                                       when 'html'
+                                               parse_error()
+                                               return if template_tag_is_open()
+                                               root_attrs = open_tags[open_tags.length - 1][3]
+                                               for k, v of t[2]
+                                                       root_attrs[k] = v unless root_attrs[k]?
+                                       when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
+                                               # FIXME also do this for </template> (end tag)
+                                               return tree_in_head t
+                                       when 'body'
+                                               parse_error()
+                                               # TODO
+                                       when 'frameset'
+                                               parse_error()
+                                               # TODO
+                                       when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
+                                               close_p_if_in_button_scope()
+                                               tree_insert_tag t
+                                       when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+                                               close_p_if_in_button_scope()
+                                               if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+                                                       parse_error()
+                                                       open_tags.shift()
+                                               tree_insert_tag t
+                                       # TODO lots more to implement here
+                                       else # any other start tag
+                                               reconstruct_active_formatting_elements()
+                                               tree_insert_tag t
+                       when TYPE_EOF
+                               ok_tags = {
+                                       dd: true, dt: true, li: true, p: true, tbody: true, td: true,
+                                       tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
+                               }
+                               for t in open_tags
+                                       unless ok_tags[t[1]]?
+                                               parse_error()
+                                               break
+                               # TODO stack of template insertion modes thing
+                               flag_parsing = false # stop parsing
+                       when TYPE_END_TAG
+                               switch t[1]
+                                       when 'body'
+                                               unless is_in_scope 'body'
+                                                       parse_error()
+                                                       return
+                                               # TODO implement parse error and move to tree_after_body
+                                       when 'html'
+                                               unless is_in_scope 'body' # weird, but it's what the spec says
+                                                       parse_error()
+                                                       return
+                                               # TODO implement parse error and move to tree_after_body, reprocess
+                                       # TODO lots more close tags to implement here
+                                       else
+                                               for node, i in open_tags
+                                                       if node[1] is t[1]
+                                                               # FIXME generate implied end tags except those with name==t[1]
+                                                               parse_error() unless i is 0
+                                                               while i > 0
+                                                                       open_tags.shift()
+                                                                       i -= 1
+                                                               open_tags.shift()
+                                                               return
+                                                       if special_elements[node[1]]?
+                                                               parse_error()
+                                                               return
+
+
        # the functions below implement the tokenizer stats described here:
        # http://www.w3.org/TR/html5/syntax.html#tokenization
 
@@ -223,10 +409,10 @@ parse_html = (txt) ->
                                return [TYPE_TEXT, '</']
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
+                                       tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
                                        tok_state = tok_state_tag_name
                                else if lc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+                                       tok_cur_tag = [TYPE_END_TAG, c, [], []]
                                        tok_state = tok_state_tag_name
                                else
                                        parse_error()
@@ -504,62 +690,23 @@ parse_html = (txt) ->
                                        return '&'
                return # never reached
 
-       # the functions below impliment the Tree Contstruction algorithm here:
-       # http://www.w3.org/TR/html5/syntax.html#tree-construction
-       # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
-       tree_append = (t) ->
-               switch t[0]
-                       when TYPE_TEXT
-                               if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
-                                       open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
-                               else
-                                       open_tags[0][3].push t
-                       when TYPE_OPEN_TAG
-                               t[0] = TYPE_TAG
-                               # convert attributes into a hash
-                               attrs = {}
-                               while t[2].length
-                                       a = t[2].pop()
-                                       attrs[a[0]] = a[1]
-                               t[2] = attrs
-                               # FIXME probs have to auto-close things first
-                               open_tags[0][3].push t
-                               open_tags.unshift t
-                               # TODO implement formatting elements thing
-                       when TYPE_CLOSE_TAG
-                               # FIXME this is just a hack for now
-                               if open_tags.length < 2
-                                       parse_error()
-                                       return
-                               if open_tags[0][1] isnt t[1]
-                                       parse_error()
-                                       # fall through and close something anyway
-                               open_tags.shift()
-                       when TYPE_EOF
-                               return
-                       # TODO implement close tags
-                       # TODO implement self-closing tags
-                       else
-                               console.log "UNIMPLEMENTED tag type: #{t[0]}"
-               return
-
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
        tree = [TYPE_TAG, 'html', {}, []]
        open_tags = [tree]
-       tree_state = tree_append
+       tree_state = tree_in_body
+       flag_frameset_ok = true
+       flag_parsing = true
 
        # tokenizer initialization
        tok_state = tok_state_data
 
        # proccess input
-       loop
+       while flag_parsing
                t = tok_state()
                if t?
                        tree_state t
-                       if t[0] is TYPE_EOF
-                               return tree[3]
-       return # never reached
+       return tree[3]
 
 # everything below is tests on the above
 test_equals = (description, fn, args..., expected_output) ->
@@ -584,3 +731,4 @@ test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=
 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
+test_equals "mis-matched closing tags", html_to_json, "foo<div>bar<span>baz</div>qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'