TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
-TYPE_WHITESPACE = 2
-TYPE_COMMENT = 3
+TYPE_COMMENT = 2
+TYPE_DOCTYPE = 3
# the following types are emited by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_CLOSE_TAG = 5 # name
+TYPE_END_TAG = 5 # name
TYPE_EOF = 6
lc_alpha = "abcdefghijklmnopqrstuvwxqz"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
+scopers = { # FIXME these are supposed to be namespace specific
+ 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
+ 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
+ 'foreignObject', 'desc', 'title'
+}
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
# foreign_elements = [svg_elements..., mathml_elements...]
#normal_elements = All other allowed HTML elements are normal elements.
+special_elements = {
+ # from HTML:
+ address: true, applet: true, area: true, article: true, aside: true,
+ base: true, basefont: true, bgsound: true, blockquote: true, body: true,
+ br: true, button: true, caption: true, center: true, col: true,
+ colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
+ dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
+ footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
+ h3: true, h4: true, h5: true, h6: true, head: true, header: true,
+ hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
+ isindex: true, li: true, link: true, listing: true, main: true,
+ marquee: true, meta: true, nav: true, noembed: true, noframes: true,
+ noscript: true, object: true, ol: true, p: true, param: true,
+ plaintext: true, pre: true, script: true, section: true, select: true,
+ source: true, style: true, summary: true, table: true, tbody: true,
+ td: true, template: true, textarea: true, tfoot: true, th: true,
+ thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
+ xmp: true,
+
+ # from MathML:
+ mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
+
+ # from SVG:
+ foreignObject: true, desc: true, title: true
+}
+
+formatting_elements = {
+ a: true, b: true, big: true, code: true, em: true, font: true, i: true,
+ nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
+ u: true
+}
+
# decode_named_char_ref()
#
tree_state = null
tok_state = null
tok_cur_tag = null # partially parsed tag
+ flag_frameset_ok = null
+ flag_parsing = null
parse_error = ->
console.log "Parse error at character #{cur} of #{txt.length}"
+
+ # the functions below impliment the Tree Contstruction algorithm
+ # http://www.w3.org/TR/html5/syntax.html#tree-construction
+
+ # But first... the helpers
+ template_tag_is_open = ->
+ for t of open_tags
+ if t[0] is TYPE_TAG and t[1] is 'template'
+ return true
+ return false
+ is_in_scope = (tag_name) ->
+ for t of open_tags
+ if t[0] is TYPE_TAG and t[1] is tag_name
+ return true
+ # FIXME bail if in scopers
+ return false
+
+ reconstruct_active_formatting_elements = ->
+ # FIXME implement this
+
+ # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
+ # FIXME implement this
+ close_p_if_in_button_scope = ->
+ if open_tags[0][1] is 'p' # FIXME
+ open_tags.pop()
+ return
+ #p = find_button_scope 'p'
+ #if p?
+ # TODO generate_implied_end_tags except for p tags
+ # TODO parse_error unless open_tags[0][1] is 'p'
+ # TODO pop stack until 'p' popped
+
+
+
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-character
+ tree_insert_a_character = (t) ->
+ # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+ if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+ open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+ else
+ open_tags[0][3].push t
+
+ # FIXME read spec, do this right
+ # note: this assumes it's an open tag
+ tree_insert_tag = (t) ->
+ t[0] = TYPE_TAG # not TYPE_OPEN_TAG
+ # convert attributes into a hash
+ attrs = {}
+ while t[2].length
+ a = t[2].pop()
+ attrs[a[0]] = a[1]
+ t[2] = attrs
+ open_tags[0][3].push t
+ open_tags.unshift t
+
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
+ tree_insert_a_comment = (t) ->
+ # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+ open_tags[0][3].push t
+
+ # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+ tree_in_body = (t) ->
+ switch t[0]
+ when TYPE_TEXT
+ switch t[1]
+ when "\u0000"
+ parse_error()
+ when "\t", "\u000a", "\u000c", "\u000d", ' '
+ reconstruct_active_formatting_elements()
+ tree_insert_a_character t
+ else
+ reconstruct_active_formatting_elements()
+ tree_insert_a_character t
+ flag_frameset_ok = false
+ when TYPE_COMMENT
+ tree_insert_a_comment t
+ when TYPE_DOCTYPE
+ parse_error()
+ when TYPE_OPEN_TAG
+ switch t[1]
+ when 'html'
+ parse_error()
+ return if template_tag_is_open()
+ root_attrs = open_tags[open_tags.length - 1][3]
+ for k, v of t[2]
+ root_attrs[k] = v unless root_attrs[k]?
+ when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
+ # FIXME also do this for </template> (end tag)
+ return tree_in_head t
+ when 'body'
+ parse_error()
+ # TODO
+ when 'frameset'
+ parse_error()
+ # TODO
+ when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
+ close_p_if_in_button_scope()
+ tree_insert_tag t
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+ close_p_if_in_button_scope()
+ if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+ parse_error()
+ open_tags.shift()
+ tree_insert_tag t
+ # TODO lots more to implement here
+ else # any other start tag
+ reconstruct_active_formatting_elements()
+ tree_insert_tag t
+ when TYPE_EOF
+ ok_tags = {
+ dd: true, dt: true, li: true, p: true, tbody: true, td: true,
+ tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
+ }
+ for t in open_tags
+ unless ok_tags[t[1]]?
+ parse_error()
+ break
+ # TODO stack of template insertion modes thing
+ flag_parsing = false # stop parsing
+ when TYPE_END_TAG
+ switch t[1]
+ when 'body'
+ unless is_in_scope 'body'
+ parse_error()
+ return
+ # TODO implement parse error and move to tree_after_body
+ when 'html'
+ unless is_in_scope 'body' # weird, but it's what the spec says
+ parse_error()
+ return
+ # TODO implement parse error and move to tree_after_body, reprocess
+ # TODO lots more close tags to implement here
+ else
+ for node, i in open_tags
+ if node[1] is t[1]
+ # FIXME generate implied end tags except those with name==t[1]
+ parse_error() unless i is 0
+ while i > 0
+ open_tags.shift()
+ i -= 1
+ open_tags.shift()
+ return
+ if special_elements[node[1]]?
+ parse_error()
+ return
+
+
# the functions below implement the tokenizer stats described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
return [TYPE_TEXT, '</']
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
+ tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
tok_state = tok_state_tag_name
else if lc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+ tok_cur_tag = [TYPE_END_TAG, c, [], []]
tok_state = tok_state_tag_name
else
parse_error()
return '&'
return # never reached
- # the functions below impliment the Tree Contstruction algorithm here:
- # http://www.w3.org/TR/html5/syntax.html#tree-construction
- # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
- tree_append = (t) ->
- switch t[0]
- when TYPE_TEXT
- if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
- open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
- else
- open_tags[0][3].push t
- when TYPE_OPEN_TAG
- t[0] = TYPE_TAG
- # convert attributes into a hash
- attrs = {}
- while t[2].length
- a = t[2].pop()
- attrs[a[0]] = a[1]
- t[2] = attrs
- # FIXME probs have to auto-close things first
- open_tags[0][3].push t
- open_tags.unshift t
- # TODO implement formatting elements thing
- when TYPE_CLOSE_TAG
- # FIXME this is just a hack for now
- if open_tags.length < 2
- parse_error()
- return
- if open_tags[0][1] isnt t[1]
- parse_error()
- # fall through and close something anyway
- open_tags.shift()
- when TYPE_EOF
- return
- # TODO implement close tags
- # TODO implement self-closing tags
- else
- console.log "UNIMPLEMENTED tag type: #{t[0]}"
- return
-
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
tree = [TYPE_TAG, 'html', {}, []]
open_tags = [tree]
- tree_state = tree_append
+ tree_state = tree_in_body
+ flag_frameset_ok = true
+ flag_parsing = true
# tokenizer initialization
tok_state = tok_state_data
# proccess input
- loop
+ while flag_parsing
t = tok_state()
if t?
tree_state t
- if t[0] is TYPE_EOF
- return tree[3]
- return # never reached
+ return tree[3]
# everything below is tests on the above
test_equals = (description, fn, args..., expected_output) ->
test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
+test_equals "mis-matched closing tags", html_to_json, "foo<div>bar<span>baz</div>qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'