# along with this program. If not, see <http://www.gnu.org/licenses/>.
-# This file implements a parser for html snippets, meant to be used by a
+# This file implements a thorough parser for html5, meant to be used by a
# WYSIWYG editor.
# The implementation is a pretty direct implementation of the parsing algorithm
# described here:
-# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
#
-# Deviations from that spec:
+# http://www.w3.org/TR/html5/syntax.html
#
-# Purposeful: search this file for "WHATWG"
+# except for some places marked "WHATWG" that are implemented as described here:
#
-# Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
+# https://html.spec.whatwg.org/multipage/syntax.html
+#
+# This code passes all of the tests in the .dat files at:
+#
+# https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
+
+
+##################################
+## how to use this code
+##################################
+#
+# See README.md for how to pre-compile this file, or compile it in the browser.
+#
+# This file exports a single useful function: parse_html
+#
+# Once you include this file in a page (see index.html for an example) you'll
+# have window.wheic
+#
+# Call it like this:
+#
+# wheic.parse_html({html: "<p><b>hi</p>"})
+#
+# Or, if you don't want <html><head><body>/etc, do this:
+#
+# wheic.parse_html({fragment: "body", html: "<p><b>hi</p>"})
+#
+# This code can _almost_ run outside the browser (eg under node.js). To get it
+# to run without the browser would require native implementation of
+# decode_named_char_ref(). The current implementation of that function uses the
+# browser's DOM api, to save space (the list of valid named characters is
+# massive.)
+# This code is a work in progress; e.g. try searching this file for "fixfull",
+# "TODO" and "FIXME".
-# stacks/lists
+
+# Notes: stacks/lists
#
-# the spec uses a many different words do indicate which ends of lists/stacks
-# they are talking about (and relative movement within the lists/stacks). This
-# section splains. I'm implementing "lists" (afe and open_els) the same way
-# (both as stacks)
+# Jason was frequently confused by the terminology used to refer to different
+# parts of the stacks and lists in the spec, so he made this chart to help keep
+# his head straight:
#
# stacks grow downward (current element is index=0)
#
# 1: b
# 0: a "end of the list", "current node", "bottommost", "last"
-
-# browser
-# note: to get this to run outside a browser, you'll have to write a native
-# implementation of decode_named_char_ref()
unless module?.exports?
window.wheic = {}
module = exports: window.wheic
NS_MATHML = 2
NS_SVG = 3
+# quirks mode constants
+QUIRKS_NO = 1
+QUIRKS_LIMITED = 2
+QUIRKS_YES = 3
+
+# queue up debug logs, so eg they can be shown only for tests that fail
g_debug_log = []
debug_log_reset = -> # throw away every queued debug message
g_debug_log = []
+ return
debug_log = (str) -> # append str to the queue of debug messages
g_debug_log.push str
+ return
debug_log_each = (cb) -> # call cb once per queued message, in queue order
for str in g_debug_log
cb str
+ return
prev_node_id = 0
class Node
@token.flag 'did_self_close', true
else
@flag 'did_self_close', true
+ return
flag: (key, value = null) ->
if value?
@flags[key] = value
else
return @flags[key]
- serialize: (shallow = false, show_ids = false) -> # for unit tests
- ret = ''
- switch @type
- when TYPE_TAG
- ret += 'tag:'
- ret += JSON.stringify @name
- ret += ','
- if show_ids
- ret += "##{@id},"
- if shallow
- break
- attr_keys = []
- for k of @attrs
- attr_keys.push k
- attr_keys.sort()
- ret += '{'
- sep = ''
- for k in attr_keys
- ret += sep
- sep = ','
- ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
- ret += '},['
- sep = ''
- for c in @children
- ret += sep
- sep = ','
- ret += c.serialize shallow, show_ids
- ret += ']'
- when TYPE_TEXT
- ret += 'text:'
- ret += JSON.stringify @text
- when TYPE_COMMENT
- ret += 'comment:'
- ret += JSON.stringify @text
- when TYPE_DOCTYPE
- ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
- when TYPE_AFE_MARKER
- ret += 'marker'
- when TYPE_AAA_BOOKMARK
- ret += 'aaa_bookmark'
- else
- ret += 'unknown:'
- console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
- return ret
+ return
# helpers: (only take args that are normally known when parser creates nodes)
new_open_tag = (name) ->
unicode_fixes[0x9E] = "\u017E"
unicode_fixes[0x9F] = "\u0178"
+quirks_yes_pi_prefixes = [ # lowercase public-identifier prefixes; is_quirks_yes_doctype returns true for a doctype whose lowercased public identifier starts with any of them
+ "+//silmaril//dtd html pro v0r11 19970101//"
+ "-//as//dtd html 3.0 aswedit + extensions//"
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
+ "-//ietf//dtd html 2.0 level 1//"
+ "-//ietf//dtd html 2.0 level 2//"
+ "-//ietf//dtd html 2.0 strict level 1//"
+ "-//ietf//dtd html 2.0 strict level 2//"
+ "-//ietf//dtd html 2.0 strict//"
+ "-//ietf//dtd html 2.0//"
+ "-//ietf//dtd html 2.1e//"
+ "-//ietf//dtd html 3.0//"
+ "-//ietf//dtd html 3.2 final//"
+ "-//ietf//dtd html 3.2//"
+ "-//ietf//dtd html 3//"
+ "-//ietf//dtd html level 0//"
+ "-//ietf//dtd html level 1//"
+ "-//ietf//dtd html level 2//"
+ "-//ietf//dtd html level 3//"
+ "-//ietf//dtd html strict level 0//"
+ "-//ietf//dtd html strict level 1//"
+ "-//ietf//dtd html strict level 2//"
+ "-//ietf//dtd html strict level 3//"
+ "-//ietf//dtd html strict//"
+ "-//ietf//dtd html//"
+ "-//metrius//dtd metrius presentational//"
+ "-//microsoft//dtd internet explorer 2.0 html strict//"
+ "-//microsoft//dtd internet explorer 2.0 html//"
+ "-//microsoft//dtd internet explorer 2.0 tables//"
+ "-//microsoft//dtd internet explorer 3.0 html strict//"
+ "-//microsoft//dtd internet explorer 3.0 html//"
+ "-//microsoft//dtd internet explorer 3.0 tables//"
+ "-//netscape comm. corp.//dtd html//"
+ "-//netscape comm. corp.//dtd strict html//"
+ "-//o'reilly and associates//dtd html 2.0//"
+ "-//o'reilly and associates//dtd html extended 1.0//"
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//"
+ "-//sq//dtd html 2.0 hotmetal + extensions//"
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
+ "-//spyglass//dtd html 2.0 extended//"
+ "-//sun microsystems corp.//dtd hotjava html//"
+ "-//sun microsystems corp.//dtd hotjava strict html//"
+ "-//w3c//dtd html 3 1995-03-24//"
+ "-//w3c//dtd html 3.2 draft//"
+ "-//w3c//dtd html 3.2 final//"
+ "-//w3c//dtd html 3.2//"
+ "-//w3c//dtd html 3.2s draft//"
+ "-//w3c//dtd html 4.0 frameset//"
+ "-//w3c//dtd html 4.0 transitional//"
+ "-//w3c//dtd html experimental 19960712//"
+ "-//w3c//dtd html experimental 970421//"
+ "-//w3c//dtd w3 html//"
+ "-//w3o//dtd w3 html 3.0//"
+ "-//webtechs//dtd mozilla html 2.0//"
+ "-//webtechs//dtd mozilla html//"
+]
+
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
legacy_char_refs = {
stop_parsing = ->
flag_parsing = false
+ return
parse_error = ->
if args.error_cb?
args.error_cb cur
else
console.log "Parse error at character #{cur} of #{txt.length}"
+ return
+ # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+ # "Noah's Ark clause" but with three
afe_push = (new_el) ->
matches = 0
for el, i in afe
+ if el.type is TYPE_AFE_MARKER
+ break
if el.name is new_el.name and el.namespace is new_el.namespace
+ attrs_match = true
for k, v of el.attrs
- continue unless new_el.attrs[k] is v
- for k, v of new_el.attrs
- continue unless el.attrs[k] is v
- matches += 1
- if matches is 3
- afe.splice i, 1
- break
+ unless new_el.attrs[k] is v
+ attrs_match = false
+ break
+ if attrs_match
+ for k, v of new_el.attrs
+ unless el.attrs[k] is v
+ attrs_match = false
+ break
+ if attrs_match
+ matches += 1
+ if matches is 3
+ afe.splice i, 1
+ break
afe.unshift new_el
+ return
+
afe_push_marker = -> # put a freshly created marker at index 0 of afe
afe.unshift new_afe_marker()
+ return
# the functions below implement the Tree Construction algorithm
# http://www.w3.org/TR/html5/syntax.html#tree-construction
# But first... the helpers
template_tag_is_open = ->
- for t in open_els
- if t.name is 'template' and t.namespace is NS_HTML
+ for el in open_els
+ if el.name is 'template' and el.namespace is NS_HTML
return true
return false
is_in_scope_x = (tag_name, scope, namespace) ->
- for t in open_els
- if t.name is tag_name and (namespace is null or namespace is t.namespace)
+ for el in open_els
+ if el.name is tag_name and (namespace is null or namespace is el.namespace)
return true
- if scope[t.name] is t.namespace
+ if scope[el.name] is el.namespace
return false
return false
is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
- for t in open_els
- if t.name is tag_name and (namespace is null or namespace is t.namespace)
+ for el in open_els
+ if el.name is tag_name and (namespace is null or namespace is el.namespace)
return true
- if scope[t.name] is t.namespace
+ if scope[el.name] is el.namespace
return false
- if scope2[t.name] is t.namespace
+ if scope2[el.name] is el.namespace
return false
return false
standard_scopers = {
loop
if node_i is open_els.length - 1
last = true
- # fixfull (fragment case)
-
+ if flag_fragment_parsing
+ node = context_element
# 4. If node is a select element, run these substeps:
if node.name is 'select' and node.namespace is NS_HTML
# 1. If last is true, jump to the step below labeled done.
node_i += 1
node = open_els[node_i]
# 19. Return to the step labeled loop.
+ return
# 8.2.3.2
afe[i] = el
break if i is 0
i -= 1 # Advance
+ return
# http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
# adoption agency algorithm
# http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
# http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
adoption_agency = (subject) ->
- debug_log "adoption_agency()"
- debug_log "tree: #{serialize_els doc.children, false, true}"
- debug_log "open_els: #{serialize_els open_els, true, true}"
- debug_log "afe: #{serialize_els afe, true, true}"
# this block implements the W3C spec
# # 1. If the current node is an HTML element whose tag name is subject,
# # then run these substeps:
# if t is el
# afe.splice i, 1
# break
-# debug_log "aaa: starting off with subject on top of stack, exiting"
# return
# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
# If the current node is an HTML element whose tag name is subject, and
# then pop the current node off the stack of open elements, and abort
# these steps.
if open_els[0].name is subject and open_els[0].namespace is NS_HTML
- debug_log "aaa: starting off with subject on top of stack, exiting"
# remove it from the list of active formatting elements (if found)
in_afe = false
for el, i in afe
in_afe = true
break
unless in_afe
- debug_log "aaa: ...and not in afe, aaa done"
open_els.shift()
return
# fall through
# If there is no such element, then abort these steps and instead
# act as described in the "any other end tag" entry above.
if fe is null
- debug_log "aaa: fe not found in afe"
in_body_any_other_end_tag subject
return
# 6. If formatting element is not in the stack of open elements,
in_open_els = true
break
unless in_open_els
- debug_log "aaa: fe not found in open_els"
parse_error()
# "remove it from the list" must mean afe, since it's not in open_els
afe.splice fe_of_afe, 1
# the element is not in scope, then this is a parse error; abort
# these steps.
unless el_is_in_scope fe
- debug_log "aaa: fe not in scope"
parse_error()
return
# 8. If formatting element is not the current node, this is a parse
# formatting element from the list of active formatting elements,
# and finally abort these steps.
if fb is null
- debug_log "aaa: no fb"
loop
t = open_els.shift()
if t is fe
node_next = open_els[i + 1]
break
node = node_next ? node_above
- debug_log "inner loop #{inner}"
- debug_log "tree: #{serialize_els doc.children, false, true}"
- debug_log "open_els: #{serialize_els open_els, true, true}"
- debug_log "afe: #{serialize_els afe, true, true}"
- debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
- debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
- debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
- debug_log "node: #{node.serialize true, true}"
# TODO make sure node_above gets re-set if/when node is removed from open_els
# 4. If node is formatting element, then go to the next step in
# the overall algorithm.
if node is fe
break
- debug_log "the meat"
# 5. If inner loop counter is greater than three and node is in
# the list of active formatting elements, then remove node from
# the list of active formatting elements.
if t is node
if inner > 3
afe.splice i, 1
- debug_log "max out inner"
else
node_in_afe = true
- debug_log "in afe"
break
# 6. If node is not in the list of active formatting elements,
# then remove node from the stack of open elements and then go
# back to the step labeled inner loop.
unless node_in_afe
- debug_log "not in afe"
for t, i in open_els
if t is node
node_above = open_els[i + 1]
open_els.splice i, 1
break
continue
- debug_log "the bones"
# 7. create an element for the token for which the element node
# was created, in the HTML namespace, with common ancestor as
# the intended parent; replace the entry for node in the list
for t, i in afe
if t is node
afe[i] = new_node
- debug_log "replaced in afe"
break
for t, i in open_els
if t is node
node_above = open_els[i + 1]
open_els[i] = new_node
- debug_log "replaced in open_els"
break
node = new_node
# 8. If last node is furthest block, then move the
for t, i in afe
if t is bookmark
afe.splice i, 1
- debug_log "removed bookmark"
break
for t, i in afe
if t is node
# "after" means lower
afe.splice i, 0, bookmark # "after as <-
- debug_log "placed bookmark after node"
- debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
break
# 9. Insert last node into node, first removing it from its
# previous parent node if any.
if last_node.parent?
- debug_log "last_node has parent"
for c, i in last_node.parent.children
if c is last_node
- debug_log "removing last_node from parent"
last_node.parent.children.splice i, 1
break
node.children.push last_node
last_node.parent = node
# 10. Let last node be node.
last_node = node
- debug_log "at last"
# 11. Return to the step labeled inner loop.
# 14. Insert whatever last node ended up being in the previous step
# at the appropriate place for inserting a node, but using common
# * last_node is fb
# * last_node is still in the tree (not a duplicate)
if last_node.parent?
- debug_log "FEFIRST? last_node has parent"
for c, i in last_node.parent.children
if c is last_node
- debug_log "removing last_node from parent"
last_node.parent.children.splice i, 1
break
-
- debug_log "after aaa inner loop"
- debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
- debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
- debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
- debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
- debug_log "tree: #{serialize_els doc.children, false, true}"
-
- debug_log "insert"
-
-
# can't use standard insert token thing, because it's already in
# open_els and must stay at its current position in open_els
dest = adjusted_insertion_location ca
dest[0].children.splice dest[1], 0, last_node
last_node.parent = dest[0]
-
-
- debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
- debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
- debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
- debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
- debug_log "tree: #{serialize_els doc.children, false, true}"
-
# 15. Create an element for the token for which formatting element
# was created, in the HTML namespace, with furthest block as the
# intended parent.
open_els.splice i, 0, new_element
break
# 20. Jump back to the step labeled outer loop.
- debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
- debug_log "tree: #{serialize_els doc.children, false, true}"
- debug_log "open_els: #{serialize_els open_els, true, true}"
- debug_log "afe: #{serialize_els afe, true, true}"
- debug_log "AAA DONE"
+ return
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
close_p_element = ->
el = open_els.shift()
if el.name is 'p' and el.namespace is NS_HTML
return
+ return
close_p_if_in_button_scope = -> # calls close_p_element() only when is_in_button_scope reports an HTML 'p'
if is_in_button_scope 'p', NS_HTML
close_p_element()
+ return
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
# aka insert_a_character = (t) ->
prev.text += t.text
return
dest[0].children.splice dest[1], 0, t
-
+ return
# 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
process_token = (t) ->
return el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
insert_html_element = (token) ->
- insert_foreign_element token, NS_HTML
+ return insert_foreign_element token, NS_HTML
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# position should be [node, index_within_children]
insert_comment = (t, position = null) -> # splice node t into position[0].children at index position[1]
position ?= adjusted_insertion_location() # no position given: ask adjusted_insertion_location() for one
position[0].children.splice position[1], 0, t
+ return
# 8.2.5.2
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
tok_state = tok_state_rawtext
original_ins_mode = ins_mode
ins_mode = ins_mode_text
+ return
parse_generic_rcdata_text = (t) -> # insert an HTML element for t, then switch tokenizer to tok_state_rcdata and insertion mode to ins_mode_text
insert_html_element t
tok_state = tok_state_rcdata
original_ins_mode = ins_mode # saved before the switch; other code assigns it back to ins_mode
ins_mode = ins_mode_text
+ return
# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
generate_implied_end_tags = (except = null) -> # pop open_els[0] repeatedly while it is listed in end_tag_implied
while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except # table lookup by name must equal the node's namespace; stops at an element named `except`
open_els.shift()
+ return
# 8.2.5.4 The rules for parsing tokens in HTML content
# http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
# 8.2.5.4.1 The "initial" insertion mode
# http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
+ is_quirks_yes_doctype = (t) -> # true when doctype token t matches any check below; ins_mode_initial then sets 'quirks mode' to QUIRKS_YES
+ if t.flag 'force-quirks'
+ return true
+ if t.name isnt 'html'
+ return true
+ if t.public_identifier?
+ pi = t.public_identifier.toLowerCase() # every literal compared below is lowercase
+ for p in quirks_yes_pi_prefixes
+ if pi.substr(0, p.length) is p # prefix match against the table
+ return true
+ if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html' # whole-string matches
+ return true
+ if t.system_identifier?
+ if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
+ return true
+ else if t.public_identifier?
+ # already did this: pi = t.public_identifier.toLowerCase()
+ if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+ return true
+ return false
+ is_quirks_limited_doctype = (t) -> # true when doctype token t matches a check below; ins_mode_initial then sets 'quirks mode' to QUIRKS_LIMITED
+ if t.public_identifier?
+ pi = t.public_identifier.toLowerCase() # literals below are lowercase
+ if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
+ return true
+ if t.system_identifier? # NOTE(review): pi is only assigned when public_identifier is set; confirm this test is nested under that check
+ if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+ return true
+ return false
ins_mode_initial = (t) ->
if is_space_tok t
return
doc.children.push t
return
if t.type is TYPE_DOCTYPE
- # FIXME check identifiers, set quirks, etc
- # fixfull
+ # fixfull syntax error from first paragraph and following bullets
+ # fixfull set doc.doctype
+ # fixfull is the "not an iframe srcdoc" thing relevant?
+ if is_quirks_yes_doctype t
+ doc.flag 'quirks mode', QUIRKS_YES
+ else if is_quirks_limited_doctype t
+ doc.flag 'quirks mode', QUIRKS_LIMITED
doc.children.push t
ins_mode = ins_mode_before_html
return
# Anything else
- #fixfull (iframe, quirks)
+ # fixfull not iframe srcdoc?
+ parse_error()
+ doc.flag 'quirks mode', QUIRKS_YES
ins_mode = ins_mode_before_html
process_token t
return
if t.type is TYPE_START_TAG and t.name is 'html'
el = token_to_element t, NS_HTML, doc
doc.children.push el
+ el.document = doc
open_els.unshift(el)
# fixfull (big paragraph in spec about manifest, fragment, urls, etc)
ins_mode = ins_mode_before_head
parse_error()
return
# Anything else
- html_tok = new_open_tag 'html'
- el = token_to_element html_tok, NS_HTML, doc
+ el = token_to_element new_open_tag('html'), NS_HTML, doc
doc.children.push el
+ el.document = doc
open_els.unshift el
# ?fixfull browsing context
ins_mode = ins_mode_before_head
parse_error()
return
# Anything else
- head_tok = new_open_tag 'head'
- el = insert_html_element head_tok
+ el = insert_html_element new_open_tag 'head'
head_element_pointer = el
ins_mode = ins_mode_in_head
process_token t
+ return
# 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
open_els.shift() # spec says this will be a 'head' node
ins_mode = ins_mode_after_head
process_token t # hand the same token to process_token now that ins_mode has changed
+ return
ins_mode_in_head = (t) ->
if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
insert_character t
parse_error()
return
ins_mode_in_head_else t
+ return
# 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
ins_mode_in_head_noscript_else = (t) -> # "anything else" path: pop the current node, go back to ins_mode_in_head
open_els.shift()
ins_mode = ins_mode_in_head
process_token t # hand the same token to process_token now that ins_mode has changed
+ return
ins_mode_in_head_noscript = (t) ->
if t.type is TYPE_DOCTYPE
parse_error()
ins_mode_in_head_noscript_else t
return
-
-
# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
ins_mode_after_head_else = (t) ->
body_tok = new_open_tag 'body'
if el is head_element_pointer
open_els.splice i, 1
return
- console.log "warning: 23904 couldn't find head element in open_els"
return
if t.type is TYPE_END_TAG and t.name is 'template'
ins_mode_in_head t
return
# Anything else
ins_mode_after_head_else t
+ return
# 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
- for el, i in open_els
- if el.name is name and el.namespace is NS_HTML
+ node = open_els[0]
+ loop
+ if node.name is name and node.namespace is NS_HTML
generate_implied_end_tags name # arg is exception
- parse_error() unless i is 0
- while i >= 0
- open_els.shift()
- i -= 1
- return
- if special_elements[el.name] is el.namespace
+ unless node is open_els[0]
+ parse_error()
+ loop
+ el = open_els.shift()
+ if el is node
+ return
+ if special_elements[node.name] is node.namespace
parse_error()
return
+ for el, i in open_els
+ if node is el
+ node = open_els[i + 1]
+ break
return
ins_mode_in_body = (t) ->
if t.type is TYPE_TEXT and t.text is "\u0000"
if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
close_p_if_in_button_scope()
insert_html_element t
- # spec: If the next token is a "LF" (U+000A) character token, then
- # ignore that token and move on to the next one. (Newlines at the
- # start of pre blocks are ignored as an authoring convenience.)
- if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
- cur += 1
+ eat_next_token_if_newline()
flag_frameset_ok = false
return
if t.type is TYPE_START_TAG and t.name is 'form'
return
if t.type is TYPE_START_TAG and t.name is 'nobr'
reconstruct_afe()
+ if is_in_scope 'nobr', NS_HTML
+ parse_error()
+ adoption_agency 'nobr'
+ reconstruct_afe()
el = insert_html_element t
afe_push el
return
clear_afe_to_marker()
return
if t.type is TYPE_START_TAG and t.name is 'table'
- close_p_if_in_button_scope() # fixfull quirksmode thing
+ unless doc.flag('quirks mode') is QUIRKS_YES
+ close_p_if_in_button_scope() # test
insert_html_element t
flag_frameset_ok = false
ins_mode = ins_mode_in_table
return
if t.type is TYPE_END_TAG and t.name is 'br'
parse_error()
- t.type is TYPE_START_TAG
+ # W3C: t.type = TYPE_START_TAG
+ t = new_open_tag 'br' # WHATWG
# fall through
if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
reconstruct_afe()
unless is_input_hidden_tok t
flag_frameset_ok = false
return
- if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+ if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+ # WHATWG adds 'menuitem' for this block
insert_html_element t
open_els.shift()
t.acknowledge_self_closing()
return
if t.type is TYPE_START_TAG and t.name is 'textarea'
insert_html_element t
- if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
- cur += 1
+ eat_next_token_if_newline()
tok_state = tok_state_rcdata
original_ins_mode = ins_mode
flag_frameset_ok = false
open_els.shift()
ins_mode = original_ins_mode
return
- console.log 'warning: end of ins_mode_text reached'
+ return
# the functions below implement the tokenizer states described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
ins_mode_in_body t
else
ins_mode_in_table_else t
+ return
# 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
pending_table_character_tokens = []
ins_mode = original_ins_mode
process_token t
+ return
# 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
ins_mode_in_caption = (t) ->
return
# Anything else
ins_mode_in_body t
+ return
# 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
ins_mode_in_column_group = (t) ->
return
# Anything else
ins_mode_in_table t
+ return
# 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
ins_mode_in_row = (t) ->
return
# Anything else
ins_mode_in_table t
+ return
# http://www.w3.org/TR/html5/syntax.html#close-the-cell
close_the_cell = ->
break
clear_afe_to_marker()
ins_mode = ins_mode_in_row
+ return
# 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
ins_mode_in_cell = (t) ->
return
# Anything Else
ins_mode_in_body t
+ return
# 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
ins_mode_in_select = (t) ->
insert_html_element t
return
if t.type is TYPE_END_TAG and t.name is 'optgroup'
- if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+ if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
open_els.shift()
if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
return
if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
parse_error()
- if is_in_select_scope 'select', NS_HTML
+ unless is_in_select_scope 'select', NS_HTML
return
loop
el = open_els.shift()
template_ins_modes.shift()
reset_ins_mode()
process_token t
+ return
# 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
ins_mode_after_body = (t) ->
parse_error()
ins_mode = ins_mode_in_body
process_token t
+ return
# 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
ins_mode_in_frameset = (t) ->
if node.namespace is NS_HTML
break
ins_mode t # explicitly call HTML insertion mode
+ return
# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
- return new_text_node "\ufffd"
+ return new_text_node c
when '' # EOF
return new_eof_token()
else
# http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
is_appropriate_end_tag = (t) ->
- # spec says to check against "the tag name of the last start tag to
- # have been emitted from this tokenizer", but this is only called from
- # the various "raw" states, so it's hopefully ok to assume that
- # open_els[0].name will work instead TODO: verify this after the script
- # data states are implemented
- debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+ # fixfull: this assumes that open_els[0].name is "the tag name of the last
+ # start tag to have been emitted from this tokenizer"
return t.type is TYPE_END_TAG and t.name is open_els[0].name
# 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
return
if c is '>'
tok_state = tok_state_data
- return
+ return tok_cur_tag
if is_uc_alpha(c)
tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
tok_state = tok_state_attribute_name
# Anything else
tok_cur_tag.attrs_a.unshift [c, '']
tok_state = tok_state_attribute_name
+ return
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state_before_attribute_value = ->
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 3
- return new_character_token val # fixfull split
+ val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
+ if val.length > 0
+ return new_character_token val # fixfull split
+ return null
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
return '&'
return # never reached
+ eat_next_token_if_newline = -> # read one token; keep it consumed if it is a newline, otherwise restore cur
+ old_cur = cur # input position to go back to
+ t = null
+ until t? # keep calling the tokenizer until it yields a non-null token
+ t = tok_state()
+ if t.type is TYPE_TEXT
+ # definition of a newline depends on whether it was a character ref or not
+ if cur - old_cur is 1 # exactly one input character was consumed
+ # not a character reference
+ if t.text is "\u000d" or t.text is "\u000a"
+ return
+ else
+ if t.text is "\u000a"
+ return
+ # not a "newline"
+ cur = old_cur # NOTE(review): only cur is restored; any change tok_state() made to tok_state is kept, so confirm that is intended
+ return
+
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
txt = args.html
cur = 0
- doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+ doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
+ fragment_root = null # fragment parsing algorithm returns children of this
open_els = []
afe = [] # active formatting elements
template_ins_modes = []
temporary_buffer = null
pending_table_character_tokens = []
head_element_pointer = null
- flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
- context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+ flag_fragment_parsing = false
+ context_element = null
prev_node_id = 0 # just for debugging
# tokenizer initialization
tok_state = tok_state_data
- # text pre-processing
- # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
- txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
- txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
- txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+ parse_init = -> # choose a fragment context element (if requested), set up fragment-parsing state, then normalize newlines in txt
+ # fragment parsing (text arg)
+ if args.fragment?
+ # this handles the fragment from the tests in the format described here:
+ # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
+ f = args.fragment
+ ns = NS_HTML
+ if f.substr(0, 5) is 'math ' # a "math " prefix is stripped and selects NS_MATHML
+ f = f.substr 5
+ ns = NS_MATHML
+ else if f.substr(0, 4) is 'svg ' # an "svg " prefix is stripped and selects NS_SVG
+ f = f.substr 4
+ ns = NS_SVG
+ t = new_open_tag f
+ context_element = token_to_element t, ns
+ context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+ context_element.document.flag 'quirks mode', QUIRKS_NO
+ # fragment parsing (Node arg)
+ if args.context?
+ context_element = args.context
+
+ # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+ # fragment parsing algorithm
+ if context_element?
+ flag_fragment_parsing = true
+ doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ # search up the tree from context, to try to find its document,
+ # because this file only puts a "document" property on the root
+ # element.
+ old_doc = null
+ el = context_element
+ loop
+ if el.document?
+ old_doc = el.document
+ break
+ if el.parent
+ el = el.parent
+ else
+ break
+ if old_doc
+ doc.flag 'quirks mode', old_doc.flag 'quirks mode' # copy the quirks setting from the document that was found
+ # set tok_state
+ if context_element.namespace is NS_HTML
+ switch context_element.name
+ when 'title', 'textarea'
+ tok_state = tok_state_rcdata
+ when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
+ tok_state = tok_state_rawtext
+ when 'script'
+ tok_state = tok_state_script_data
+ when 'noscript'
+ if flag_scripting
+ tok_state = tok_state_rawtext
+ when 'plaintext'
+ tok_state = tok_state_plaintext
+ fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ doc.children.push fragment_root
+ fragment_root.document = doc
+ open_els = [fragment_root] # the stack of open elements starts with just the new root
+ if context_element.name is 'template' and context_element.namespace is NS_HTML
+ template_ins_modes.unshift ins_mode_in_template
+ # fixfull create token for context (it should have its original one already)
+ reset_ins_mode()
+ # set form_element pointer... in the foreign doc?!
+ el = context_element
+ loop # walk from the context element up through .parent links, stopping at the first HTML 'form'
+ if el.name is 'form' and el.namespace is NS_HTML
+ form_element_pointer = el
+ break
+ if el.parent
+ el = el.parent
+ else
+ break
+
+ # text pre-processing
+ # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+ txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+ txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+
+ return
- if args.name is "tests18.dat #17"
- console.log "hi"
- # proccess input
# http://www.w3.org/TR/html5/syntax.html#tree-construction
- while flag_parsing
- t = tok_state()
- if t?
- process_token t
- # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
- return doc.children
+ parse_main_loop = -> # repeatedly run the current tokenizer state and pass each token to process_token
+ while flag_parsing # stop_parsing() sets this to false
+ t = tok_state()
+ if t? # skip iterations where the tokenizer state returned nothing
+ process_token t
+ # fixfull parse error if has self-closing flag, but it wasn't acknowledged
+ return
+ parse_init()
+ parse_main_loop()
-serialize_els = (els, shallow, show_ids) ->
- serialized = ''
- sep = ''
- for t in els
- serialized += sep
- sep = ','
- serialized += t.serialize shallow, show_ids
- return serialized
+ if flag_fragment_parsing
+ return fragment_root.children
+ return doc.children
module.exports.parse_html = parse_html
module.exports.debug_log_reset = debug_log_reset
module.exports.NS_HTML = NS_HTML
module.exports.NS_MATHML = NS_MATHML
module.exports.NS_SVG = NS_SVG
+module.exports.QUIRKS_NO = QUIRKS_NO
+module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+module.exports.QUIRKS_YES = QUIRKS_YES