# This file implements a parser for html snippets, meant to be used by a
-# WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
-# or <body> tags, nor does it produce the top level "document" node in the dom
-# tree, nor nodes for html, head or body. Comments containing "fixfull"
-# indicate places where additional code is needed for full HTML document
-# parsing.
+# WYSIWYG editor.
+
+# This is a fairly direct implementation of the parsing algorithm
+# described here:
+# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+#
+# Deviations from that spec:
+#
+# Purposeful: search this file for "WHATWG"
#
-# Instead, the data structure produced by this parser is an array of Nodes.
+# Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# stacks/lists
window.wheic = {}
module = exports: window.wheic
# Convert the numeric Unicode code point `x` into a string.
# Uses the native String.fromCodePoint when the runtime provides it; otherwise
# falls back to String.fromCharCode, passing two code units (a surrogate pair)
# for values above 0xffff.
# NOTE(review): indentation is not preserved in this view, so the nesting of
# the fallback lines under `else` is presumed -- confirm against the file.
+from_code_point = (x) ->
+ if String.fromCodePoint?
+ return String.fromCodePoint x
+ else
+ if x <= 0xffff
+ return String.fromCharCode x
# offset into the range above 0xffff before splitting into two code units
+ x -= 0x10000
# first unit carries the bits above the low 10, second unit the low 10 bits
+ return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
+
# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
NS_MATHML = 2
NS_SVG = 3
+# quirks mode constants: the values stored under the document node's
# 'quirks mode' flag. The document starts out as QUIRKS_NO; the "initial"
# insertion mode switches it to QUIRKS_YES or QUIRKS_LIMITED depending on the
# DOCTYPE token (or to QUIRKS_YES when a non-DOCTYPE token arrives first).
+QUIRKS_NO = 1
+QUIRKS_LIMITED = 2
+QUIRKS_YES = 3
+
g_debug_log = []
debug_log_reset = ->
g_debug_log = []
@id = "#{++prev_node_id}"
acknowledge_self_closing: ->
if @token?
- @token.flag 'did_self_close'
+ @token.flag 'did_self_close', true
else
@flag 'did_self_close', true
flag: (key, value = null) ->
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
# Replacement strings for particular numeric character reference values,
# keyed by the parsed code point. When the numeric character reference code
# finds its code point in this table it reports a parse error and returns the
# mapped string instead of the character for that code point.
# NOTE(review): the 0x80-0x9F entries look like the Windows-1252 remapping
# from the HTML spec's character reference table -- confirm against the spec.
+unicode_fixes = {}
+unicode_fixes[0x00] = "\uFFFD"
+unicode_fixes[0x80] = "\u20AC"
+unicode_fixes[0x82] = "\u201A"
+unicode_fixes[0x83] = "\u0192"
+unicode_fixes[0x84] = "\u201E"
+unicode_fixes[0x85] = "\u2026"
+unicode_fixes[0x86] = "\u2020"
+unicode_fixes[0x87] = "\u2021"
+unicode_fixes[0x88] = "\u02C6"
+unicode_fixes[0x89] = "\u2030"
+unicode_fixes[0x8A] = "\u0160"
+unicode_fixes[0x8B] = "\u2039"
+unicode_fixes[0x8C] = "\u0152"
+unicode_fixes[0x8E] = "\u017D"
+unicode_fixes[0x91] = "\u2018"
+unicode_fixes[0x92] = "\u2019"
+unicode_fixes[0x93] = "\u201C"
+unicode_fixes[0x94] = "\u201D"
+unicode_fixes[0x95] = "\u2022"
+unicode_fixes[0x96] = "\u2013"
+unicode_fixes[0x97] = "\u2014"
+unicode_fixes[0x98] = "\u02DC"
+unicode_fixes[0x99] = "\u2122"
+unicode_fixes[0x9A] = "\u0161"
+unicode_fixes[0x9B] = "\u203A"
+unicode_fixes[0x9C] = "\u0153"
+unicode_fixes[0x9E] = "\u017E"
+unicode_fixes[0x9F] = "\u0178"
+
# Public identifier prefixes that select full quirks mode.
# All entries are lower case: is_quirks_yes_doctype lower-cases a DOCTYPE
# token's public identifier and returns true when it begins with any entry.
+quirks_yes_pi_prefixes = [
+ "+//silmaril//dtd html pro v0r11 19970101//"
+ "-//as//dtd html 3.0 aswedit + extensions//"
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
+ "-//ietf//dtd html 2.0 level 1//"
+ "-//ietf//dtd html 2.0 level 2//"
+ "-//ietf//dtd html 2.0 strict level 1//"
+ "-//ietf//dtd html 2.0 strict level 2//"
+ "-//ietf//dtd html 2.0 strict//"
+ "-//ietf//dtd html 2.0//"
+ "-//ietf//dtd html 2.1e//"
+ "-//ietf//dtd html 3.0//"
+ "-//ietf//dtd html 3.2 final//"
+ "-//ietf//dtd html 3.2//"
+ "-//ietf//dtd html 3//"
+ "-//ietf//dtd html level 0//"
+ "-//ietf//dtd html level 1//"
+ "-//ietf//dtd html level 2//"
+ "-//ietf//dtd html level 3//"
+ "-//ietf//dtd html strict level 0//"
+ "-//ietf//dtd html strict level 1//"
+ "-//ietf//dtd html strict level 2//"
+ "-//ietf//dtd html strict level 3//"
+ "-//ietf//dtd html strict//"
+ "-//ietf//dtd html//"
+ "-//metrius//dtd metrius presentational//"
+ "-//microsoft//dtd internet explorer 2.0 html strict//"
+ "-//microsoft//dtd internet explorer 2.0 html//"
+ "-//microsoft//dtd internet explorer 2.0 tables//"
+ "-//microsoft//dtd internet explorer 3.0 html strict//"
+ "-//microsoft//dtd internet explorer 3.0 html//"
+ "-//microsoft//dtd internet explorer 3.0 tables//"
+ "-//netscape comm. corp.//dtd html//"
+ "-//netscape comm. corp.//dtd strict html//"
+ "-//o'reilly and associates//dtd html 2.0//"
+ "-//o'reilly and associates//dtd html extended 1.0//"
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//"
+ "-//sq//dtd html 2.0 hotmetal + extensions//"
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
+ "-//spyglass//dtd html 2.0 extended//"
+ "-//sun microsystems corp.//dtd hotjava html//"
+ "-//sun microsystems corp.//dtd hotjava strict html//"
+ "-//w3c//dtd html 3 1995-03-24//"
+ "-//w3c//dtd html 3.2 draft//"
+ "-//w3c//dtd html 3.2 final//"
+ "-//w3c//dtd html 3.2//"
+ "-//w3c//dtd html 3.2s draft//"
+ "-//w3c//dtd html 4.0 frameset//"
+ "-//w3c//dtd html 4.0 transitional//"
+ "-//w3c//dtd html experimental 19960712//"
+ "-//w3c//dtd html experimental 970421//"
+ "-//w3c//dtd w3 html//"
+ "-//w3o//dtd w3 html 3.0//"
+ "-//webtechs//dtd mozilla html 2.0//"
+ "-//webtechs//dtd mozilla html//"
+]
+
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
legacy_char_refs = {
h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
- listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
- noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
- ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
- script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
- style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
- template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
- thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
- wbr:NS_HTML, xmp:NS_HTML,
+ listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
+
+ menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
+
+ meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
+ noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
+ plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
+ select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
+ table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
+ textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
+ tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
# MathML:
mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
diffuseconstant: 'diffuseConstant'
edgemode: 'edgeMode'
externalresourcesrequired: 'externalResourcesRequired'
- filterres: 'filterRes'
+ # WHATWG removes this: filterres: 'filterRes'
filterunits: 'filterUnits'
glyphref: 'glyphRef'
gradienttransform: 'gradientTransform'
ychannelselector: 'yChannelSelector'
zoomandpan: 'zoomAndPan'
}
# Attribute rename table used by adjust_foreign_attributes: an attribute whose
# name is a key here has its name replaced by the mapped value. In every entry
# except 'xmlns' (which maps to itself) the ':' in the key becomes a space.
+foreign_attr_fixes = {
+ 'xlink:actuate': 'xlink actuate'
+ 'xlink:arcrole': 'xlink arcrole'
+ 'xlink:href': 'xlink href'
+ 'xlink:role': 'xlink role'
+ 'xlink:show': 'xlink show'
+ 'xlink:title': 'xlink title'
+ 'xlink:type': 'xlink type'
+ 'xml:base': 'xml base'
+ 'xml:lang': 'xml lang'
+ 'xml:space': 'xml space'
+ 'xmlns': 'xmlns'
+ 'xmlns:xlink': 'xmlns xlink'
+}
adjust_mathml_attributes = (t) ->
for a in t.attrs_a
if a[0] is 'definitionurl'
return
# Rename, in place, the attributes of token `t` that appear in
# foreign_attr_fixes. `t.attrs_a` is walked as a list of entries whose item 0
# is the attribute name; only that item is rewritten. Returns nothing.
adjust_foreign_attributes = (t) ->
# fixfull
+ for a in t.attrs_a
# names without an entry in the table are left untouched
+ if foreign_attr_fixes[a[0]]?
+ a[0] = foreign_attr_fixes[a[0]]
return
# decode_named_char_ref()
else
console.log "Parse error at character #{cur} of #{txt.length}"
+ # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+ # "Noah's Ark clause" but with three
afe_push = (new_el) ->
matches = 0
for el, i in afe
+ if el.type is TYPE_AFE_MARKER
+ break
if el.name is new_el.name and el.namespace is new_el.namespace
+ attrs_match = true
for k, v of el.attrs
- continue unless new_el.attrs[k] is v
- for k, v of new_el.attrs
- continue unless el.attrs[k] is v
- matches += 1
- if matches is 3
- afe.splice i, 1
- break
+ unless new_el.attrs[k] is v
+ attrs_match = false
+ break
+ if attrs_match
+ for k, v of new_el.attrs
+ unless el.attrs[k] is v
+ attrs_match = false
+ break
+ if attrs_match
+ matches += 1
+ if matches is 3
+ afe.splice i, 1
+ break
afe.unshift new_el
afe_push_marker = ->
afe.unshift new_afe_marker()
# But first... the helpers
template_tag_is_open = ->
- for t in open_els
- if t.name is 'template' and t.namespace is NS_HTML
+ for el in open_els
+ if el.name is 'template' and el.namespace is NS_HTML
return true
return false
is_in_scope_x = (tag_name, scope, namespace) ->
- for t in open_els
- if t.name is tag_name and (namespace is null or namespace is t.namespace)
+ for el in open_els
+ if el.name is tag_name and (namespace is null or namespace is el.namespace)
return true
- if scope[t.name] is t.namespace
+ if scope[el.name] is el.namespace
return false
return false
is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
- for t in open_els
- if t.name is tag_name and (namespace is null or namespace is t.namespace)
+ for el in open_els
+ if el.name is tag_name and (namespace is null or namespace is el.namespace)
return true
- if scope[t.name] is t.namespace
+ if scope[el.name] is el.namespace
return false
- if scope2[t.name] is t.namespace
+ if scope2[el.name] is el.namespace
return false
return false
standard_scopers = {
applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
- template: NS_HTML, mi: NS_MATHML,
+ template: NS_HTML,
- mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
- 'annotation-xml': NS_MATHML,
+ mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
+ mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
}
for t in open_els
if t.name is tag_name and (namespace is null or namespace is t.namespace)
return true
- if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
+ if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
return false
return false
# this checks for a particular element, not by name
# fixfull (fragment case)
# 4. If node is a select element, run these substeps:
- if node.name is 'select'
+ if node.name is 'select' and node.namespace is NS_HTML
# 1. If last is true, jump to the step below labeled done.
unless last
# 2. Let ancestor be node.
ancestor = open_els[ancestor_i]
# 5. If ancestor is a template node, jump to the step below
# labeled done.
- if ancestor.name is 'template'
+ if ancestor.name is 'template' and ancestor.namespace is NS_HTML
break
# 6. If ancestor is a table node, switch the insertion mode
# to "in select in table" and abort these steps.
- if ancestor.name is 'table'
+ if ancestor.name is 'table' and ancestor.namespace is NS_HTML
ins_mode = ins_mode_in_select_in_table
return
# 7. Jump back to the step labeled loop.
return
# 5. If node is a td or th element and last is false, then switch
# the insertion mode to "in cell" and abort these steps.
- if (node.name is 'td' or node.name is 'th') and last is false
+ if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
ins_mode = ins_mode_in_cell
return
# 6. If node is a tr element, then switch the insertion mode to "in
# row" and abort these steps.
- if node.name is 'tr'
+ if node.name is 'tr' and node.namespace is NS_HTML
ins_mode = ins_mode_in_row
return
# 7. If node is a tbody, thead, or tfoot element, then switch the
# insertion mode to "in table body" and abort these steps.
- if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
+ if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
ins_mode = ins_mode_in_table_body
return
# 8. If node is a caption element, then switch the insertion mode
# to "in caption" and abort these steps.
- if node.name is 'caption'
+ if node.name is 'caption' and node.namespace is NS_HTML
ins_mode = ins_mode_in_caption
return
# 9. If node is a colgroup element, then switch the insertion mode
# to "in column group" and abort these steps.
- if node.name is 'colgroup'
+ if node.name is 'colgroup' and node.namespace is NS_HTML
ins_mode = ins_mode_in_column_group
return
# 10. If node is a table element, then switch the insertion mode to
# "in table" and abort these steps.
- if node.name is 'table'
+ if node.name is 'table' and node.namespace is NS_HTML
ins_mode = ins_mode_in_table
return
# 11. If node is a template element, then switch the insertion mode
# to the current template insertion mode and abort these steps.
- # fixfull (template insertion mode stack)
-
+ if node.name is 'template' and node.namespace is NS_HTML
+ ins_mode = template_ins_modes[0]
+ return
# 12. If node is a head element and last is true, then switch the
# insertion mode to "in body" ("in body"! not "in head"!) and abort
# these steps. (fragment case)
- if node.name is 'head' and last
+ if node.name is 'head' and node.namespace is NS_HTML and last
ins_mode = ins_mode_in_body
return
# 13. If node is a head element and last is false, then switch the
# insertion mode to "in head" and abort these steps.
- if node.name is 'head' and last is false
+ if node.name is 'head' and node.namespace is NS_HTML and last is false
ins_mode = ins_mode_in_head
return
# 14. If node is a body element, then switch the insertion mode to
# "in body" and abort these steps.
- if node.name is 'body'
+ if node.name is 'body' and node.namespace is NS_HTML
ins_mode = ins_mode_in_body
return
# 15. If node is a frameset element, then switch the insertion mode
# to "in frameset" and abort these steps. (fragment case)
- if node.name is 'frameset'
+ if node.name is 'frameset' and node.namespace is NS_HTML
ins_mode = ins_mode_in_frameset
return
# 16. If node is an html element, run these substeps:
- if node.name is 'html'
+ if node.name is 'html' and node.namespace is NS_HTML
# 1. If the head element pointer is null, switch the insertion
# mode to "before head" and abort these steps. (fragment case)
if head_element_pointer is null
debug_log "tree: #{serialize_els doc.children, false, true}"
debug_log "open_els: #{serialize_els open_els, true, true}"
debug_log "afe: #{serialize_els afe, true, true}"
+# this block implements the W3C spec
+# # 1. If the current node is an HTML element whose tag name is subject,
+# # then run these substeps:
+# #
+# # 1. Let element be the current node.
+# #
+# # 2. Pop element off the stack of open elements.
+# #
+# # 3. If element is also in the list of active formatting elements,
+# # remove the element from the list.
+# #
+# # 4. Abort the adoption agency algorithm.
+# if open_els[0].name is subject and open_els[0].namespace is NS_HTML
+# el = open_els.shift()
+# # remove it from the list of active formatting elements (if found)
+# for t, i in afe
+# if t is el
+# afe.splice i, 1
+# break
+# debug_log "aaa: starting off with subject on top of stack, exiting"
+# return
+# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
+ # If the current node is an HTML element whose tag name is subject, and
+ # the current node is not in the list of active formatting elements,
+ # then pop the current node off the stack of open elements, and abort
+ # these steps.
if open_els[0].name is subject and open_els[0].namespace is NS_HTML
- el = open_els[0]
- open_els.shift()
+ debug_log "aaa: starting off with subject on top of stack, exiting"
# remove it from the list of active formatting elements (if found)
- for t, i in afe
- if t is el
- afe.splice i, 1
+ in_afe = false
+ for el, i in afe
+ if el is open_els[0]
+ in_afe = true
break
- debug_log "aaa: starting off with subject on top of stack, exiting"
- return
+ unless in_afe
+ debug_log "aaa: ...and not in afe, aaa done"
+ open_els.shift()
+ return
+ # fall through
+# END WHATWG
outer = 0
loop
if outer >= 8
ins_mode t
return
if is_mathml_text_integration_point(acn)
- if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
+ if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
ins_mode t
return
if t.type is TYPE_TEXT
# 8.2.5.4.1 The "initial" insertion mode
# http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Decide whether DOCTYPE token `t` selects full quirks mode.
# Returns true when any of these holds, false otherwise:
#   - `t.flag 'force-quirks'` is truthy
#   - the token's name is not 'html'
#   - the lower-cased public identifier starts with an entry of
#     quirks_yes_pi_prefixes, or equals one of three literal identifiers
#   - the lower-cased system identifier equals the IBM XHTML transitional URL
#   - in the `else` of the system identifier test, the public identifier
#     starts with the HTML 4.01 Frameset/Transitional prefix
# NOTE(review): indentation is not preserved in this view, so the exact
# nesting is presumed; the final test relies on `pi` having been assigned in
# the earlier `if t.public_identifier?` branch -- confirm against the file.
+ is_quirks_yes_doctype = (t) ->
+ if t.flag 'force-quirks'
+ return true
+ if t.name isnt 'html'
+ return true
+ if t.public_identifier?
+ pi = t.public_identifier.toLowerCase()
+ for p in quirks_yes_pi_prefixes
# prefix match: compare the first p.length characters of the identifier
+ if pi.substr(0, p.length) is p
+ return true
+ if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
+ return true
+ if t.system_identifier?
+ if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
+ return true
+ else if t.public_identifier?
+ # already did this: pi = t.public_identifier.toLowerCase()
+ if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+ return true
+ return false
# Decide whether DOCTYPE token `t` selects limited quirks mode.
# Returns true when the lower-cased public identifier starts with the
# XHTML 1.0 Frameset/Transitional prefix, or when a system identifier is
# present and the public identifier starts with the HTML 4.01
# Frameset/Transitional prefix; false otherwise.
# ins_mode_initial only calls this after is_quirks_yes_doctype returned false.
# NOTE(review): the second test reads `pi`, which is only assigned inside
# `if t.public_identifier?`; presumably it is nested in that branch, but
# indentation is not preserved in this view -- confirm against the file.
+ is_quirks_limited_doctype = (t) ->
+ if t.public_identifier?
+ pi = t.public_identifier.toLowerCase()
+ if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
+ return true
+ if t.system_identifier?
+ if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+ return true
+ return false
ins_mode_initial = (t) ->
if is_space_tok t
return
doc.children.push t
return
if t.type is TYPE_DOCTYPE
- # FIXME check identifiers, set quirks, etc
- # fixfull
+ # fixfull syntax error from first paragraph and following bullets
+ # fixfull set doc.doctype
+ # fixfull is the "not an iframe srcdoc" thing relevant?
+ if is_quirks_yes_doctype t
+ doc.flag 'quirks mode', QUIRKS_YES
+ else if is_quirks_limited_doctype t
+ doc.flag 'quirks mode', QUIRKS_LIMITED
doc.children.push t
ins_mode = ins_mode_before_html
return
# Anything else
- #fixfull (iframe, quirks)
+ # fixfull not iframe srcdoc?
+ parse_error()
+ doc.flag 'quirks mode', QUIRKS_YES
ins_mode = ins_mode_before_html
process_token t
return
parse_error()
return
# Anything else
- html_tok = new_open_tag 'html'
- el = token_to_element html_tok, NS_HTML, doc
+ el = token_to_element new_open_tag('html'), NS_HTML, doc
doc.children.push el
+ el.parent = doc
open_els.unshift el
# ?fixfull browsing context
ins_mode = ins_mode_before_head
parse_error()
return
# Anything else
- head_tok = new_open_tag 'head'
- el = insert_html_element head_tok
+ el = insert_html_element new_open_tag 'head'
head_element_pointer = el
ins_mode = ins_mode_in_head
process_token t
parse_error()
open_els.unshift head_element_pointer
ins_mode_in_head t
- for el, i of open_els
+ for el, i in open_els
if el is head_element_pointer
open_els.splice i, 1
return
# 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
- for el, i in open_els
- if el.name is name and el.namespace is NS_HTML
+ node = open_els[0]
+ loop
+ if node.name is name and node.namespace is NS_HTML
generate_implied_end_tags name # arg is exception
- parse_error() unless i is 0
- while i >= 0
- open_els.shift()
- i -= 1
- return
- if special_elements[el.name] is el.namespace
+ unless node is open_els[0]
+ parse_error()
+ loop
+ el = open_els.shift()
+ if el is node
+ return
+ if special_elements[node.name] is node.namespace
parse_error()
return
+ for el, i in open_els
+ if node is el
+ node = open_els[i + 1]
+ break
return
ins_mode_in_body = (t) ->
if t.type is TYPE_TEXT and t.text is "\u0000"
parse_error()
return if template_tag_is_open()
root_attrs = open_els[open_els.length - 1].attrs
- for a of t.attrs_a
+ for a in t.attrs_a
root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
return
parse_error()
return if open_els.length < 2
second = open_els[open_els.length - 2]
- return unless second.ns is NS_HTML
+ return unless second.namespace is NS_HTML
return unless second.name is 'body'
return if template_tag_is_open()
- frameset_ok_flag = false
- for a of t.attrs_a
+ flag_frameset_ok = false
+ for a in t.attrs_a
second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
return
if t.type is TYPE_START_TAG and t.name is 'frameset'
return if open_els.length < 2
second_i = open_els.length - 2
second = open_els[second_i]
- return unless second.ns is NS_HTML
+ return unless second.namespace is NS_HTML
return unless second.name is 'body'
- flag_frameset_ok = false
+ if flag_frameset_ok is false
+ return
if second.parent?
for el, i in second.parent.children
if el is second
if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
close_p_if_in_button_scope()
insert_html_element t
- # spec: If the next token is a "LF" (U+000A) character token, then
- # ignore that token and move on to the next one. (Newlines at the
- # start of pre blocks are ignored as an authoring convenience.)
- if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
- cur += 1
+ eat_next_token_if_newline()
flag_frameset_ok = false
return
if t.type is TYPE_START_TAG and t.name is 'form'
return
if t.type is TYPE_START_TAG and t.name is 'nobr'
reconstruct_afe()
+ if is_in_scope 'nobr', NS_HTML
+ parse_error()
+ adoption_agency 'nobr'
+ reconstruct_afe()
el = insert_html_element t
afe_push el
return
clear_afe_to_marker()
return
if t.type is TYPE_START_TAG and t.name is 'table'
- close_p_if_in_button_scope() # fixfull quirksmode thing
+ unless doc.flag('quirks mode') is QUIRKS_YES
+ close_p_if_in_button_scope() # test
insert_html_element t
flag_frameset_ok = false
ins_mode = ins_mode_in_table
return
if t.type is TYPE_END_TAG and t.name is 'br'
parse_error()
- t.type is TYPE_START_TAG
+ # W3C: t.type = TYPE_START_TAG
+ t = new_open_tag 'br' # WHATWG
# fall through
if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
reconstruct_afe()
unless is_input_hidden_tok t
flag_frameset_ok = false
return
- if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+ if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+ # WHATWG adds 'menuitem' for this block
insert_html_element t
open_els.shift()
t.acknowledge_self_closing()
return
if t.type is TYPE_START_TAG and t.name is 'textarea'
insert_html_element t
- if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
- cur += 1
+ eat_next_token_if_newline()
tok_state = tok_state_rcdata
original_ins_mode = ins_mode
flag_frameset_ok = false
reconstruct_afe()
insert_html_element t
return
- if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
+# this comment block implements the W3C spec
+# if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
+# if is_in_scope 'ruby', NS_HTML
+# generate_implied_end_tags()
+# unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
+# parse_error()
+# insert_html_element t
+# return
+# if t.type is TYPE_START_TAG and t.name is 'rt'
+# if is_in_scope 'ruby', NS_HTML
+# generate_implied_end_tags 'rtc' # arg is exception
+# unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
+# parse_error()
+# insert_html_element t
+# return
+# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
+ if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
if is_in_scope 'ruby', NS_HTML
generate_implied_end_tags()
unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
parse_error()
insert_html_element t
return
- if t.type is TYPE_START_TAG and t.name is 'rt'
+ if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
if is_in_scope 'ruby', NS_HTML
- generate_implied_end_tags 'rtc' # arg is exception
+ generate_implied_end_tags 'rtc'
unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
parse_error()
insert_html_element t
return
+# end WHATWG chunk
if t.type is TYPE_START_TAG and t.name is 'math'
reconstruct_afe()
adjust_mathml_attributes t
ins_mode_in_table = (t) ->
switch t.type
when TYPE_TEXT
- if t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr'
+ if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
+ pending_table_character_tokens = []
original_ins_mode = ins_mode
ins_mode = ins_mode_in_table_text
process_token t
# 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
if t.type is TYPE_TEXT and t.text is "\u0000"
- # huh? I thought the tokenizer didn't emit these
+ # from javascript?
parse_error()
return
if t.type is TYPE_TEXT
insert_character old
else
for old in pending_table_character_tokens
- ins_mode_table_else old
- pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
+ ins_mode_in_table_else old
+ pending_table_character_tokens = []
ins_mode = original_ins_mode
process_token t
insert_html_element t
return
if t.type is TYPE_END_TAG and t.name is 'optgroup'
- if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+ if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
open_els.shift()
if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
return
if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
parse_error()
- if is_in_select_scope 'select', NS_HTML
+ unless is_in_select_scope 'select', NS_HTML
return
loop
el = open_els.shift()
ins_mode_in_body t
return
if t.type is TYPE_COMMENT
- insert_comment t, [open_els[0], open_els[0].children.length]
+ first = open_els[open_els.length - 1]
+ insert_comment t, [first, first.children.length]
return
if t.type is TYPE_DOCTYPE
parse_error()
ins_mode_in_body t
return
if t.type is TYPE_END_TAG and t.name is 'html'
- # fixfull fragment case
+ if flag_fragment_parsing
+ parse_error()
+ return
ins_mode = ins_mode_after_after_body
return
if t.type is TYPE_EOF
ins_mode_in_body t
return
if t.type is TYPE_END_TAG and t.name is 'html'
- insert_mode = ins_mode_after_after_frameset
+ ins_mode = ins_mode_after_after_frameset
return
if t.type is TYPE_START_TAG and t.name is 'noframes'
ins_mode_in_head t
# Anything else
parse_error()
ins_mode = ins_mode_in_body
+ process_token t
return
# 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
if t.name is 'script'
t.acknowledge_self_closing()
in_foreign_content_end_script()
+ # fixfull
else
open_els.shift()
t.acknowledge_self_closing()
return
loop # is this safe?
open_els.shift()
- cn = open_els[0]
- if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
+ if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
break
process_token t
return
in_foreign_content_end_script()
return
if t.type is TYPE_END_TAG
- if open_els[0].name.toLowerCase() isnt t.name
+ i = 0
+ node = open_els[i]
+ if node.name.toLowerCase() isnt t.name
parse_error()
- for node in open_els
+ loop
if node is open_els[open_els.length - 1]
return
if node.name.toLowerCase() is t.name
el = open_els.shift()
if el is node
return
+ i += 1
+ node = open_els[i]
if node.namespace is NS_HTML
break
ins_mode t # explicitly call HTML insertion mode
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
- return new_text_node c
+ return new_text_node "\ufffd"
when '' # EOF
return new_eof_token()
else
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
- switch c = txt.charAt(cur++)
- when '!'
- tok_state = tok_state_markup_declaration_open
- when '/'
- tok_state = tok_state_end_tag_open
- when '?'
- parse_error()
- tok_cur_tag = new_comment_token '?'
- tok_state = tok_state_bogus_comment
- else
- if is_lc_alpha(c)
- tok_cur_tag = new_open_tag c
- tok_state = tok_state_tag_name
- else if is_uc_alpha(c)
- tok_cur_tag = new_open_tag c.toLowerCase()
- tok_state = tok_state_tag_name
- else
- parse_error()
- tok_state = tok_state_data
- cur -= 1 # we didn't parse/handle the char after <
- return new_text_node '<'
- return null
+ c = txt.charAt(cur++)
+ if c is '!'
+ tok_state = tok_state_markup_declaration_open
+ return
+ if c is '/'
+ tok_state = tok_state_end_tag_open
+ return
+ if is_uc_alpha(c)
+ tok_cur_tag = new_open_tag c.toLowerCase()
+ tok_state = tok_state_tag_name
+ return
+ if is_lc_alpha(c)
+ tok_cur_tag = new_open_tag c
+ tok_state = tok_state_tag_name
+ return
+ if c is '?'
+ parse_error()
+ tok_cur_tag = new_comment_token '?' # FIXME right?
+ tok_state = tok_state_bogus_comment
+ return
+ # Anything else
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # we didn't parse/handle the char after <
+ return new_text_node '<'
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
tok_state_end_tag_open = ->
- switch c = txt.charAt(cur++)
- when '>'
- parse_error()
- tok_state = tok_state_data
- when '' # EOF
- parse_error()
- tok_state = tok_state_data
- return new_text_node '</'
- else
- if is_uc_alpha(c)
- tok_cur_tag = new_end_tag c.toLowerCase()
- tok_state = tok_state_tag_name
- else if is_lc_alpha(c)
- tok_cur_tag = new_end_tag c
- tok_state = tok_state_tag_name
- else
- parse_error()
- tok_cur_tag = new_comment_token '/'
- tok_state = tok_state_bogus_comment
+ c = txt.charAt(cur++)
+ if is_uc_alpha(c)
+ tok_cur_tag = new_end_tag c.toLowerCase()
+ tok_state = tok_state_tag_name
+ return
+ if is_lc_alpha(c)
+ tok_cur_tag = new_end_tag c
+ tok_state = tok_state_tag_name
+ return
+ if c is '>'
+ parse_error()
+ tok_state = tok_state_data
+ return
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ return new_text_node '</'
+ # Anything else
+ parse_error()
+ tok_cur_tag = new_comment_token c
+ tok_state = tok_state_bogus_comment
return null
# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Anything else
tok_state = tok_state_script_data_escaped
cur -= 1 # Reconsume
- return new_character_token c
+ return new_character_token '<'
# 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
tok_state_script_data_escaped_end_tag_open = ->
return
if c is '>'
tok_state = tok_state_data
- return
+ return tok_cur_tag
if is_uc_alpha(c)
tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
tok_state = tok_state_attribute_name
tok_state_self_closing_start_tag = ->
c = txt.charAt(cur++)
if c is '>'
- tok_cur_tag.flag 'self-closing'
+ tok_cur_tag.flag 'self-closing', true
tok_state = tok_state_data
return tok_cur_tag
if c is ''
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 1
- val = val.replace "\u0000", "\ufffd"
+ val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
tok_cur_tag.text += val
tok_state = tok_state_data
return tok_cur_tag
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 3
- val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
- val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
- val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
- return new_character_token val # fixfull split
+ if val.length > 0
+ return new_character_token val # fixfull split
+ return null
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
if cur + 1 >= txt.length
return '&'
if txt.charAt(cur + 1).toLowerCase() is 'x'
- prefix = '#x'
+ base = 16
charset = hex_chars
start = cur + 2
else
charset = digits
start = cur + 1
- prefix = '#'
+ base = 10
i = 0
while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
i += 1
if i is 0
return '&'
+ cur = start + i
if txt.charAt(start + i) is ';'
- i += 1
- # FIXME This is supposed to generate parse errors for some chars
- decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
- if decoded?
- cur = start + i
- return decoded
- return '&'
+ cur += 1
+ else
+ parse_error()
+ code_point = txt.substr(start, i)
+ while code_point.charAt(0) is '0' and code_point.length > 1
+ code_point = code_point.substr 1
+ code_point = parseInt(code_point, base)
+ if unicode_fixes[code_point]?
+ parse_error()
+ return unicode_fixes[code_point]
+ else
+ if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
+ parse_error()
+ return "\ufffd"
+ else
+ if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
+ parse_error()
+ return from_code_point code_point
+ return
else
for i in [0...31]
if alnum.indexOf(txt.charAt(cur + i)) is -1
return '&'
return # never reached
# Look ahead one token and swallow it if it is a newline.
# Calls the current tokenizer state function until it yields a token. If that
# token is a text token holding a newline, returns with `cur` left past it, so
# the token is dropped. Otherwise `cur` is rewound to its value on entry.
# Called from the in-body handling of pre/listing and textarea start tags.
# NOTE(review): only `cur` is restored on the rewind path; nothing here resets
# `tok_state` or other tokenizer state touched by the look-ahead -- confirm
# that is safe for every token the look-ahead can produce.
+ eat_next_token_if_newline = ->
+ old_cur = cur
+ t = null
# state functions may return nothing; keep calling until one returns a token
+ until t?
+ t = tok_state()
+ if t.type is TYPE_TEXT
+ # definition of a newline depends on whether it was a character ref or not
# exactly one input character consumed is taken to mean a literal character
+ if cur - old_cur is 1
+ # not a character reference
+ if t.text is "\u000d" or t.text is "\u000a"
+ return
+ else
+ if t.text is "\u000a"
+ return
+ # not a "newline"
+ cur = old_cur
+ return
+
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
txt = args.html
cur = 0
doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
open_els = []
afe = [] # active formatting elements
template_ins_modes = []
head_element_pointer = null
flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+ prev_node_id = 0 # just for debugging
# tokenizer initialization
tok_state = tok_state_data
- if args.name is "namespace-sensitivity.dat #1"
+ # text pre-processing
+ # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+ txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+ txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+ txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+
+ if args.name is "webkit01.dat #12"
console.log "hi"
# process input
# http://www.w3.org/TR/html5/syntax.html#tree-construction
- while flag_parsing
- t = tok_state()
- if t?
- process_token t
- # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
# Drive the parser: while flag_parsing is set, call the current tokenizer
# state function and pass every token it yields to process_token. State
# functions that return nothing simply cause another iteration.
+ parse_main_loop = ->
+ while flag_parsing
+ t = tok_state()
+ if t?
+ process_token t
+ # fixfull parse error if has self-closing flag, but it wasn't acknowledged
+ parse_main_loop()
return doc.children
serialize_els = (els, shallow, show_ids) ->
module.exports.NS_HTML = NS_HTML
module.exports.NS_MATHML = NS_MATHML
module.exports.NS_SVG = NS_SVG
+module.exports.QUIRKS_NO = QUIRKS_NO
+module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+module.exports.QUIRKS_YES = QUIRKS_YES