# This file implements a parser for html snippets, meant to be used by a
-# WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
-# or <body> tags, nor does it produce the top level "document" node in the dom
-# tree, nor nodes for html, head or body. Comments containing "fixfull"
-# indicate places where additional code is needed for full HTML document
-# parsing.
+# WYSIWYG editor.
+
+# This file is a fairly direct implementation of the parsing algorithm
+# described here:
+# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+#
+# Deviations from that spec:
+#
+# Purposeful: search this file for "WATWG"
#
-# Instead, the data structure produced by this parser is an array of Nodes.
+# Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# stacks/lists
@id = "#{++prev_node_id}"
acknowledge_self_closing: ->
if @token?
- @token.flag 'did_self_close'
+ @token.flag 'did_self_close', true
else
@flag 'did_self_close', true
flag: (key, value = null) ->
h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
- listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
- noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
- ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
- script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
- style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
- template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
- thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
- wbr:NS_HTML, xmp:NS_HTML,
+ listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
+
+ menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
+
+ meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
+ noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
+ plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
+ select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
+ table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
+ textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
+ tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
# MathML:
mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
ychannelselector: 'yChannelSelector'
zoomandpan: 'zoomAndPan'
}
+foreign_attr_fixes = {
+ 'xlink:actuate': 'xlink actuate'
+ 'xlink:arcrole': 'xlink arcrole'
+ 'xlink:href': 'xlink href'
+ 'xlink:role': 'xlink role'
+ 'xlink:show': 'xlink show'
+ 'xlink:title': 'xlink title'
+ 'xlink:type': 'xlink type'
+ 'xml:base': 'xml base'
+ 'xml:lang': 'xml lang'
+ 'xml:space': 'xml space'
+ 'xmlns': 'xmlns'
+ 'xmlns:xlink': 'xmlns xlink'
+}
adjust_mathml_attributes = (t) ->
for a in t.attrs_a
if a[0] is 'definitionurl'
return
adjust_foreign_attributes = (t) ->
# fixfull
+ for a in t.attrs_a
+ if foreign_attr_fixes[a[0]]?
+ a[0] = foreign_attr_fixes[a[0]]
return
# decode_named_char_ref()
standard_scopers = {
applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
- template: NS_HTML, mi: NS_MATHML,
+ template: NS_HTML,
- mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
- 'annotation-xml': NS_MATHML,
+ mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
+ mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
}
for t in open_els
if t.name is tag_name and (namespace is null or namespace is t.namespace)
return true
- if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
+ if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
return false
return false
# this checks for a particular element, not by name
ins_mode t
return
if is_mathml_text_integration_point(acn)
- if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
+ if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
ins_mode t
return
if t.type is TYPE_TEXT
parse_error()
return if open_els.length < 2
second = open_els[open_els.length - 2]
- return unless second.ns is NS_HTML
+ return unless second.namespace is NS_HTML
return unless second.name is 'body'
return if template_tag_is_open()
- frameset_ok_flag = false
+ flag_frameset_ok = false
for a of t.attrs_a
second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
return
return if open_els.length < 2
second_i = open_els.length - 2
second = open_els[second_i]
- return unless second.ns is NS_HTML
+ return unless second.namespace is NS_HTML
return unless second.name is 'body'
- flag_frameset_ok = false
+ if flag_frameset_ok is false
+ return
if second.parent?
for el, i in second.parent.children
if el is second
reconstruct_afe()
insert_html_element t
return
- if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
+# the commented-out block below implements the W3C spec (kept for reference)
+# if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
+# if is_in_scope 'ruby', NS_HTML
+# generate_implied_end_tags()
+# unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
+# parse_error()
+# insert_html_element t
+# return
+# if t.type is TYPE_START_TAG and t.name is 'rt'
+# if is_in_scope 'ruby', NS_HTML
+# generate_implied_end_tags 'rtc' # arg is exception
+# unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
+# parse_error()
+# insert_html_element t
+# return
+# below implements the WHATWG spec (search tag: WATWG) https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
+ if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
if is_in_scope 'ruby', NS_HTML
generate_implied_end_tags()
unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
parse_error()
insert_html_element t
return
- if t.type is TYPE_START_TAG and t.name is 'rt'
+ if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
if is_in_scope 'ruby', NS_HTML
- generate_implied_end_tags 'rtc' # arg is exception
+ generate_implied_end_tags 'rtc'
unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
parse_error()
insert_html_element t
return
+# end WATWG chunk
if t.type is TYPE_START_TAG and t.name is 'math'
reconstruct_afe()
adjust_mathml_attributes t
ins_mode_in_table = (t) ->
switch t.type
when TYPE_TEXT
- if t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr'
+ if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
+ pending_table_character_tokens = []
original_ins_mode = ins_mode
ins_mode = ins_mode_in_table_text
process_token t
# 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
if t.type is TYPE_TEXT and t.text is "\u0000"
- # huh? I thought the tokenizer didn't emit these
+ # from javascript?
parse_error()
return
if t.type is TYPE_TEXT
insert_character old
else
for old in pending_table_character_tokens
- ins_mode_table_else old
- pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
+ ins_mode_in_table_else old
+ pending_table_character_tokens = []
ins_mode = original_ins_mode
process_token t
ins_mode_in_body t
return
if t.type is TYPE_END_TAG and t.name is 'html'
- # fixfull fragment case
+ if flag_fragment_parsing
+ parse_error()
+ return
ins_mode = ins_mode_after_after_body
return
if t.type is TYPE_EOF
# Anything else
parse_error()
ins_mode = ins_mode_in_body
+ process_token t
return
# 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
if t.name is 'script'
t.acknowledge_self_closing()
in_foreign_content_end_script()
+ # fixfull
else
open_els.shift()
t.acknowledge_self_closing()
return
loop # is this safe?
open_els.shift()
- cn = open_els[0]
- if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
+ if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
break
process_token t
return
in_foreign_content_end_script()
return
if t.type is TYPE_END_TAG
- if open_els[0].name.toLowerCase() isnt t.name
+ i = 0
+ node = open_els[i]
+ if node.name.toLowerCase() isnt t.name
parse_error()
- for node in open_els
+ loop
if node is open_els[open_els.length - 1]
return
if node.name.toLowerCase() is t.name
el = open_els.shift()
if el is node
return
+ i += 1
+ node = open_els[i]
if node.namespace is NS_HTML
break
ins_mode t # explicitly call HTML insertion mode
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
- return new_text_node c
+ return new_text_node "\ufffd"
when '' # EOF
return new_eof_token()
else
tok_state_self_closing_start_tag = ->
c = txt.charAt(cur++)
if c is '>'
- tok_cur_tag.flag 'self-closing'
+ tok_cur_tag.flag 'self-closing', true
tok_state = tok_state_data
return tok_cur_tag
if c is ''
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 1
- val = val.replace "\u0000", "\ufffd"
+ val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
tok_cur_tag.text += val
tok_state = tok_state_data
return tok_cur_tag
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 3
- val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
- val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
- val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
return new_character_token val # fixfull split
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# tokenizer initialization
tok_state = tok_state_data
- if args.name is "namespace-sensitivity.dat #1"
+ # text pre-processing
+ # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+ txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+ txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+ txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+
+ if args.name is "plain-text-unsafe.dat #4"
console.log "hi"
# process input
# http://www.w3.org/TR/html5/syntax.html#tree-construction