# HTML parser meant to run in a browser, in support of WYSIWYG editor
# Copyright 2015 Jason Woofenden
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
# This file implements a parser for html snippets, meant to be used by a
# WYSIWYG editor. Hence it does not attempt to parse doctypes, ,
# or tags, nor does it produce the top level "document" node in the dom
# tree, nor nodes for html, head or body. Comments containing "fixfull"
# indicate places where additional code is needed for full HTML document
# parsing.
#
# Instead, the data structure produced by this parser is an array of nodes.
#
# Each node is an obect of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2
TYPE_DOCTYPE = 3
# the following types are emited by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
# namespace constants
NS_HTML = 1
NS_MATHML = 2
NS_SVG = 3
class Node
constructor: (type, args = {}) ->
@type = type # one of the TYPE_* constants above
@name = args.name ? '' # tag name
@text = args.text ? '' # contents for text/comment nodes
@attrs = args.attrs ? {}
@attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
@children = args.children ? []
@namespace = args.namespace ? NS_HTML
@parent = args.parent ? null
shallow_clone: -> # return a new node that's the same except without the children or parent
# WARNING this doesn't work right on open tags that are still being parsed
attrs = {}
attrs[k] = v for k, v of @attrs
return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
serialize: -> # for unit tests
ret = ''
switch @type
when TYPE_TAG
ret += 'tag:'
ret += JSON.stringify @name
ret += ','
ret += JSON.stringify @attrs
ret += ',['
sep = ''
for c in @children
ret += sep
sep = ','
ret += c.serialize()
ret += ']'
when TYPE_TEXT
ret += 'text:'
ret += JSON.stringify @text
when TYPE_COMMENT
ret += 'comment:'
ret += JSON.stringify @text
when TYPE_DOCTYPE
ret += 'doctype'
# FIXME
else
ret += 'unknown:'
console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
return ret
# helpers: (only take args that are normally known when parser creates nodes)
new_open_tag = (name) ->
return new Node TYPE_OPEN_TAG, name: name
new_end_tag = (name) ->
return new Node TYPE_END_TAG, name: name
new_text_node = (txt) ->
return new Node TYPE_TEXT, text: txt
new_comment_node = (txt) ->
return new Node TYPE_COMMENT, text: txt
new_eof_token = ->
return new Node TYPE_EOF
new_aaa_bookmark = ->
return new Node TYPE_AAA_BOOKMARK
lc_alpha = "abcdefghijklmnopqrstuvwxqz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
legacy_char_refs = {
Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
yen: '¥', yuml: 'ÿ'
}
void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
svg_elements = [
'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
'view', 'vkern'
]
# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
mathml_elements = [
'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
'determinant', 'diff', 'divergence', 'divide', 'domain',
'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
'vectorproduct', 'xor'
]
# foreign_elements = [svg_elements..., mathml_elements...]
#normal_elements = All other allowed HTML elements are normal elements.
special_elements = {
# HTML:
address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
wbr:NS_HTML, xmp:NS_HTML,
# MathML:
mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
'annotation-xml':NS_MATHML,
# SVG:
foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}
formatting_elements = {
a: true, b: true, big: true, code: true, em: true, font: true, i: true,
nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
u: true
}
el_is_special = (e) ->
return special_elements[e] is e.namespace
# decode_named_char_ref()
#
# The list of named character references is _huge_ so ask the browser to decode
# for us instead of wasting bandwidth/space on including the table here.
#
# Pass without the "&" but with the ";" examples:
# for "&" pass "amp;"
# for "′" pass "x2032;"
g_dncr = {
cache: {}
textarea: document.createElement('textarea')
}
# TODO test this in IE8
decode_named_char_ref = (txt) ->
txt = "{txt}"
decoded = g_dncr.cache[txt]
return decoded if decoded?
g_dncr.textarea.innerHTML = txt
decoded = g_dncr.textarea.value
return null if decoded is txt
return g_dncr.cache[txt] = decoded
parse_html = (txt, parse_error_cb = null) ->
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
open_els = [] # stack of open elements
tree_state = null
tok_state = null
tok_cur_tag = null # partially parsed tag
flag_frameset_ok = null
flag_parsing = null
flag_foster_parenting = null
afe = [] # active formatting elements
parse_error = ->
if parse_error_cb?
parse_error_cb cur
else
console.log "Parse error at character #{cur} of #{txt.length}"
# the functions below impliment the Tree Contstruction algorithm
# http://www.w3.org/TR/html5/syntax.html#tree-construction
# But first... the helpers
template_tag_is_open = ->
for t in open_els
if t.type is TYPE_TAG and t.name is 'template'
return true
return false
is_in_scope_x = (tag_name, scope) ->
for t in open_els
if t.name is tag_name
return true
if t.name of scope
return false
return false
is_in_scope_x_y = (tag_name, scope, scope2) ->
for t in open_els
if t.name is tag_name
return true
if t.name of scope
return false
if t.name of scope2
return false
return false
standard_scopers = { # FIXME these are supposed to be namespace specific
'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
'foreignObject': true, 'desc': true, 'title'
}
button_scopers = button: true
li_scopers = ol: true, ul: true
table_scopers = html: true, table: true, template: true
is_in_scope = (tag_name) ->
return is_in_scope_x tag_name, standard_scopers
is_in_button_scope = (tag_name) ->
return is_in_scope_x_y tag_name, standard_scopers, button_scopers
is_in_table_scope = (tag_name) ->
return is_in_scope_x tag_name, table_scopers
is_in_select_scope = (tag_name) ->
for t in open_els
if t.name is tag_name
return true
if t.name isnt 'optgroup' and t.name isnt 'option'
return false
return false
# this checks for a particular element, not by name
el_is_in_scope = (el) ->
for t in open_els
if t is el
return true
if t.name of standard_scopers
return false
return false
# http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
# this implementation is structured (mostly) as described at the link above.
# capitalized comments are the "labels" described at the link above.
reconstruct_active_formatting_elements = ->
return if afe.length is 0
if afe[0].type is TYPE_MARKER or afe[0] in open_els
return
# Rewind
i = 0
loop
if i is afe.length - 1
break
i += 1
if afe[i].type is TYPE_MARKER or afe[i] in open_els
i -= 1 # Advance
break
# Create
loop
el = afe[i].shallow_clone()
tree_insert_element el
afe[i] = el
break if i is 0
i -= 1
# http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
# adoption agency algorithm
adoption_agency = (subject) ->
if open_els[0].name is subject
el = open_els[0]
open_els.shift()
# remove it from the list of active formatting elements (if found)
for t, i in afe
if t is el
afe.splice i, 1
break
return
outer = 0
loop
if outer >= 8
return
outer += 1
fe = null
for t, fe_index in afe
if t.type is TYPE_MARKER
break
if t.name is subject
fe = t
break
if fe is null
in_body_any_other_end_tag subject
return
in_open_els = false
for t in open_els
if t is fe
in_open_els = true
break
unless in_open_els
parse_error()
# "remove it from the list" must mean afe, since it's not in open_els
afe.splice fe_index, 1
return
unless el_is_in_scope fe
parse_error()
return
unless open_els[0] is fe
parse_error()
# continue
fb = null
fb_index
for t, i in open_els
if t is fe
break
if el_is_special t
fb = t
fb_index = i
if fb is null
loop
t = open_els.shift()
if t is fe
afe.splice fe_index, 1
return
ca = open_els[fe_index + 1] # common ancestor
node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
# 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
bookmark = new_aaa_bookmark()
for t, i in afe
if t is fe
afe.splice i, 0, bookmark
node = last_node = fb
inner = 0
loop
inner += 1
node_next = null
for t, i in open_els
if t is node
node_next = open_els[i + 1]
break
node = node_next ? node_above
# TODO make sure node_above gets re-set if/when node is removed from open_els
if node is fe
break
node_in_afe = false
for t, i of afe
if t is node
if inner > 3
afe.splice i, 1
else
node_in_afe = true
break
unless node_in_afe
for t, i in open_els
if t is node
node_above = open_els[i + 1]
open_els.splice i, 1
break
continue
# 7. reate an element for the token for which the element node
# was created, in the HTML namespace, with common ancestor as
# the intended parent; replace the entry for node in the list
# of active formatting elements with an entry for the new
# element, replace the entry for node in the stack of open
# elements with an entry for the new element, and let node be
# the new element.
new_node = node.shallow_clone()
for t, i in afe
if t is node
afe[i] = new_node
break
for t, i in open_els
if t is node
open_els[i] = new_node
break
node = new_node
# 8. If last node is furthest block, then move the
# aforementioned bookmark to be immediately after the new node
# in the list of active formatting elements.
if last_node is fb
for t, i in afe
if t is bookmark
afe.splice i, 1
for t, i in afe
if t is node
# TODO test: position i gets you "after"?
afe.splice i, 0, new_aaa_bookmark()
# 9. Insert last node into node, first removing it from its
# previous parent node if any.
if last_node.parent?
for c, i of last_node.parent.children
if c is last_node
last_node.parent.children.splice i, 1
node.children.push last_node
last_node.parent = node
# 10. Let last node be node.
last_node = node
# 11. Return to the step labeled inner loop.
# 14. Insert whatever last node ended up being in the previous step
# at the appropriate place for inserting a node, but using common
# ancestor as the override target.
tree_insert_element last_node, ca
# 15. Create an element for the token for which formatting element
# was created, in the HTML namespace, with furthest block as the
# intended parent.
new_element = fe.shallow_clone()
# 16. Take all of the child nodes of furthest block and append them
# to the element created in the last step.
while fb.children.length
t = fb.children.shift()
t.parent = new_element
new_element.children.push t
# 17. Append that new element to furthest block.
new_element.parent = fb
fb.children.push new_element
# 18. Remove formatting element from the list of active formatting
# elements, and insert the new element into the list of active
# formatting elements at the position of the aforementioned
# bookmark.
for t, i in afe
if t is fe
afe.splice i, 1
break
for t, i in afe
if t is bookmark
afe[i] = node
break
# 19. Remove formatting element from the stack of open elements,
# and insert the new element into the stack of open elements
# immediately below the position of furthest block in that stack.
for t, i of open_els
if t is fe
open_els.splice i, 1
break
for t, i of open_els
if t is fb
open_els.splice i, 0, new_element
break
# 20. Jump back to the step labeled outer loop.
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# FIXME implement this
close_p_if_in_button_scope = ->
if open_els[0].name is 'p'
open_els.pop()
return
#p = find_button_scope 'p'
#if p?
# TODO generate_implied_end_tags except for p tags
# TODO parse_error unless open_els[0].name is 'p'
# TODO pop stack until 'p' popped
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
tree_insert_text = (t) ->
dest = adjusted_insertion_location()
if dest[1] > 0
prev = dest[0].children[dest[1] - 1]
if prev.type is TYPE_TEXT
prev.text += t.text
return
dest[0].children.splice dest[1], 0, t
# 8.2.5.1
# http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
# http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
adjusted_insertion_location = (override_target = null) ->
# 1. If there was an override target specified, then let target be the
# override target.
if override_target?
target = override_target
else # Otherwise, let target be the current node.
target = open_els[0]
# 2. Determine the adjusted insertion location using the first matching
# steps from the following list:
#
# If foster parenting is enabled and target is a table, tbody, tfoot,
# thead, or tr element Foster parenting happens when content is
# misnested in tables.
if flag_foster_parenting and target.name in foster_parenting_targets
console.log "foster parenting isn't implemented yet" # TODO
# 1. Let last template be the last template element in the stack of
# open elements, if any.
# 2. Let last table be the last table element in the stack of open
# elements, if any.
# 3. If there is a last template and either there is no last table,
# or there is one, but last template is lower (more recently added)
# than last table in the stack of open elements, then: let adjusted
# insertion location be inside last template's template contents,
# after its last child (if any), and abort these substeps.
# 4. If there is no last table, then let adjusted insertion
# location be inside the first element in the stack of open
# elements (the html element), after its last child (if any), and
# abort these substeps. (fragment case)
# 5. If last table has a parent element, then let adjusted
# insertion location be inside last table's parent element,
# immediately before last table, and abort these substeps.
# 6. Let previous element be the element immediately above last
# table in the stack of open elements.
# 7. Let adjusted insertion location be inside previous element,
# after its last child (if any).
# Note: These steps are involved in part because it's possible for
# elements, the table element in this case in particular, to have
# been moved by a script around in the DOM, or indeed removed from
# the DOM entirely, after the element was inserted by the parser.
else
# Otherwise Let adjusted insertion location be inside target, after
# its last child (if any).
target_i = target.children.length
# 3. If the adjusted insertion location is inside a template element,
# let it instead be inside the template element's template contents,
# after its last child (if any). TODO
# 4. Return the adjusted insertion location.
return [target, target_i]
# http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
# aka create_an_element_for_token
token_to_element = (t, namespace, intended_parent) ->
t.type = TYPE_TAG # not TYPE_OPEN_TAG
# convert attributes into a hash
attrs = {}
while t.attrs_a.length
a = t.attrs_a.pop()
attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
# TODO 2. If the newly created element has an xmlns attribute in the
# XMLNS namespace whose value is not exactly the same as the element's
# namespace, that is a parse error. Similarly, if the newly created
# element has an xmlns:xlink attribute in the XMLNS namespace whose
# value is not the XLink Namespace, that is a parse error.
# fixfull: the spec says stuff about form pointers and ownerDocument
return el
# FIXME read implement "foster parenting" part
# FIXME read spec, do this right
# FIXME implement the override target thing
# note: this assumes it's an open tag
# TODO tree_insert_html_element = (t, ...
tree_insert_element = (el, override_target = null, namespace = null) ->
dest = adjusted_insertion_location override_target
if el.type is TYPE_OPEN_TAG # means it's a "token"
el = token_to_element el, namespace, dest[0]
# fixfull: Document nodes sometimes can't accept more chidren
dest[0].children.splice dest[1], 0, el
el.parent = dest[0]
open_els.unshift el
return el
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
tree_insert_a_comment = (t) ->
# FIXME read spec for "adjusted insertion location, etc, this might be wrong
open_els[0].children.push t
# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
for node, i in open_els
if node.name is name
# FIXME generate implied end tags except those with name==name
parse_error() unless i is 0
while i > 0
open_els.shift()
i -= 1
open_els.shift()
return
if special_elements[node.name]?
parse_error()
return
tree_in_body = (t) ->
switch t.type
when TYPE_TEXT
switch t.text
when "\u0000"
parse_error()
when "\t", "\u000a", "\u000c", "\u000d", ' '
reconstruct_active_formatting_elements()
tree_insert_text t
else
reconstruct_active_formatting_elements()
tree_insert_text t
flag_frameset_ok = false
when TYPE_COMMENT
tree_insert_a_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_OPEN_TAG
switch t.name
when 'html'
parse_error()
return if template_tag_is_open()
root_attrs = open_els[open_els.length - 1].children
for k, v of t.attrs
root_attrs[k] = v unless root_attrs[k]?
when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
# FIXME also do this for (end tag)
return tree_in_head t
when 'body'
parse_error()
# TODO
when 'frameset'
parse_error()
# TODO
when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
close_p_if_in_button_scope()
tree_insert_element t
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
close_p_if_in_button_scope()
if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
parse_error()
open_els.shift()
tree_insert_element t
# TODO lots more to implement here
when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
reconstruct_active_formatting_elements()
el = tree_insert_element t
afe.push el
# TODO lots more to implement here
else # any other start tag
reconstruct_active_formatting_elements()
tree_insert_element t
when TYPE_EOF
ok_tags = {
dd: true, dt: true, li: true, p: true, tbody: true, td: true,
tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
}
for t in open_els
unless ok_tags[t.name]?
parse_error()
break
# TODO stack of template insertion modes thing
flag_parsing = false # stop parsing
when TYPE_END_TAG
switch t.name
when 'body'
unless is_in_scope 'body'
parse_error()
return
# TODO implement parse error and move to tree_after_body
when 'html'
unless is_in_scope 'body' # weird, but it's what the spec says
parse_error()
return
# TODO implement parse error and move to tree_after_body, reprocess
# TODO lots more close tags to implement here
when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
adoption_agency t.name
# TODO lots more close tags to implement here
else
in_body_any_other_end_tag t.name
return
# the functions below implement the tokenizer stats described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
return new_text_node tokenize_character_reference()
when '<'
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
return new_text_node c
when '' # EOF
return new_eof_token()
else
return new_text_node c
return null
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# not needed: tok_state_character_reference_in_data = ->
# just call tok_state_character_reference_in_data()
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
switch c = txt.charAt(cur++)
when '!'
tok_state = tok_state_markup_declaration_open
when '/'
tok_state = tok_state_end_tag_open
when '?'
parse_error()
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
tok_cur_tag = new_open_tag c
tok_state = tok_state_tag_name
else if uc_alpha.indexOf(c) > -1
tok_cur_tag = new_open_tag c.toLowerCase()
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_data
cur -= 1 # we didn't parse/handle the char after <
return new_text_node '<'
return null
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
tok_state_end_tag_open = ->
switch c = txt.charAt(cur++)
when '>'
parse_error()
tok_state = tok_state_data
when '' # EOF
parse_error()
tok_state = tok_state_data
return new_text_node ''
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag = new_end_tag c.toLowerCase()
tok_state = tok_state_tag_name
else if lc_alpha.indexOf(c) > -1
tok_cur_tag = new_end_tag c
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_bogus_comment
return null
# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
tok_state_tag_name = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
parse_error()
tok_cur_tag.name += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag.name += c.toLowerCase()
else
tok_cur_tag.name += c
return null
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
tok_state_before_attribute_name = ->
attr_name = null
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
return null
when '/'
tok_state = tok_state_self_closing_start_tag
return null
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
parse_error()
attr_name = "\ufffd"
when '"', "'", '<', '='
parse_error()
attr_name = c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
attr_name = c.toLowerCase()
else
attr_name = c
if attr_name?
tok_cur_tag.attrs_a.unshift [attr_name, '']
tok_state = tok_state_attribute_name
return null
# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
tok_state_attribute_name = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_after_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '='
tok_state = tok_state_before_attribute_value
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
parse_error()
tok_cur_tag.attrs_a[0][0] = "\ufffd"
when '"', "'", '<'
parse_error()
tok_cur_tag.attrs_a[0][0] = c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
else
tok_cur_tag.attrs_a[0][0] += c
return null
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state_before_attribute_value = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
return null
when '"'
tok_state = tok_state_attribute_value_double_quoted
when '&'
tok_state = tok_state_attribute_value_unquoted
cur -= 1
when "'"
tok_state = tok_state_attribute_value_single_quoted
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
tok_state = tok_state_attribute_value_unquoted
when '>'
# Parse error
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when '' # EOF
parse_error()
tok_state = tok_state_data
else
tok_cur_tag.attrs_a[0][1] += c
tok_state = tok_state_attribute_value_unquoted
return null
# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
tok_state_attribute_value_double_quoted = ->
switch c = txt.charAt(cur++)
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
tok_state_attribute_value_single_quoted = ->
switch c = txt.charAt(cur++)
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
tok_state_attribute_value_unquoted = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
# Parse Error if ', <, = or ` (backtick)
tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
tok_state_after_attribute_value_quoted = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when '' # EOF
parse_error()
tok_state = tok_state_data
else
# Parse Error
tok_state = tok_state_before_attribute_name
cur -= 1 # we didn't handle that char
return null
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
# returns a string (NOT a text node)
tokenize_character_reference = (allowed_char = null, in_attr = false) ->
if cur >= txt.length
return '&'
switch c = txt.charAt(cur)
when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
# explicitly not a parse error
return '&'
when ';'
# there has to be "one or more" alnums between & and ; to be a parse error
return '&'
when '#'
if cur + 1 >= txt.length
return '&'
if txt.charAt(cur + 1).toLowerCase() is 'x'
prefix = '#x'
charset = hex_chars
start = cur + 2
else
charset = digits
start = cur + 1
prefix = '#'
i = 0
while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
i += 1
if i is 0
return '&'
if txt.charAt(start + i) is ';'
i += 1
# FIXME This is supposed to generate parse errors for some chars
decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
if decoded?
cur = start + i
return decoded
return '&'
else
for i in [0...31]
if alnum.indexOf(txt.charAt(cur + i)) is -1
break
if i is 0
# exit early, because parse_error() below needs at least one alnum
return '&'
if txt.charAt(cur + i) is ';'
i += 1 # include ';' terminator in value
decoded = decode_named_char_ref txt.substr(cur, i)
if decoded?
cur += i
return decoded
parse_error()
return '&'
else
# no ';' terminator (only legacy char refs)
max = i
for i in [2..max] # no prefix matches, so ok to check shortest first
c = legacy_char_refs[txt.substr(cur, i)]
if c?
if in_attr
if txt.charAt(cur + i) is '='
# "because some legacy user agents will
# misinterpret the markup in those cases"
parse_error()
return '&'
if alnum.indexOf(txt.charAt(cur + i)) > -1
# this makes attributes forgiving about url args
return '&'
# ok, and besides the weird exceptions for attributes...
# return the matching char
cur += i # consume entity chars
parse_error() # because no terminating ";"
return c
parse_error()
return '&'
return # never reached
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
tree = new Node TYPE_TAG, name: 'html'
open_els = [tree]
tree_state = tree_in_body
flag_frameset_ok = true
flag_parsing = true
flag_foster_parenting = false
afe = [] # active formatting elements
# tokenizer initialization
tok_state = tok_state_data
# proccess input
while flag_parsing
t = tok_state()
if t?
tree_state t
return tree.children
# everything below is tests on the above
test_equals = (description, output, expected_output) ->
if output is expected_output
console.log "passed." # don't say name, so smart consoles can merge all of these
else
console.log "FAILED: \"#{description}\""
console.log " Expected: #{expected_output}"
console.log " Actual: #{output}"
test_parser = (args) ->
parse_errors = []
errors_cb = (i) ->
parse_errors.push i
parsed = parse_html args.html, errors_cb
serialized = ''
sep = ''
for t in parsed
serialized += sep
sep = ','
serialized += t.serialize()
if serialized isnt args.expected or parse_errors.length isnt args.errors
console.log "FAILED: \"#{args.name}\""
else
console.log "passed \"#{args.name}\""
if serialized isnt args.expected
console.log " Input: #{args.html}"
console.log " Correct: #{args.expected}"
console.log " Output: #{serialized}"
if parse_errors.length isnt args.errors
console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
test_parser name: "empty", \
html: "",
expected: '',
errors: 0
test_parser name: "just text", \
html: "abc",
expected: 'text:"abc"',
errors: 0
test_parser name: "named entity", \
html: "a&1234",
expected: 'text:"a&1234"',
errors: 0
test_parser name: "broken named character references", \
html: "1&2&&3&aabbcc;",
expected: 'text:"1&2&&3&aabbcc;"',
errors: 2
test_parser name: "numbered entity overrides", \
html: "1 ",
expected: 'text:"1€€ ƒ"',
errors: 0
test_parser name: "open tag", \
html: "foobar",
expected: 'text:"foo",tag:"span",{},[text:"bar"]',
errors: 1 # no close tag
test_parser name: "open tag with attributes", \
html: "foobar",
expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
errors: 1 # no close tag
test_parser name: "open tag with attributes of various quotings", \
html: "foobar",
expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
errors: 1 # no close tag
test_parser name: "attribute entity exceptions dq", \
html: "foobar",
expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
errors: 2 # no close tag, &= in attr
test_parser name: "attribute entity exceptions sq", \
html: "foo bar",
expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
errors: 2 # no close tag, &= in attr
test_parser name: "attribute entity exceptions uq", \
html: "foo bar",
expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
errors: 2 # no close tag, &= in attr
test_parser name: "matching closing tags", \
html: "foo hi bar",
expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
errors: 0
test_parser name: "missing closing tag inside", \
html: "foobarbaz
qux",
expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
errors: 1 # close tag mismatch
test_parser name: "mis-matched closing tags", \
html: "123456
78",
expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
errors: 2 # misplaced , no at the end
test_parser name: "mis-matched formatting elements", \
html: "123456 7890",
expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
errors: 1 # no idea how many their should be
test_parser name: "crazy formatting elements test", \
html: "first
second ",
# chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
# firefox does this:
expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
errors: 6 # no idea how many there should be
# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
test_parser name: "html5lib aaa 1", \
html: '
',
expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
errors: 2
test_parser name: "html5lib aaa 2", \
html: '12
3',
expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
errors: 2
test_parser name: "html5lib aaa 3", \
html: '12 3',
expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
errors: 2
test_parser name: "html5lib aaa 4", \
html: '12 3 ',
expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
errors: 2
test_parser name: "html5lib aaa 5", \
html: '1',
expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"]text:"5"]',
errors: 3