# HTML parser meant to run in a browser, in support of WYSIWYG editor
# Copyright 2015 Jason Woofenden
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
# This file implements a parser for html snippets, meant to be used by a
# WYSIWYG editor. Hence it does not attempt to parse doctypes, ,
# or tags, nor does it produce the top level "document" node in the dom
# tree, nor nodes for html, head or body.
#
# Instead, the data structure produced by this parser is an array of nodes.
#
# Each node is an array. The first element in the array is an integer (one of
# the TYPE_* constants below) followed by the appropriate fields for that type
# (shown below in the comments after the TYPE_* definition.)
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2
TYPE_DOCTYPE = 3
# the following types are emited by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
lc_alpha = "abcdefghijklmnopqrstuvwxqz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
scopers = { # FIXME these are supposed to be namespace specific
'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
'foreignObject', 'desc', 'title'
}
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
legacy_char_refs = {
Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
yen: '¥', yuml: 'ÿ'
}
void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
svg_elements = [
'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
'view', 'vkern'
]
# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
mathml_elements = [
'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
'determinant', 'diff', 'divergence', 'divide', 'domain',
'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
'vectorproduct', 'xor'
]
# foreign_elements = [svg_elements..., mathml_elements...]
#normal_elements = All other allowed HTML elements are normal elements.
special_elements = {
# from HTML:
address: true, applet: true, area: true, article: true, aside: true,
base: true, basefont: true, bgsound: true, blockquote: true, body: true,
br: true, button: true, caption: true, center: true, col: true,
colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
h3: true, h4: true, h5: true, h6: true, head: true, header: true,
hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
isindex: true, li: true, link: true, listing: true, main: true,
marquee: true, meta: true, nav: true, noembed: true, noframes: true,
noscript: true, object: true, ol: true, p: true, param: true,
plaintext: true, pre: true, script: true, section: true, select: true,
source: true, style: true, summary: true, table: true, tbody: true,
td: true, template: true, textarea: true, tfoot: true, th: true,
thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
xmp: true,
# from MathML:
mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
# from SVG:
foreignObject: true, desc: true, title: true
}
formatting_elements = {
a: true, b: true, big: true, code: true, em: true, font: true, i: true,
nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
u: true
}
# decode_named_char_ref()
#
# The list of named character references is _huge_ so ask the browser to decode
# for us instead of wasting bandwidth/space on including the table here.
#
# Pass without the "&" but with the ";" examples:
# for "&" pass "amp;"
# for "′" pass "x2032;"
g_dncr = {
cache: {}
textarea: document.createElement('textarea')
}
# TODO test this in IE8
decode_named_char_ref = (txt) ->
txt = "{txt}"
decoded = g_dncr.cache[txt]
return decoded if decoded?
g_dncr.textarea.innerHTML = txt
decoded = g_dncr.textarea.value
return null if decoded is txt
return g_dncr.cache[txt] = decoded
parse_html = (txt) ->
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
open_tags = [] # stack of open elements
tree_state = null
tok_state = null
tok_cur_tag = null # partially parsed tag
flag_frameset_ok = null
flag_parsing = null
parse_error = ->
console.log "Parse error at character #{cur} of #{txt.length}"
# the functions below impliment the Tree Contstruction algorithm
# http://www.w3.org/TR/html5/syntax.html#tree-construction
# But first... the helpers
template_tag_is_open = ->
for t of open_tags
if t[0] is TYPE_TAG and t[1] is 'template'
return true
return false
is_in_scope = (tag_name) ->
for t of open_tags
if t[0] is TYPE_TAG and t[1] is tag_name
return true
# FIXME bail if in scopers
return false
reconstruct_active_formatting_elements = ->
# FIXME implement this
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# FIXME implement this
close_p_if_in_button_scope = ->
if open_tags[0][1] is 'p' # FIXME
open_tags.pop()
return
#p = find_button_scope 'p'
#if p?
# TODO generate_implied_end_tags except for p tags
# TODO parse_error unless open_tags[0][1] is 'p'
# TODO pop stack until 'p' popped
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
tree_insert_a_character = (t) ->
# FIXME read spec for "adjusted insertion location, etc, this might be wrong
if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
else
open_tags[0][3].push t
# FIXME read spec, do this right
# note: this assumes it's an open tag
tree_insert_tag = (t) ->
t[0] = TYPE_TAG # not TYPE_OPEN_TAG
# convert attributes into a hash
attrs = {}
while t[2].length
a = t[2].pop()
attrs[a[0]] = a[1]
t[2] = attrs
open_tags[0][3].push t
open_tags.unshift t
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
tree_insert_a_comment = (t) ->
# FIXME read spec for "adjusted insertion location, etc, this might be wrong
open_tags[0][3].push t
# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
tree_in_body = (t) ->
switch t[0]
when TYPE_TEXT
switch t[1]
when "\u0000"
parse_error()
when "\t", "\u000a", "\u000c", "\u000d", ' '
reconstruct_active_formatting_elements()
tree_insert_a_character t
else
reconstruct_active_formatting_elements()
tree_insert_a_character t
flag_frameset_ok = false
when TYPE_COMMENT
tree_insert_a_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_OPEN_TAG
switch t[1]
when 'html'
parse_error()
return if template_tag_is_open()
root_attrs = open_tags[open_tags.length - 1][3]
for k, v of t[2]
root_attrs[k] = v unless root_attrs[k]?
when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
# FIXME also do this for (end tag)
return tree_in_head t
when 'body'
parse_error()
# TODO
when 'frameset'
parse_error()
# TODO
when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
close_p_if_in_button_scope()
tree_insert_tag t
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
close_p_if_in_button_scope()
if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
parse_error()
open_tags.shift()
tree_insert_tag t
# TODO lots more to implement here
else # any other start tag
reconstruct_active_formatting_elements()
tree_insert_tag t
when TYPE_EOF
ok_tags = {
dd: true, dt: true, li: true, p: true, tbody: true, td: true,
tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
}
for t in open_tags
unless ok_tags[t[1]]?
parse_error()
break
# TODO stack of template insertion modes thing
flag_parsing = false # stop parsing
when TYPE_END_TAG
switch t[1]
when 'body'
unless is_in_scope 'body'
parse_error()
return
# TODO implement parse error and move to tree_after_body
when 'html'
unless is_in_scope 'body' # weird, but it's what the spec says
parse_error()
return
# TODO implement parse error and move to tree_after_body, reprocess
# TODO lots more close tags to implement here
else
for node, i in open_tags
if node[1] is t[1]
# FIXME generate implied end tags except those with name==t[1]
parse_error() unless i is 0
while i > 0
open_tags.shift()
i -= 1
open_tags.shift()
return
if special_elements[node[1]]?
parse_error()
return
# the functions below implement the tokenizer stats described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
return [TYPE_TEXT, tokenize_character_reference()]
when '<'
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
return [TYPE_TEXT, c]
when '' # EOF
return [TYPE_EOF]
else
return [TYPE_TEXT, c]
return null
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# not needed: tok_state_character_reference_in_data = ->
# just call tok_state_character_reference_in_data()
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
switch c = txt.charAt(cur++)
when '!'
tok_state = tok_state_markup_declaration_open
when '/'
tok_state = tok_state_end_tag_open
when '?'
parse_error()
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
tok_state = tok_state_tag_name
else if uc_alpha.indexOf(c) > -1
tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_data
cur -= 1 # we didn't parse/handle the char after <
return [TYPE_TEXT, '<']
return null
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
tok_state_end_tag_open = ->
switch c = txt.charAt(cur++)
when '>'
parse_error()
tok_state = tok_state_data
when '' # EOF
parse_error()
tok_state = tok_state_data
return [TYPE_TEXT, '']
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
tok_state = tok_state_tag_name
else if lc_alpha.indexOf(c) > -1
tok_cur_tag = [TYPE_END_TAG, c, [], []]
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_bogus_comment
return null
# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
tok_state_tag_name = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
parse_error()
tok_cur_tag[1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag[1] += c.toLowerCase()
else
tok_cur_tag[1] += c
return null
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
tok_state_before_attribute_name = ->
attr_name = null
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
return null
when '/'
tok_state = tok_state_self_closing_start_tag
return null
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
parse_error()
attr_name = "\ufffd"
when '"', "'", '<', '='
parse_error()
attr_name = c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
attr_name = c.toLowerCase()
else
attr_name = c
if attr_name?
tok_cur_tag[2].unshift [attr_name, '']
tok_state = tok_state_attribute_name
return null
# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
tok_state_attribute_name = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_after_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '='
tok_state = tok_state_before_attribute_value
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
parse_error()
tok_cur_tag[2][0][0] = "\ufffd"
when '"', "'", '<'
parse_error()
tok_cur_tag[2][0][0] = c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag[2][0][0] = c.toLowerCase()
else
tok_cur_tag[2][0][0] += c
return null
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state_before_attribute_value = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
return null
when '"'
tok_state = tok_state_attribute_value_double_quoted
when '&'
tok_state = tok_state_attribute_value_unquoted
cur -= 1
when "'"
tok_state = tok_state_attribute_value_single_quoted
when "\u0000"
# Parse error
tok_cur_tag[2][0][1] += "\ufffd"
tok_state = tok_state_attribute_value_unquoted
when '>'
# Parse error
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when '' # EOF
parse_error()
tok_state = tok_state_data
else
tok_cur_tag[2][0][1] += c
tok_state = tok_state_attribute_value_unquoted
return null
# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
tok_state_attribute_value_double_quoted = ->
switch c = txt.charAt(cur++)
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
when "\u0000"
# Parse error
tok_cur_tag[2][0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
tok_cur_tag[2][0][1] += c
return null
# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
tok_state_attribute_value_single_quoted = ->
switch c = txt.charAt(cur++)
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
when "\u0000"
# Parse error
tok_cur_tag[2][0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
tok_cur_tag[2][0][1] += c
return null
# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
tok_state_attribute_value_unquoted = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
tok_cur_tag[2][0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
# Parse Error if ', <, = or ` (backtick)
tok_cur_tag[2][0][1] += c
return null
# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
tok_state_after_attribute_value_quoted = ->
switch c = txt.charAt(cur++)
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when '' # EOF
parse_error()
tok_state = tok_state_data
else
# Parse Error
tok_state = tok_state_before_attribute_name
cur -= 1 # we didn't handle that char
return null
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
# returns a string (NOT a text node)
tokenize_character_reference = (allowed_char = null, in_attr = false) ->
if cur >= txt.length
return '&'
switch c = txt.charAt(cur)
when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
# explicitly not a parse error
return '&'
when ';'
# there has to be "one or more" alnums between & and ; to be a parse error
return '&'
when '#'
if cur + 1 >= txt.length
return '&'
if txt.charAt(cur + 1).toLowerCase() is 'x'
prefix = '#x'
charset = hex_chars
start = cur + 2
else
charset = digits
start = cur + 1
prefix = '#'
i = 0
while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
i += 1
if i is 0
return '&'
if txt.charAt(start + i) is ';'
i += 1
# FIXME This is supposed to generate parse errors for some chars
decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
if decoded?
cur = start + i
return decoded
return '&'
else
for i in [0...31]
if alnum.indexOf(txt.charAt(cur + i)) is -1
break
if i is 0
# exit early, because parse_error() below needs at least one alnum
return '&'
if txt.charAt(cur + i) is ';'
i += 1 # include ';' terminator in value
decoded = decode_named_char_ref txt.substr(cur, i)
if decoded?
cur += i
return decoded
parse_error()
return '&'
else
# no ';' terminator (only legacy char refs)
max = i
for i in [2..max] # no prefix matches, so ok to check shortest first
c = legacy_char_refs[txt.substr(cur, i)]
if c?
if in_attr
if txt.charAt(cur + i) is '='
# "because some legacy user agents will
# misinterpret the markup in those cases"
parse_error()
return '&'
if alnum.indexOf(txt.charAt(cur + i)) > -1
# this makes attributes forgiving about url args
return '&'
# ok, and besides the weird exceptions for attributes...
# return the matching char
cur += i # consume entity chars
parse_error() # because no terminating ";"
return c
parse_error()
return '&'
return # never reached
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
tree = [TYPE_TAG, 'html', {}, []]
open_tags = [tree]
tree_state = tree_in_body
flag_frameset_ok = true
flag_parsing = true
# tokenizer initialization
tok_state = tok_state_data
# proccess input
while flag_parsing
t = tok_state()
if t?
tree_state t
return tree[3]
# everything below is tests on the above
test_equals = (description, fn, args..., expected_output) ->
output = fn.apply this, args
if output is expected_output
console.log "passed: #{description}."
else
console.log "FAILED: #{description}..."
console.log " Expected: #{expected_output}"
console.log " Actual: #{output}"
html_to_json = (html) ->
return JSON.stringify parse_html html
test_equals "empty", html_to_json, "", '[]'
test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
test_equals "numbered entity overrides", html_to_json, "1 ", '[[1,"1€€ ƒ"]]'
test_equals "open tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
test_equals "open tag with attributes of various quotings", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
test_equals "attribute entity exceptions dq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions sq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions uq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "matching closing tags", html_to_json, "foohibar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
test_equals "mis-matched closing tags", html_to_json, "foobarbaz
qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'