1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an object of the Node class. Here are the Node types:
26 TYPE_TAG = 0 # name, {attributes}, [children]
27 TYPE_TEXT = 1 # "text"
30 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
32 TYPE_END_TAG = 5 # name
34 TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
35 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
constructor: (@type, args = {}) ->
  # @type comes straight from the first parameter and is one of the TYPE_*
  # constants. Every remaining field is optional: it is read from args and
  # replaced by a neutral default when absent (null/undefined).
  @name = args.name ? ''            # tag name
  @text = args.text ? ''            # contents for text/comment nodes
  @attrs = args.attrs ? {}
  # Attributes still being collected (TYPE_OPEN_TAG only). Note the value is
  # read from args.attr_k, not args.attrs_a.
  @attrs_a = args.attr_k ? []
  @children = args.children ? []
  @namespace = args.namespace ? NS_HTML
  @parent = args.parent ? null
52 shallow_clone: -> # return a new node that's the same except without the children or parent
53 # WARNING this doesn't work right on open tags that are still being parsed
55 attrs[k] = v for k, v of @attrs
56 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
57 serialize: -> # for unit tests
62 ret += JSON.stringify @name
64 ret += JSON.stringify @attrs
74 ret += JSON.stringify @text
77 ret += JSON.stringify @text
83 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
86 # helpers: (only take args that are normally known when parser creates nodes)
new_open_tag = (name) ->
  # Wrap a tag name in a TYPE_OPEN_TAG node; all other fields take Node's defaults.
  new Node TYPE_OPEN_TAG, {name}
new_end_tag = (name) ->
  # Wrap a tag name in a TYPE_END_TAG node; all other fields take Node's defaults.
  new Node TYPE_END_TAG, {name}
new_text_node = (txt) ->
  # Build a TYPE_TEXT node whose text field holds txt.
  new Node(TYPE_TEXT, text: txt)
new_comment_node = (txt) ->
  # Build a TYPE_COMMENT node whose text field holds txt.
  new Node(TYPE_COMMENT, text: txt)
96 return new Node TYPE_EOF
98 return new Node TYPE_AAA_BOOKMARK
# Character classes used by the tokenizer; membership is tested with
# String::indexOf, so each string must contain every character of its class.
# (Previously both alphabets ended in "wxqz"/"WXQZ": 'y'/'Y' was missing and
# 'q'/'Q' appeared twice, so tags and character references containing a 'y'
# were mis-tokenized.)
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
106 # some SVG elements have dashes in them
107 tag_name_chars = alnum + "-"
109 # http://www.w3.org/TR/html5/infrastructure.html#space-character
110 space_chars = "\u0009\u000a\u000c\u000d\u0020"
112 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
113 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
115 # These are the character references that don't need a terminating semicolon
116 # min length: 2, max: 6, none are a prefix of any other.
118 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
119 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
120 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
121 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
122 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
123 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
124 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
125 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
126 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
127 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
128 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
129 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
130 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
131 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
132 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
133 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
134 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
138 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
139 raw_text_elements = ['script', 'style']
140 escapable_raw_text_elements = ['textarea', 'title']
141 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
143 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
144 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
145 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
146 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
147 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
148 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
149 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
150 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
151 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
152 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
153 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
154 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
155 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
156 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
160 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
162 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
163 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
164 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
165 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
166 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
167 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
168 'determinant', 'diff', 'divergence', 'divide', 'domain',
169 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
170 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
171 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
172 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
173 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
174 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
175 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
176 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
177 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
178 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
179 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
180 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
181 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
182 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
183 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
184 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
185 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
186 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
187 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
188 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
189 'vectorproduct', 'xor'
191 # foreign_elements = [svg_elements..., mathml_elements...]
192 #normal_elements = All other allowed HTML elements are normal elements.
196 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
197 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
198 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
199 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
200 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
201 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
202 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
203 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
204 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
205 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
206 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
207 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
208 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
209 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
210 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
211 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
212 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
213 wbr:NS_HTML, xmp:NS_HTML,
216 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
217 'annotation-xml':NS_MATHML,
220 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
223 formatting_elements = {
224 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
225 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
el_is_special = (e) ->
  # Report whether node e belongs to the "special" category: its tag name must
  # be listed in special_elements under the same namespace the node carries.
  #
  # e: a Node (its name and namespace fields are read).
  # Returns true/false.
  #
  # The table is keyed by tag name, so it has to be indexed with e.name;
  # indexing with the node object itself never matches any entry.
  return special_elements[e.name] is e.namespace
232 # decode_named_char_ref()
234 # The list of named character references is _huge_ so ask the browser to decode
235 # for us instead of wasting bandwidth/space on including the table here.
237 # Pass without the "&" but with the ";" examples:
238 # for "&" pass "amp;"
239 # for "′" pass "x2032;"
242 textarea: document.createElement('textarea')
244 # TODO test this in IE8
245 decode_named_char_ref = (txt) ->
247 decoded = g_dncr.cache[txt]
248 return decoded if decoded?
249 g_dncr.textarea.innerHTML = txt
250 decoded = g_dncr.textarea.value
251 return null if decoded is txt
252 return g_dncr.cache[txt] = decoded
254 parse_html = (txt, parse_error_cb = null) ->
255 cur = 0 # index of next char in txt to be parsed
256 # declare tree and tokenizer variables so they're in scope below
258 open_els = [] # stack of open elements
261 tok_cur_tag = null # partially parsed tag
262 flag_frameset_ok = null
264 afe = [] # active formatting elements
270 console.log "Parse error at character #{cur} of #{txt.length}"
273 # the functions below implement the Tree Construction algorithm
274 # http://www.w3.org/TR/html5/syntax.html#tree-construction
276 # But first... the helpers
277 template_tag_is_open = ->
279 if t.type is TYPE_TAG and t.name is 'template'
282 is_in_scope_x = (tag_name, scope) ->
284 if t.name is tag_name
289 is_in_scope_x_y = (tag_name, scope, scope2) ->
291 if t.name is tag_name
298 standard_scopers = { # FIXME these are supposed to be namespace specific
299 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
300 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
301 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
302 'foreignObject': true, 'desc': true, 'title'
304 button_scopers = button: true
305 li_scopers = ol: true, ul: true
306 table_scopers = html: true, table: true, template: true
is_in_scope = (tag_name) ->
  # Scope check for tag_name using the standard_scopers table; the actual
  # search is delegated to is_in_scope_x.
  is_in_scope_x(tag_name, standard_scopers)
is_in_button_scope = (tag_name) ->
  # Scope check for tag_name using two tables, standard_scopers and
  # button_scopers; the actual search is delegated to is_in_scope_x_y.
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers)
is_in_table_scope = (tag_name) ->
  # Scope check for tag_name using the table_scopers table; the actual search
  # is delegated to is_in_scope_x.
  is_in_scope_x(tag_name, table_scopers)
313 is_in_select_scope = (tag_name) ->
315 if t.name is tag_name
317 if t.name isnt 'optgroup' and t.name isnt 'option'
320 # this checks for a particular element, not by name
321 el_is_in_scope = (el) ->
325 if t.name of standard_scopers
329 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
330 # this implementation is structured (mostly) as described at the link above.
331 # capitalized comments are the "labels" described at the link above.
332 reconstruct_active_formatting_elements = ->
333 return if afe.length is 0
334 if afe[0].type is TYPE_MARKER or afe[0] in open_els
339 if i is afe.length - 1
342 if afe[i].type is TYPE_MARKER or afe[i] in open_els
347 el = afe[i].shallow_clone()
353 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
354 # adoption agency algorithm
355 adoption_agency = (subject) ->
356 if open_els[0].name is subject
359 # remove it from the list of active formatting elements (if found)
371 for t, fe_index in afe
372 if t.type is TYPE_MARKER
378 in_body_any_other_end_tag subject
387 # "remove it from the list" must mean afe, since it's not in open_els
388 afe.splice fe_index, 1
390 unless el_is_in_scope fe
393 unless open_els[0] is fe
408 afe.splice fe_index, 1
410 ca = open_els[fe_index + 1] # common ancestor
411 node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
412 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
413 bookmark = new_aaa_bookmark()
416 afe.splice i, 0, bookmark
417 node = last_node = fb
424 node_next = open_els[i + 1]
426 node = node_next ? node_above
427 # TODO make sure node_above gets re-set if/when node is removed from open_els
441 node_above = open_els[i + 1]
445 # 7. Create an element for the token for which the element node
446 # was created, in the HTML namespace, with common ancestor as
447 # the intended parent; replace the entry for node in the list
448 # of active formatting elements with an entry for the new
449 # element, replace the entry for node in the stack of open
450 # elements with an entry for the new element, and let node be
452 new_node = node.shallow_clone()
459 open_els[i] = new_node
462 # 8. If last node is furthest block, then move the
463 # aforementioned bookmark to be immediately after the new node
464 # in the list of active formatting elements.
471 # TODO test: position i gets you "after"?
472 afe.splice i, 0, new_aaa_bookmark()
473 # 9. Insert last node into node, first removing it from its
474 # previous parent node if any.
476 for c, i of last_node.parent.children
478 last_node.parent.children.splice i, 1
479 node.children.push last_node
480 last_node.parent = node
481 # 10. Let last node be node.
483 # 11. Return to the step labeled inner loop.
484 # 14. Insert whatever last node ended up being in the previous step
485 # at the appropriate place for inserting a node, but using common
486 # ancestor as the override target.
487 tree_insert_tag last_node, ca
488 # 15. Create an element for the token for which formatting element
489 # was created, in the HTML namespace, with furthest block as the
491 new_element = fe.shallow_clone()
492 # 16. Take all of the child nodes of furthest block and append them
493 # to the element created in the last step.
494 while fb.children.length
495 t = fb.children.shift()
496 t.parent = new_element
497 new_element.children.push t
498 # 17. Append that new element to furthest block.
499 new_element.parent = fb
500 fb.children.push new_element
501 # 18. Remove formatting element from the list of active formatting
502 # elements, and insert the new element into the list of active
503 # formatting elements at the position of the aforementioned
513 # 19. Remove formatting element from the stack of open elements,
514 # and insert the new element into the stack of open elements
515 # immediately below the position of furthest block in that stack.
522 open_els.splice i, 0, new_element
524 # 20. Jump back to the step labeled outer loop.
526 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
527 # FIXME implement this
528 close_p_if_in_button_scope = ->
529 if open_els[0].name is 'p'
532 #p = find_button_scope 'p'
534 # TODO generate_implied_end_tags except for p tags
535 # TODO parse_error unless open_els[0].name is 'p'
536 # TODO pop stack until 'p' popped
538 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
539 tree_insert_a_character = (t) ->
540 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
541 dest = open_els[0].children
542 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
543 dest[dest.length - 1].text += t.text
547 # FIXME read spec, do this right
548 # FIXME implement the override target thing
549 # note: this assumes it's an open tag
550 tree_insert_tag = (t, override_target = null) ->
551 t.type = TYPE_TAG # not TYPE_OPEN_TAG
552 # convert attributes into a hash
553 while t.attrs_a.length
555 t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
557 for c, i of t.parent.children
559 t.parent.children.splice i, 1
560 # FIXME spec says to do something to figure out what parent should be
563 parent.children.push t
566 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
tree_insert_a_comment = (t) ->
  # Append comment node t to the children of the element at index 0 of
  # open_els. Returns whatever Array::push returns.
  # FIXME the spec's "adjusted insertion location" is not consulted here, so
  # this placement might be wrong.
  target = open_els[0]
  target.children.push t
571 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
572 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
573 for node, i in open_els
575 # FIXME generate implied end tags except those with name==name
576 parse_error() unless i is 0
582 if special_elements[node.name]?
585 tree_in_body = (t) ->
591 when "\t", "\u000a", "\u000c", "\u000d", ' '
592 reconstruct_active_formatting_elements()
593 tree_insert_a_character t
595 reconstruct_active_formatting_elements()
596 tree_insert_a_character t
597 flag_frameset_ok = false
599 tree_insert_a_comment t
606 return if template_tag_is_open()
607 root_attrs = open_els[open_els.length - 1].children
609 root_attrs[k] = v unless root_attrs[k]?
610 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
611 # FIXME also do this for </template> (end tag)
612 return tree_in_head t
619 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
620 close_p_if_in_button_scope()
622 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
623 close_p_if_in_button_scope()
624 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
628 # TODO lots more to implement here
629 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
630 reconstruct_active_formatting_elements()
633 # TODO lots more to implement here
634 else # any other start tag
635 reconstruct_active_formatting_elements()
639 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
640 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
643 unless ok_tags[t.name]?
646 # TODO stack of template insertion modes thing
647 flag_parsing = false # stop parsing
651 unless is_in_scope 'body'
654 # TODO implement parse error and move to tree_after_body
656 unless is_in_scope 'body' # weird, but it's what the spec says
659 # TODO implement parse error and move to tree_after_body, reprocess
660 # TODO lots more close tags to implement here
661 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
662 adoption_agency t.name
663 # TODO lots more close tags to implement here
665 in_body_any_other_end_tag t.name
669 # the functions below implement the tokenizer states described here:
670 # http://www.w3.org/TR/html5/syntax.html#tokenization
672 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
674 switch c = txt.charAt(cur++)
676 return new_text_node tokenize_character_reference()
678 tok_state = tok_state_tag_open
681 return new_text_node c
683 return new_eof_token()
685 return new_text_node c
688 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
689 # not needed: tok_state_character_reference_in_data = ->
690 # just call tokenize_character_reference()
692 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
693 tok_state_tag_open = ->
694 switch c = txt.charAt(cur++)
696 tok_state = tok_state_markup_declaration_open
698 tok_state = tok_state_end_tag_open
701 tok_state = tok_state_bogus_comment
703 if lc_alpha.indexOf(c) > -1
704 tok_cur_tag = new_open_tag c
705 tok_state = tok_state_tag_name
706 else if uc_alpha.indexOf(c) > -1
707 tok_cur_tag = new_open_tag c.toLowerCase()
708 tok_state = tok_state_tag_name
711 tok_state = tok_state_data
712 cur -= 1 # we didn't parse/handle the char after <
713 return new_text_node '<'
716 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
717 tok_state_end_tag_open = ->
718 switch c = txt.charAt(cur++)
721 tok_state = tok_state_data
724 tok_state = tok_state_data
725 return new_text_node '</'
727 if uc_alpha.indexOf(c) > -1
728 tok_cur_tag = new_end_tag c.toLowerCase()
729 tok_state = tok_state_tag_name
730 else if lc_alpha.indexOf(c) > -1
731 tok_cur_tag = new_end_tag c
732 tok_state = tok_state_tag_name
735 tok_state = tok_state_bogus_comment
738 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
739 tok_state_tag_name = ->
740 switch c = txt.charAt(cur++)
741 when "\t", "\n", "\u000c", ' '
742 tok_state = tok_state_before_attribute_name
744 tok_state = tok_state_self_closing_start_tag
746 tok_state = tok_state_data
752 tok_cur_tag.name += "\ufffd"
755 tok_state = tok_state_data
757 if uc_alpha.indexOf(c) > -1
758 tok_cur_tag.name += c.toLowerCase()
760 tok_cur_tag.name += c
763 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
764 tok_state_before_attribute_name = ->
766 switch c = txt.charAt(cur++)
767 when "\t", "\n", "\u000c", ' '
770 tok_state = tok_state_self_closing_start_tag
773 tok_state = tok_state_data
780 when '"', "'", '<', '='
785 tok_state = tok_state_data
787 if uc_alpha.indexOf(c) > -1
788 attr_name = c.toLowerCase()
792 tok_cur_tag.attrs_a.unshift [attr_name, '']
793 tok_state = tok_state_attribute_name
796 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
797 tok_state_attribute_name = ->
798 switch c = txt.charAt(cur++)
799 when "\t", "\n", "\u000c", ' '
800 tok_state = tok_state_after_attribute_name
802 tok_state = tok_state_self_closing_start_tag
804 tok_state = tok_state_before_attribute_value
806 tok_state = tok_state_data
812 tok_cur_tag.attrs_a[0][0] = "\ufffd"
815 tok_cur_tag.attrs_a[0][0] = c
818 tok_state = tok_state_data
820 if uc_alpha.indexOf(c) > -1
821 tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
823 tok_cur_tag.attrs_a[0][0] += c
826 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
827 tok_state_before_attribute_value = ->
828 switch c = txt.charAt(cur++)
829 when "\t", "\n", "\u000c", ' '
832 tok_state = tok_state_attribute_value_double_quoted
834 tok_state = tok_state_attribute_value_unquoted
837 tok_state = tok_state_attribute_value_single_quoted
840 tok_cur_tag.attrs_a[0][1] += "\ufffd"
841 tok_state = tok_state_attribute_value_unquoted
844 tok_state = tok_state_data
850 tok_state = tok_state_data
852 tok_cur_tag.attrs_a[0][1] += c
853 tok_state = tok_state_attribute_value_unquoted
856 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
857 tok_state_attribute_value_double_quoted = ->
858 switch c = txt.charAt(cur++)
860 tok_state = tok_state_after_attribute_value_quoted
862 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
865 tok_cur_tag.attrs_a[0][1] += "\ufffd"
868 tok_state = tok_state_data
870 tok_cur_tag.attrs_a[0][1] += c
873 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
874 tok_state_attribute_value_single_quoted = ->
875 switch c = txt.charAt(cur++)
877 tok_state = tok_state_after_attribute_value_quoted
879 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
882 tok_cur_tag.attrs_a[0][1] += "\ufffd"
885 tok_state = tok_state_data
887 tok_cur_tag.attrs_a[0][1] += c
890 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
891 tok_state_attribute_value_unquoted = ->
892 switch c = txt.charAt(cur++)
893 when "\t", "\n", "\u000c", ' '
894 tok_state = tok_state_before_attribute_name
896 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
898 tok_state = tok_state_data
903 tok_cur_tag.attrs_a[0][1] += "\ufffd"
906 tok_state = tok_state_data
908 # Parse Error if ', <, = or ` (backtick)
909 tok_cur_tag.attrs_a[0][1] += c
912 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
913 tok_state_after_attribute_value_quoted = ->
914 switch c = txt.charAt(cur++)
915 when "\t", "\n", "\u000c", ' '
916 tok_state = tok_state_before_attribute_name
918 tok_state = tok_state_self_closing_start_tag
920 tok_state = tok_state_data
926 tok_state = tok_state_data
929 tok_state = tok_state_before_attribute_name
930 cur -= 1 # we didn't handle that char
933 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
934 # Don't set this as a state, just call it
935 # returns a string (NOT a text node)
936 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
939 switch c = txt.charAt(cur)
940 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
941 # explicitly not a parse error
944 # there has to be "one or more" alnums between & and ; to be a parse error
947 if cur + 1 >= txt.length
949 if txt.charAt(cur + 1).toLowerCase() is 'x'
958 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
962 if txt.charAt(start + i) is ';'
964 # FIXME This is supposed to generate parse errors for some chars
965 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
972 if alnum.indexOf(txt.charAt(cur + i)) is -1
975 # exit early, because parse_error() below needs at least one alnum
977 if txt.charAt(cur + i) is ';'
978 i += 1 # include ';' terminator in value
979 decoded = decode_named_char_ref txt.substr(cur, i)
986 # no ';' terminator (only legacy char refs)
988 for i in [2..max] # no prefix matches, so ok to check shortest first
989 c = legacy_char_refs[txt.substr(cur, i)]
992 if txt.charAt(cur + i) is '='
993 # "because some legacy user agents will
994 # misinterpret the markup in those cases"
997 if alnum.indexOf(txt.charAt(cur + i)) > -1
998 # this makes attributes forgiving about url args
1000 # ok, and besides the weird exceptions for attributes...
1001 # return the matching char
1002 cur += i # consume entity chars
1003 parse_error() # because no terminating ";"
1007 return # never reached
1009 # tree constructor initialization
1010 # see comments on TYPE_TAG/etc for the structure of this data
1011 tree = new Node TYPE_TAG, name: 'html'
1013 tree_state = tree_in_body
1014 flag_frameset_ok = true
1016 afe = [] # active formatting elements
1018 # tokenizer initialization
1019 tok_state = tok_state_data
1026 return tree.children
1028 # everything below is tests on the above
1029 test_equals = (description, output, expected_output) ->
1030 if output is expected_output
1031 console.log "passed." # don't say name, so smart consoles can merge all of these
1033 console.log "FAILED: \"#{description}\""
1034 console.log " Expected: #{expected_output}"
1035 console.log " Actual: #{output}"
1036 test_parser = (args) ->
1040 parsed = parse_html args.html, errors_cb
1046 serialized += t.serialize()
1047 if serialized isnt args.expected or parse_errors.length isnt args.errors
1048 console.log "FAILED: \"#{args.name}\""
1050 console.log "passed \"#{args.name}\""
1051 if serialized isnt args.expected
1052 console.log " Input: #{args.html}"
1053 console.log " Correct: #{args.expected}"
1054 console.log " Output: #{serialized}"
1055 if parse_errors.length isnt args.errors
1056 console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1058 test_parser name: "empty", \
1062 test_parser name: "just text", \
1064 expected: 'text:"abc"',
1066 test_parser name: "named entity", \
1068 expected: 'text:"a&1234"',
1070 test_parser name: "broken named character references", \
1071 html: "1&2&&3&aabbcc;",
1072 expected: 'text:"1&2&&3&aabbcc;"',
1074 test_parser name: "numbered entity overrides", \
1075 html: "1€€ ƒ",
1076 expected: 'text:"1€€ ƒ"',
1078 test_parser name: "open tag", \
1079 html: "foo<span>bar",
1080 expected: 'text:"foo",tag:"span",{},[text:"bar"]',
1081 errors: 1 # no close tag
1082 test_parser name: "open tag with attributes", \
1083 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1084 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
1085 errors: 1 # no close tag
1086 test_parser name: "open tag with attributes of various quotings", \
1087 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1088 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
1089 errors: 1 # no close tag
1090 test_parser name: "attribute entity exceptions dq", \
1091 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
1092 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1093 errors: 2 # no close tag, &= in attr
1094 test_parser name: "attribute entity exceptions sq", \
1095 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
1096 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1097 errors: 2 # no close tag, &= in attr
1098 test_parser name: "attribute entity exceptions uq", \
1099 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
1100 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1101 errors: 2 # no close tag, &= in attr
1102 test_parser name: "matching closing tags", \
1103 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1104 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
1106 test_parser name: "missing closing tag inside", \
1107 html: "foo<div>bar<span>baz</div>qux",
1108 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
1109 errors: 1 # close tag mismatch
1110 test_parser name: "mis-matched closing tags", \
1111 html: "<span>12<div>34</span>56</div>78",
1112 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
1113 errors: 2 # misplaced </span>, no </span> at the end
1114 test_parser name: "mis-matched formatting elements", \
1115 html: "12<b>34<i>56</b>78</i>90",
1116 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
1117 errors: 1 # no idea how many their should be
1118 test_parser name: "crazy formatting elements test", \
1119 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1120 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1121 # firefox does this:
1122 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
1123 errors: 6 # no idea how many there should be