X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=a6d501fdb09d5b933d062abb3a751c7675cf088a;hb=55d296a75a0471af76f0dc05494e9eb98567d314;hp=0e79a1d5499b24c8b79834148cdcb9bfa74d4821;hpb=594fc20cbc397942900b2f8fdcf0d759af23d60d;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 0e79a1d..a6d501f 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -15,32 +15,57 @@ # along with this program. If not, see . -# This file implements a parser for html snippets, meant to be used by a +# This file implements a thorough parser for html5, meant to be used by a # WYSIWYG editor. # The implementation is a pretty direct implementation of the parsing algorithm # described here: -# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream # -# Deviations from that spec: +# http://www.w3.org/TR/html5/syntax.html # -# Purposeful: search this file for "WTAG" +# except for some places marked "WHATWG" that are implemented as described here: # -# Not finished yet: search this file for "fixfull", "TODO" and "FIXME" +# https://html.spec.whatwg.org/multipage/syntax.html +# +# This code passes all of the tests in the .dat files at: +# +# https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction + + +################################## +## how to use this code +################################## +# +# See README.md for how to run this file in the browser or in node.js. +# +# This file exports a single useful function: parse_tml, and some constants +# (see the bottom of this file for those.) +# +# Call it like this: +# +# wheic.parse_html("

") +# +# Or, if you don't want /etc, do this: +# +# wheic.parse_html("
hi
", {fragment: "body"}) +# +# return value is an array of Nodes, see "class Node" below. +# This code is a work in progress, eg try search this file for "fixfull", +# "TODO" and "FIXME" -# stacks/lists + +# Notes: stacks/lists # -# the spec uses a many different words do indicate which ends of lists/stacks -# they are talking about (and relative movement within the lists/stacks). This -# section splains. I'm implementing "lists" (afe and open_els) the same way -# (both as stacks) +# Jason was frequently confused by the terminology used to refer to different +# parts of the stacks and lists in the spec, so he made this chart to help keep +# his head straight: # # stacks grow downward (current element is index=0) # # example: open_els = [a, b, c, d, e, f, g] # -# "grows downwards" means it's visualized like this: (index: el, names) +# "grows downwards" means it's visualized like this: (index: el "names") # # 6: g "start of the list", "topmost", "first" # 5: f @@ -50,13 +75,13 @@ # 1: b # 0: a "end of the list", "current node", "bottommost", "last" - -# browser -# note: to get this to run outside a browser, you'll have to write a native -# implementation of decode_named_char_ref() -unless module?.exports? +if (typeof module) isnt 'undefined' and module.exports? + context = 'module' + exports = module.exports +else + context = 'browser' window.wheic = {} - module = exports: window.wheic + exports = window.wheic from_code_point = (x) -> if String.fromCodePoint? @@ -84,14 +109,23 @@ NS_HTML = 1 NS_MATHML = 2 NS_SVG = 3 +# quirks mode constants +QUIRKS_NO = 1 +QUIRKS_LIMITED = 2 +QUIRKS_YES = 3 + +# queue up debug logs, so eg they can be shown only for tests that fail g_debug_log = [] debug_log_reset = -> g_debug_log = [] + return debug_log = (str) -> g_debug_log.push str + return debug_log_each = (cb) -> for str in g_debug_log cb str + return prev_node_id = 0 class Node @@ -112,58 +146,16 @@ class Node @id = "#{++prev_node_id}" acknowledge_self_closing: -> if @token? - @token.flag 'did_self_close' + @token.flag 'did_self_close', true else @flag 'did_self_close', true + return flag: (key, value = null) -> if value? @flags[key] = value else return @flags[key] - serialize: (shallow = false, show_ids = false) -> # for unit tests - ret = '' - switch @type - when TYPE_TAG - ret += 'tag:' - ret += JSON.stringify @name - ret += ',' - if show_ids - ret += "##{@id}," - if shallow - break - attr_keys = [] - for k of @attrs - attr_keys.push k - attr_keys.sort() - ret += '{' - sep = '' - for k in attr_keys - ret += sep - sep = ',' - ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}" - ret += '},[' - sep = '' - for c in @children - ret += sep - sep = ',' - ret += c.serialize shallow, show_ids - ret += ']' - when TYPE_TEXT - ret += 'text:' - ret += JSON.stringify @text - when TYPE_COMMENT - ret += 'comment:' - ret += JSON.stringify @text - when TYPE_DOCTYPE - ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}" - when TYPE_AFE_MARKER - ret += 'marker' - when TYPE_AAA_BOOKMARK - ret += 'aaa_bookmark' - else - ret += 'unknown:' - console.log "unknown: #{JSON.stringify @}" # backtrace is just as well - return ret + return # helpers: (only take args that are normally known when parser creates nodes) new_open_tag = (name) -> @@ -249,6 +241,64 @@ unicode_fixes[0x9C] = "\u0153" unicode_fixes[0x9E] = "\u017E" unicode_fixes[0x9F] = "\u0178" +quirks_yes_pi_prefixes = [ + "+//silmaril//dtd html pro v0r11 19970101//" + "-//as//dtd html 3.0 aswedit + extensions//" + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//" + "-//ietf//dtd html 2.0 level 1//" + "-//ietf//dtd html 2.0 level 2//" + "-//ietf//dtd html 2.0 strict level 1//" + "-//ietf//dtd html 2.0 strict level 2//" + "-//ietf//dtd html 2.0 strict//" + "-//ietf//dtd html 2.0//" + "-//ietf//dtd html 2.1e//" + "-//ietf//dtd html 3.0//" + "-//ietf//dtd html 3.2 final//" + "-//ietf//dtd html 3.2//" + "-//ietf//dtd html 3//" + "-//ietf//dtd html level 0//" + "-//ietf//dtd html level 1//" + "-//ietf//dtd html level 2//" + "-//ietf//dtd html level 3//" + "-//ietf//dtd html strict level 0//" + "-//ietf//dtd html strict level 1//" + "-//ietf//dtd html strict level 2//" + "-//ietf//dtd html strict level 3//" + "-//ietf//dtd html strict//" + "-//ietf//dtd html//" + "-//metrius//dtd metrius presentational//" + "-//microsoft//dtd internet explorer 2.0 html strict//" + "-//microsoft//dtd internet explorer 2.0 html//" + "-//microsoft//dtd internet explorer 2.0 tables//" + "-//microsoft//dtd internet explorer 3.0 html strict//" + "-//microsoft//dtd internet explorer 3.0 html//" + "-//microsoft//dtd internet explorer 3.0 tables//" + "-//netscape comm. corp.//dtd html//" + "-//netscape comm. corp.//dtd strict html//" + "-//o'reilly and associates//dtd html 2.0//" + "-//o'reilly and associates//dtd html extended 1.0//" + "-//o'reilly and associates//dtd html extended relaxed 1.0//" + "-//sq//dtd html 2.0 hotmetal + extensions//" + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//" + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//" + "-//spyglass//dtd html 2.0 extended//" + "-//sun microsystems corp.//dtd hotjava html//" + "-//sun microsystems corp.//dtd hotjava strict html//" + "-//w3c//dtd html 3 1995-03-24//" + "-//w3c//dtd html 3.2 draft//" + "-//w3c//dtd html 3.2 final//" + "-//w3c//dtd html 3.2//" + "-//w3c//dtd html 3.2s draft//" + "-//w3c//dtd html 4.0 frameset//" + "-//w3c//dtd html 4.0 transitional//" + "-//w3c//dtd html experimental 19960712//" + "-//w3c//dtd html experimental 970421//" + "-//w3c//dtd w3 html//" + "-//w3o//dtd w3 html 3.0//" + "-//webtechs//dtd mozilla html 2.0//" + "-//webtechs//dtd mozilla html//" +] + # These are the character references that don't need a terminating semicolon # min length: 2, max: 6, none are a prefix of any other. legacy_char_refs = { @@ -342,7 +392,7 @@ special_elements = { img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML, listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, - menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these + menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML, @@ -468,7 +518,7 @@ svg_attribute_fixes = { diffuseconstant: 'diffuseConstant' edgemode: 'edgeMode' externalresourcesrequired: 'externalResourcesRequired' - filterres: 'filterRes' + # WHATWG removes this: filterres: 'filterRes' filterunits: 'filterUnits' glyphref: 'glyphRef' gradienttransform: 'gradientTransform' @@ -553,27 +603,31 @@ adjust_foreign_attributes = (t) -> # decode_named_char_ref() # -# The list of named character references is _huge_ so ask the browser to decode -# for us instead of wasting bandwidth/space on including the table here. -# -# Pass without the "&" but with the ";" examples: -# for "&" pass "amp;" -# for "′" pass "x2032;" -g_dncr = { - cache: {} - textarea: document.createElement('textarea') -} -# TODO test this in IE8 +# The list of named character references is _huge_ so if we're running in a +# browser, we get the browser to decode them, rather than increasing the code +# size to include the table. +if context is 'module' + _decode_named_char_ref = require './html5-named-entities.coffee' +else + # TODO test this in IE8 + decode_named_char_ref_el = document.createElement('textarea') + _decode_named_char_ref = (txt) -> + txt = "&#{txt};" + decode_named_char_ref_el.innerHTML = txt + decoded = decode_named_char_ref_el.value + return null if decoded is txt + return decoded +# Pass the name of a named entity _that has a terminating semicolon_ +# Entities without terminating semicolons should use legacy_char_refs[] +# Do not include the "&" or ";" in your argument, eg pass "alpha" +decode_named_char_ref_cache = {} decode_named_char_ref = (txt) -> - txt = "&#{txt}" - decoded = g_dncr.cache[txt] + decoded = decode_named_char_ref_cache[txt] return decoded if decoded? - g_dncr.textarea.innerHTML = txt - decoded = g_dncr.textarea.value - return null if decoded is txt - return g_dncr.cache[txt] = decoded + decoded = _decode_named_char_ref txt + return decode_named_char_ref_cache[txt] = decoded -parse_html = (args) -> +parse_html = (args_html, args = {}) -> txt = null cur = null # index of next char in txt to be parsed # declare doc and tokenizer variables so they're in scope below @@ -598,61 +652,77 @@ parse_html = (args) -> stop_parsing = -> flag_parsing = false + return parse_error = -> if args.error_cb? args.error_cb cur else console.log "Parse error at character #{cur} of #{txt.length}" + return + # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements + # "Noah's Ark clause" but with three afe_push = (new_el) -> matches = 0 for el, i in afe + if el.type is TYPE_AFE_MARKER + break if el.name is new_el.name and el.namespace is new_el.namespace + attrs_match = true for k, v of el.attrs - continue unless new_el.attrs[k] is v - for k, v of new_el.attrs - continue unless el.attrs[k] is v - matches += 1 - if matches is 3 - afe.splice i, 1 - break + unless new_el.attrs[k] is v + attrs_match = false + break + if attrs_match + for k, v of new_el.attrs + unless el.attrs[k] is v + attrs_match = false + break + if attrs_match + matches += 1 + if matches is 3 + afe.splice i, 1 + break afe.unshift new_el + return + afe_push_marker = -> afe.unshift new_afe_marker() + return # the functions below impliment the Tree Contstruction algorithm # http://www.w3.org/TR/html5/syntax.html#tree-construction # But first... the helpers template_tag_is_open = -> - for t in open_els - if t.name is 'template' and t.namespace is NS_HTML + for el in open_els + if el.name is 'template' and el.namespace is NS_HTML return true return false is_in_scope_x = (tag_name, scope, namespace) -> - for t in open_els - if t.name is tag_name and (namespace is null or namespace is t.namespace) + for el in open_els + if el.name is tag_name and (namespace is null or namespace is el.namespace) return true - if scope[t.name] is t.namespace + if scope[el.name] is el.namespace return false return false is_in_scope_x_y = (tag_name, scope, scope2, namespace) -> - for t in open_els - if t.name is tag_name and (namespace is null or namespace is t.namespace) + for el in open_els + if el.name is tag_name and (namespace is null or namespace is el.namespace) return true - if scope[t.name] is t.namespace + if scope[el.name] is el.namespace return false - if scope2[t.name] is t.namespace + if scope2[el.name] is el.namespace return false return false standard_scopers = { applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML, td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML, - template: NS_HTML, mi: NS_MATHML, + template: NS_HTML, - mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML, - 'annotation-xml': NS_MATHML, + mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, + mtext: NS_MATHML, 'annotation-xml': NS_MATHML, foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG } @@ -743,8 +813,8 @@ parse_html = (args) -> loop if node_i is open_els.length - 1 last = true - # fixfull (fragment case) - + if flag_fragment_parsing + node = context_element # 4. If node is a select element, run these substeps: if node.name is 'select' and node.namespace is NS_HTML # 1. If last is true, jump to the step below labeled done. @@ -853,6 +923,7 @@ parse_html = (args) -> node_i += 1 node = open_els[node_i] # 19. Return to the step labeled loop. + return # 8.2.3.2 @@ -884,6 +955,7 @@ parse_html = (args) -> afe[i] = el break if i is 0 i -= 1 # Advance + return # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm # adoption agency algorithm @@ -892,20 +964,43 @@ parse_html = (args) -> # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements adoption_agency = (subject) -> - debug_log "adoption_agency()" - debug_log "tree: #{serialize_els doc.children, false, true}" - debug_log "open_els: #{serialize_els open_els, true, true}" - debug_log "afe: #{serialize_els afe, true, true}" +# this block implements tha W3C spec +# # 1. If the current node is an HTML element whose tag name is subject, +# # then run these substeps: +# # +# # 1. Let element be the current node. +# # +# # 2. Pop element off the stack of open elements. +# # +# # 3. If element is also in the list of active formatting elements, +# # remove the element from the list. +# # +# # 4. Abort the adoption agency algorithm. +# if open_els[0].name is subject and open_els[0].namespace is NS_HTML +# el = open_els.shift() +# # remove it from the list of active formatting elements (if found) +# for t, i in afe +# if t is el +# afe.splice i, 1 +# break +# return +# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm + # If the current node is an HTML element whose tag name is subject, and + # the current node is not in the list of active formatting elements, + # then pop the current node off the stack of open elements, and abort + # these steps. if open_els[0].name is subject and open_els[0].namespace is NS_HTML - el = open_els[0] - open_els.shift() # remove it from the list of active formatting elements (if found) - for t, i in afe - if t is el - afe.splice i, 1 + in_afe = false + for el, i in afe + if el is open_els[0] + in_afe = true break - debug_log "aaa: starting off with subject on top of stack, exiting" - return + unless in_afe + open_els.shift() + return + # fall through +# END WHATWG outer = 0 loop if outer >= 8 @@ -925,7 +1020,6 @@ parse_html = (args) -> # If there is no such element, then abort these steps and instead # act as described in the "any other end tag" entry above. if fe is null - debug_log "aaa: fe not found in afe" in_body_any_other_end_tag subject return # 6. If formatting element is not in the stack of open elements, @@ -937,7 +1031,6 @@ parse_html = (args) -> in_open_els = true break unless in_open_els - debug_log "aaa: fe not found in open_els" parse_error() # "remove it from the list" must mean afe, since it's not in open_els afe.splice fe_of_afe, 1 @@ -946,7 +1039,6 @@ parse_html = (args) -> # the element is not in scope, then this is a parse error; abort # these steps. unless el_is_in_scope fe - debug_log "aaa: fe not in scope" parse_error() return # 8. If formatting element is not the current node, this is a parse @@ -972,7 +1064,6 @@ parse_html = (args) -> # formatting element from the list of active formatting elements, # and finally abort these steps. if fb is null - debug_log "aaa: no fb" loop t = open_els.shift() if t is fe @@ -1004,21 +1095,12 @@ parse_html = (args) -> node_next = open_els[i + 1] break node = node_next ? node_above - debug_log "inner loop #{inner}" - debug_log "tree: #{serialize_els doc.children, false, true}" - debug_log "open_els: #{serialize_els open_els, true, true}" - debug_log "afe: #{serialize_els afe, true, true}" - debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}" - debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}" - debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}" - debug_log "node: #{node.serialize true, true}" # TODO make sure node_above gets re-set if/when node is removed from open_els # 4. If node is formatting element, then go to the next step in # the overall algorithm. if node is fe break - debug_log "the meat" # 5. If inner loop counter is greater than three and node is in # the list of active formatting elements, then remove node from # the list of active formatting elements. @@ -1027,23 +1109,19 @@ parse_html = (args) -> if t is node if inner > 3 afe.splice i, 1 - debug_log "max out inner" else node_in_afe = true - debug_log "in afe" break # 6. If node is not in the list of active formatting elements, # then remove node from the stack of open elements and then go # back to the step labeled inner loop. unless node_in_afe - debug_log "not in afe" for t, i in open_els if t is node node_above = open_els[i + 1] open_els.splice i, 1 break continue - debug_log "the bones" # 7. create an element for the token for which the element node # was created, in the HTML namespace, with common ancestor as # the intended parent; replace the entry for node in the list @@ -1055,13 +1133,11 @@ parse_html = (args) -> for t, i in afe if t is node afe[i] = new_node - debug_log "replaced in afe" break for t, i in open_els if t is node node_above = open_els[i + 1] open_els[i] = new_node - debug_log "replaced in open_els" break node = new_node # 8. If last node is furthest block, then move the @@ -1071,29 +1147,23 @@ parse_html = (args) -> for t, i in afe if t is bookmark afe.splice i, 1 - debug_log "removed bookmark" break for t, i in afe if t is node # "after" means lower afe.splice i, 0, bookmark # "after as <- - debug_log "placed bookmark after node" - debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}" break # 9. Insert last node into node, first removing it from its # previous parent node if any. if last_node.parent? - debug_log "last_node has parent" for c, i in last_node.parent.children if c is last_node - debug_log "removing last_node from parent" last_node.parent.children.splice i, 1 break node.children.push last_node last_node.parent = node # 10. Let last node be node. last_node = node - debug_log "at last" # 11. Return to the step labeled inner loop. # 14. Insert whatever last node ended up being in the previous step # at the appropriate place for inserting a node, but using common @@ -1104,36 +1174,15 @@ parse_html = (args) -> # * last_node is fb # * last_node is still in the tree (not a duplicate) if last_node.parent? - debug_log "FEFIRST? last_node has parent" for c, i in last_node.parent.children if c is last_node - debug_log "removing last_node from parent" last_node.parent.children.splice i, 1 break - - debug_log "after aaa inner loop" - debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}" - debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}" - debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}" - debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}" - debug_log "tree: #{serialize_els doc.children, false, true}" - - debug_log "insert" - - # can't use standard insert token thing, because it's already in # open_els and must stay at it's current position in open_els dest = adjusted_insertion_location ca dest[0].children.splice dest[1], 0, last_node last_node.parent = dest[0] - - - debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}" - debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}" - debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}" - debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}" - debug_log "tree: #{serialize_els doc.children, false, true}" - # 15. Create an element for the token for which formatting element # was created, in the HTML namespace, with furthest block as the # intended parent. @@ -1171,11 +1220,7 @@ parse_html = (args) -> open_els.splice i, 0, new_element break # 20. Jump back to the step labeled outer loop. - debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}" - debug_log "tree: #{serialize_els doc.children, false, true}" - debug_log "open_els: #{serialize_els open_els, true, true}" - debug_log "afe: #{serialize_els afe, true, true}" - debug_log "AAA DONE" + return # http://www.w3.org/TR/html5/syntax.html#close-a-p-element close_p_element = -> @@ -1186,9 +1231,11 @@ parse_html = (args) -> el = open_els.shift() if el.name is 'p' and el.namespace is NS_HTML return + return close_p_if_in_button_scope = -> if is_in_button_scope 'p', NS_HTML close_p_element() + return # http://www.w3.org/TR/html5/syntax.html#insert-a-character # aka insert_a_character = (t) -> @@ -1201,7 +1248,7 @@ parse_html = (args) -> prev.text += t.text return dest[0].children.splice dest[1], 0, t - + return # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction process_token = (t) -> @@ -1213,7 +1260,7 @@ parse_html = (args) -> ins_mode t return if is_mathml_text_integration_point(acn) - if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark') + if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark') ins_mode t return if t.type is TYPE_TEXT @@ -1357,13 +1404,14 @@ parse_html = (args) -> return el # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element insert_html_element = (token) -> - insert_foreign_element token, NS_HTML + return insert_foreign_element token, NS_HTML # http://www.w3.org/TR/html5/syntax.html#insert-a-comment # position should be [node, index_within_children] insert_comment = (t, position = null) -> position ?= adjusted_insertion_location() position[0].children.splice position[1], 0, t + return # 8.2.5.2 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm @@ -1372,23 +1420,55 @@ parse_html = (args) -> tok_state = tok_state_rawtext original_ins_mode = ins_mode ins_mode = ins_mode_text + return parse_generic_rcdata_text = (t) -> insert_html_element t tok_state = tok_state_rcdata original_ins_mode = ins_mode ins_mode = ins_mode_text + return # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags generate_implied_end_tags = (except = null) -> while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except open_els.shift() + return # 8.2.5.4 The rules for parsing tokens in HTML content # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml # 8.2.5.4.1 The "initial" insertion mode # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode + is_quirks_yes_doctype = (t) -> + if t.flag 'force-quirks' + return true + if t.name isnt 'html' + return true + if t.public_identifier? + pi = t.public_identifier.toLowerCase() + for p in quirks_yes_pi_prefixes + if pi.substr(0, p.length) is p + return true + if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html' + return true + if t.system_identifier? + if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' + return true + else if t.public_identifier? + # already did this: pi = t.public_identifier.toLowerCase() + if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' + return true + return false + is_quirks_limited_doctype = (t) -> + if t.public_identifier? + pi = t.public_identifier.toLowerCase() + if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//' + return true + if t.system_identifier? + if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' + return true + return false ins_mode_initial = (t) -> if is_space_tok t return @@ -1397,13 +1477,20 @@ parse_html = (args) -> doc.children.push t return if t.type is TYPE_DOCTYPE - # FIXME check identifiers, set quirks, etc - # fixfull + # fixfull syntax error from first paragraph and following bullets + # fixfull set doc.doctype + # fixfull is the "not an iframe srcdoc" thing relevant? + if is_quirks_yes_doctype t + doc.flag 'quirks mode', QUIRKS_YES + else if is_quirks_limited_doctype t + doc.flag 'quirks mode', QUIRKS_LIMITED doc.children.push t ins_mode = ins_mode_before_html return # Anything else - #fixfull (iframe, quirks) + # fixfull not iframe srcdoc? + parse_error() + doc.flag 'quirks mode', QUIRKS_YES ins_mode = ins_mode_before_html process_token t return @@ -1421,6 +1508,7 @@ parse_html = (args) -> if t.type is TYPE_START_TAG and t.name is 'html' el = token_to_element t, NS_HTML, doc doc.children.push el + el.document = doc open_els.unshift(el) # fixfull (big paragraph in spec about manifest, fragment, urls, etc) ins_mode = ins_mode_before_head @@ -1432,9 +1520,9 @@ parse_html = (args) -> parse_error() return # Anything else - html_tok = new_open_tag 'html' - el = token_to_element html_tok, NS_HTML, doc + el = token_to_element new_open_tag('html'), NS_HTML, doc doc.children.push el + el.document = doc open_els.unshift el # ?fixfull browsing context ins_mode = ins_mode_before_head @@ -1466,17 +1554,18 @@ parse_html = (args) -> parse_error() return # Anything else - head_tok = new_open_tag 'head' - el = insert_html_element head_tok + el = insert_html_element new_open_tag 'head' head_element_pointer = el ins_mode = ins_mode_in_head process_token t + return # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control open_els.shift() # spec says this will be a 'head' node ins_mode = ins_mode_after_head process_token t + return ins_mode_in_head = (t) -> if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ') insert_character t @@ -1555,6 +1644,7 @@ parse_html = (args) -> parse_error() return ins_mode_in_head_else t + return # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript ins_mode_in_head_noscript_else = (t) -> @@ -1562,6 +1652,7 @@ parse_html = (args) -> open_els.shift() ins_mode = ins_mode_in_head process_token t + return ins_mode_in_head_noscript = (t) -> if t.type is TYPE_DOCTYPE parse_error() @@ -1586,8 +1677,6 @@ parse_html = (args) -> ins_mode_in_head_noscript_else t return - - # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode ins_mode_after_head_else = (t) -> body_tok = new_open_tag 'body' @@ -1621,11 +1710,10 @@ parse_html = (args) -> parse_error() open_els.unshift head_element_pointer ins_mode_in_head t - for el, i of open_els + for el, i in open_els if el is head_element_pointer open_els.splice i, 1 return - console.log "warning: 23904 couldn't find head element in open_els" return if t.type is TYPE_END_TAG and t.name is 'template' ins_mode_in_head t @@ -1638,20 +1726,27 @@ parse_html = (args) -> return # Anything else ins_mode_after_head_else t + return # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it - for el, i in open_els - if el.name is name and el.namespace is NS_HTML + node = open_els[0] + loop + if node.name is name and node.namespace is NS_HTML generate_implied_end_tags name # arg is exception - parse_error() unless i is 0 - while i >= 0 - open_els.shift() - i -= 1 - return - if special_elements[el.name] is el.namespace + unless node is open_els[0] + parse_error() + loop + el = open_els.shift() + if el is node + return + if special_elements[node.name] is node.namespace parse_error() return + for el, i in open_els + if node is el + node = open_els[i + 1] + break return ins_mode_in_body = (t) -> if t.type is TYPE_TEXT and t.text is "\u0000" @@ -1676,7 +1771,7 @@ parse_html = (args) -> parse_error() return if template_tag_is_open() root_attrs = open_els[open_els.length - 1].attrs - for a of t.attrs_a + for a in t.attrs_a root_attrs[a[0]] = a[1] unless root_attrs[a[0]]? return @@ -1691,7 +1786,7 @@ parse_html = (args) -> return unless second.name is 'body' return if template_tag_is_open() flag_frameset_ok = false - for a of t.attrs_a + for a in t.attrs_a second.attrs[a[0]] = a[1] unless second.attrs[a[0]]? return if t.type is TYPE_START_TAG and t.name is 'frameset' @@ -1779,11 +1874,7 @@ parse_html = (args) -> if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing') close_p_if_in_button_scope() insert_html_element t - # spec: If the next token is a "LF" (U+000A) character token, then - # ignore that token and move on to the next one. (Newlines at the - # start of pre blocks are ignored as an authoring convenience.) - if txt.charAt(cur) is "\u000a" # FIXME check for crlf? - cur += 1 + eat_next_token_if_newline() flag_frameset_ok = false return if t.type is TYPE_START_TAG and t.name is 'form' @@ -1978,6 +2069,10 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and t.name is 'nobr' reconstruct_afe() + if is_in_scope 'nobr', NS_HTML + parse_error() + adoption_agency 'nobr' + reconstruct_afe() el = insert_html_element t afe_push el return @@ -2004,14 +2099,16 @@ parse_html = (args) -> clear_afe_to_marker() return if t.type is TYPE_START_TAG and t.name is 'table' - close_p_if_in_button_scope() # fixfull quirksmode thing + unless doc.flag('quirks mode') is QUIRKS_YES + close_p_if_in_button_scope() # test insert_html_element t flag_frameset_ok = false ins_mode = ins_mode_in_table return if t.type is TYPE_END_TAG and t.name is 'br' parse_error() - t.type is TYPE_START_TAG + # W3C: t.type = TYPE_START_TAG + t = new_open_tag 'br' # WHATWG # fall through if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr') reconstruct_afe() @@ -2028,7 +2125,8 @@ parse_html = (args) -> unless is_input_hidden_tok t flag_frameset_ok = false return - if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track') + if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track') + # WHATWG adds 'menuitem' for this block insert_html_element t open_els.shift() t.acknowledge_self_closing() @@ -2088,8 +2186,7 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and t.name is 'textarea' insert_html_element t - if txt.charAt(cur) is "\u000a" # FIXME check for crlf? - cur += 1 + eat_next_token_if_newline() tok_state = tok_state_rcdata original_ins_mode = ins_mode flag_frameset_ok = false @@ -2138,7 +2235,7 @@ parse_html = (args) -> # parse_error() # insert_html_element t # return -# below implements the WATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody +# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc') if is_in_scope 'ruby', NS_HTML generate_implied_end_tags() @@ -2153,7 +2250,7 @@ parse_html = (args) -> parse_error() insert_html_element t return -# end WATWG chunk +# end WHATWG chunk if t.type is TYPE_START_TAG and t.name is 'math' reconstruct_afe() adjust_mathml_attributes t @@ -2207,7 +2304,7 @@ parse_html = (args) -> open_els.shift() ins_mode = original_ins_mode return - console.log 'warning: end of ins_mode_text reached' + return # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization @@ -2308,6 +2405,7 @@ parse_html = (args) -> ins_mode_in_body t else ins_mode_in_table_else t + return # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext @@ -2334,6 +2432,7 @@ parse_html = (args) -> pending_table_character_tokens = [] ins_mode = original_ins_mode process_token t + return # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption ins_mode_in_caption = (t) -> @@ -2369,6 +2468,7 @@ parse_html = (args) -> return # Anything else ins_mode_in_body t + return # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup ins_mode_in_column_group = (t) -> @@ -2457,6 +2557,7 @@ parse_html = (args) -> return # Anything else ins_mode_in_table t + return # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr ins_mode_in_row = (t) -> @@ -2498,6 +2599,7 @@ parse_html = (args) -> return # Anything else ins_mode_in_table t + return # http://www.w3.org/TR/html5/syntax.html#close-the-cell close_the_cell = -> @@ -2510,6 +2612,7 @@ parse_html = (args) -> break clear_afe_to_marker() ins_mode = ins_mode_in_row + return # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd ins_mode_in_cell = (t) -> @@ -2553,6 +2656,7 @@ parse_html = (args) -> return # Anything Else ins_mode_in_body t + return # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect ins_mode_in_select = (t) -> @@ -2584,7 +2688,7 @@ parse_html = (args) -> insert_html_element t return if t.type is TYPE_END_TAG and t.name is 'optgroup' - if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML + if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML open_els.shift() if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML @@ -2620,7 +2724,7 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea') parse_error() - if is_in_select_scope 'select', NS_HTML + unless is_in_select_scope 'select', NS_HTML return loop el = open_els.shift() @@ -2719,6 +2823,7 @@ parse_html = (args) -> template_ins_modes.shift() reset_ins_mode() process_token t + return # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody ins_mode_after_body = (t) -> @@ -2726,7 +2831,8 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_COMMENT - insert_comment t, [open_els[0], open_els[0].children.length] + first = open_els[open_els.length - 1] + insert_comment t, [first, first.children.length] return if t.type is TYPE_DOCTYPE parse_error() @@ -2747,6 +2853,7 @@ parse_html = (args) -> parse_error() ins_mode = ins_mode_in_body process_token t + return # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset ins_mode_in_frameset = (t) -> @@ -2805,7 +2912,7 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_END_TAG and t.name is 'html' - insert_mode = ins_mode_after_after_frameset + ins_mode = ins_mode_after_after_frameset return if t.type is TYPE_START_TAG and t.name is 'noframes' ins_mode_in_head t @@ -2872,7 +2979,7 @@ parse_html = (args) -> adjust_svg_attributes t adjust_foreign_attributes t insert_foreign_element t, acn.namespace - if t.flag 'self-closing' # FIXME CONTINUE this isn't getting set + if t.flag 'self-closing' if t.name is 'script' t.acknowledge_self_closing() in_foreign_content_end_script() @@ -2906,8 +3013,7 @@ parse_html = (args) -> return loop # is this safe? open_els.shift() - cn = open_els[0] - if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML + if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML break process_token t return @@ -2918,9 +3024,11 @@ parse_html = (args) -> in_foreign_content_end_script() return if t.type is TYPE_END_TAG - if open_els[0].name.toLowerCase() isnt t.name + i = 0 + node = open_els[i] + if node.name.toLowerCase() isnt t.name parse_error() - for node in open_els + loop if node is open_els[open_els.length - 1] return if node.name.toLowerCase() is t.name @@ -2928,9 +3036,12 @@ parse_html = (args) -> el = open_els.shift() if el is node return + i += 1 + node = open_els[i] if node.namespace is NS_HTML break ins_mode t # explicitly call HTML insertion mode + return # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state @@ -2942,7 +3053,7 @@ parse_html = (args) -> tok_state = tok_state_tag_open when "\u0000" parse_error() - return new_text_node "\ufffd" + return new_text_node c when '' # EOF return new_eof_token() else @@ -3016,50 +3127,55 @@ parse_html = (args) -> # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> - switch c = txt.charAt(cur++) - when '!' - tok_state = tok_state_markup_declaration_open - when '/' - tok_state = tok_state_end_tag_open - when '?' - parse_error() - tok_cur_tag = new_comment_token '?' - tok_state = tok_state_bogus_comment - else - if is_lc_alpha(c) - tok_cur_tag = new_open_tag c - tok_state = tok_state_tag_name - else if is_uc_alpha(c) - tok_cur_tag = new_open_tag c.toLowerCase() - tok_state = tok_state_tag_name - else - parse_error() - tok_state = tok_state_data - cur -= 1 # we didn't parse/handle the char after < - return new_text_node '<' - return null + c = txt.charAt(cur++) + if c is '!' + tok_state = tok_state_markup_declaration_open + return + if c is '/' + tok_state = tok_state_end_tag_open + return + if is_uc_alpha(c) + tok_cur_tag = new_open_tag c.toLowerCase() + tok_state = tok_state_tag_name + return + if is_lc_alpha(c) + tok_cur_tag = new_open_tag c + tok_state = tok_state_tag_name + return + if c is '?' + parse_error() + tok_cur_tag = new_comment_token '?' # FIXME right? + tok_state = tok_state_bogus_comment + return + # Anything else + parse_error() + tok_state = tok_state_data + cur -= 1 # we didn't parse/handle the char after < + return new_text_node '<' # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state tok_state_end_tag_open = -> - switch c = txt.charAt(cur++) - when '>' - parse_error() - tok_state = tok_state_data - when '' # EOF - parse_error() - tok_state = tok_state_data - return new_text_node '' + parse_error() + tok_state = tok_state_data + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + return new_text_node ' # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token is_appropriate_end_tag = (t) -> - # spec says to check against "the tag name of the last start tag to - # have been emitted from this tokenizer", but this is only called from - # the various "raw" states, so it's hopefully ok to assume that - # open_els[0].name will work instead TODO: verify this after the script - # data states are implemented - debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}" + # fixfull: this assumes that open_els[0].name is "the tag name of the last + # start tag to have been emitted from this tokenizer" return t.type is TYPE_END_TAG and t.name is open_els[0].name # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state @@ -3389,7 +3501,7 @@ parse_html = (args) -> # Anything else tok_state = tok_state_script_data_escaped cur -= 1 # Reconsume - return new_character_token c + return new_character_token '<' # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state tok_state_script_data_escaped_end_tag_open = -> @@ -3634,7 +3746,7 @@ parse_html = (args) -> return if c is '>' tok_state = tok_state_data - return + return tok_cur_tag if is_uc_alpha(c) tok_cur_tag.attrs_a.unshift [c.toLowerCase(), ''] tok_state = tok_state_attribute_name @@ -3655,6 +3767,7 @@ parse_html = (args) -> # Anything else tok_cur_tag.attrs_a.unshift [c, ''] tok_state = tok_state_attribute_name + return # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state tok_state_before_attribute_value = -> @@ -3767,7 +3880,7 @@ parse_html = (args) -> tok_state_self_closing_start_tag = -> c = txt.charAt(cur++) if c is '>' - tok_cur_tag.flag 'self-closing' + tok_cur_tag.flag 'self-closing', true tok_state = tok_state_data return tok_cur_tag if c is '' @@ -4387,7 +4500,10 @@ parse_html = (args) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - return new_character_token val # fixfull split + val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") + if val.length > 0 + return new_character_token val # fixfull split + return null # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it @@ -4447,42 +4563,60 @@ parse_html = (args) -> # exit early, because parse_error() below needs at least one alnum return '&' if txt.charAt(cur + i) is ';' - i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) + i += 1 # scan past the ';' (after, so we dno't pass it to decode) if decoded? cur += i return decoded - parse_error() - return '&' - else - # no ';' terminator (only legacy char refs) - max = i - for i in [2..max] # no prefix matches, so ok to check shortest first - c = legacy_char_refs[txt.substr(cur, i)] - if c? - if in_attr - if txt.charAt(cur + i) is '=' - # "because some legacy user agents will - # misinterpret the markup in those cases" - parse_error() - return '&' - if alnum.indexOf(txt.charAt(cur + i)) > -1 - # this makes attributes forgiving about url args - return '&' - # ok, and besides the weird exceptions for attributes... - # return the matching char - cur += i # consume entity chars - parse_error() # because no terminating ";" - return c - parse_error() - return '&' + # else FALL THROUGH (check for match without last char(s) or ";") + # no ';' terminator (only legacy char refs) + max = i + for i in [2..max] # no prefix matches, so ok to check shortest first + c = legacy_char_refs[txt.substr(cur, i)] + if c? + if in_attr + if txt.charAt(cur + i) is '=' + # "because some legacy user agents will + # misinterpret the markup in those cases" + parse_error() + return '&' + if alnum.indexOf(txt.charAt(cur + i)) > -1 + # this makes attributes forgiving about url args + return '&' + # ok, and besides the weird exceptions for attributes... + # return the matching char + cur += i # consume entity chars + parse_error() # because no terminating ";" + return c + parse_error() + return '&' return # never reached + eat_next_token_if_newline = -> + old_cur = cur + t = null + until t? + t = tok_state() + if t.type is TYPE_TEXT + # definition of a newline depends on whether it was a character ref or not + if cur - old_cur is 1 + # not a character reference + if t.text is "\u000d" or t.text is "\u000a" + return + else + if t.text is "\u000a" + return + # not a "newline" + cur = old_cur + return + # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data - txt = args.html + txt = args_html cur = 0 - doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML + doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this + fragment_root = null # fragment parsing algorithm returns children of this open_els = [] afe = [] # active formatting elements template_ins_modes = [] @@ -4496,45 +4630,119 @@ parse_html = (args) -> temporary_buffer = null pending_table_character_tokens = [] head_element_pointer = null - flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case) - context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + flag_fragment_parsing = false + context_element = null + prev_node_id = 0 # just for debugging # tokenizer initialization tok_state = tok_state_data - # text pre-processing - # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream - txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this - txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this - txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this + parse_init = -> + # fragment parsing (text arg) + if args.fragment? + # this handles the fragment from the tests in the format described here: + # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md + f = args.fragment + ns = NS_HTML + if f.substr(0, 5) is 'math ' + f = f.substr 5 + ns = NS_MATHML + else if f.substr(0, 4) is 'svg ' + f = f.substr 4 + ns = NS_SVG + t = new_open_tag f + context_element = token_to_element t, ns + context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML + context_element.document.flag 'quirks mode', QUIRKS_NO + # fragment parsing (Node arg) + if args.context? + context_element = args.context + + # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + # fragment parsing algorithm + if context_element? + flag_fragment_parsing = true + doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + # search up the tree from context, to try to find it's document, + # because this file only puts a "document" property on the root + # element. + old_doc = null + el = context_element + loop + if el.document? + old_doc = el.document + break + if el.parent + el = el.parent + else + break + if old_doc + doc.flag 'quirks mode', old_doc.flag 'quirks mode' + # set tok_state + if context_element.namespace is NS_HTML + switch context_element.name + when 'title', 'textarea' + tok_state = tok_state_rcdata + when 'style', 'xmp', 'iframe', 'noembed', 'noframes' + tok_state = tok_state_rawtext + when 'script' + tok_state = tok_state_script_data + when 'noscript' + if flag_scripting + tok_state = tok_state_rawtext + when 'plaintext' + tok_state = tok_state_plaintext + fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc.children.push fragment_root + fragment_root.document = doc + open_els = [fragment_root] + if context_element.name is 'template' and context_element.namespace is NS_HTML + template_ins_modes.unshift ins_mode_in_template + # fixfull create token for context (it should have it's original one already) + reset_ins_mode() + # set form_element pointer... in the foreign doc?! + el = context_element + loop + if el.name is 'form' and el.namespace is NS_HTML + form_element_pointer = el + break + if el.parent + el = el.parent + else + break + + # text pre-processing + # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream + txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this + + return - if args.name is "plain-text-unsafe.dat #4" - console.log "hi" - # proccess input # http://www.w3.org/TR/html5/syntax.html#tree-construction - while flag_parsing - t = tok_state() - if t? - process_token t - # fixfull parse error if has self-closing flag, but it wasn't acknolwedged + parse_main_loop = -> + while flag_parsing + t = tok_state() + if t? + process_token t + # fixfull parse error if has self-closing flag, but it wasn't acknolwedged + return + parse_init() + parse_main_loop() + + if flag_fragment_parsing + return fragment_root.children return doc.children -serialize_els = (els, shallow, show_ids) -> - serialized = '' - sep = '' - for t in els - serialized += sep - sep = ',' - serialized += t.serialize shallow, show_ids - return serialized - -module.exports.parse_html = parse_html -module.exports.debug_log_reset = debug_log_reset -module.exports.debug_log_each = debug_log_each -module.exports.TYPE_TAG = TYPE_TAG -module.exports.TYPE_TEXT = TYPE_TEXT -module.exports.TYPE_COMMENT = TYPE_COMMENT -module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE -module.exports.NS_HTML = NS_HTML -module.exports.NS_MATHML = NS_MATHML -module.exports.NS_SVG = NS_SVG +exports.parse_html = parse_html +exports.debug_log_reset = debug_log_reset +exports.debug_log_each = debug_log_each +exports.TYPE_TAG = TYPE_TAG +exports.TYPE_TEXT = TYPE_TEXT +exports.TYPE_COMMENT = TYPE_COMMENT +exports.TYPE_DOCTYPE = TYPE_DOCTYPE +exports.NS_HTML = NS_HTML +exports.NS_MATHML = NS_MATHML +exports.NS_SVG = NS_SVG +exports.QUIRKS_NO = QUIRKS_NO +exports.QUIRKS_LIMITED = QUIRKS_LIMITED +exports.QUIRKS_YES = QUIRKS_YES