X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=362240fdb8f33512c7b7a279d39f40dec6de8861;hb=71cc53897f25e7f7685a0bc1c7182554ca736c73;hp=46195f9e63f8681e255e3ae60ec4ebe35c6d1914;hpb=0c9099629992eca837582bf019c54e9efd8a7f21;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 46195f9..362240f 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -15,32 +15,57 @@ # along with this program. If not, see . -# This file implements a parser for html snippets, meant to be used by a +# This file implements a thorough parser for html5, meant to be used by a # WYSIWYG editor. # The implementation is a pretty direct implementation of the parsing algorithm # described here: -# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream # -# Deviations from that spec: +# http://www.w3.org/TR/html5/syntax.html # -# Purposeful: search this file for "WHATWG" +# except for some places marked "WHATWG" that are implemented as described here: # -# Not finished yet: search this file for "fixfull", "TODO" and "FIXME" +# https://html.spec.whatwg.org/multipage/syntax.html +# +# This code passes all of the tests in the .dat files at: +# +# https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction + + +################################## +## how to use this code +################################## +# +# See README.md for how to run this file in the browser or in node.js. +# +# This file exports a single useful function: parse_html, and some constants +# (see the bottom of this file for those.) +# +# Call it like this: +# +# wheic.parse_html("

hi

") +# +# Or, if you don't want /etc, do this: +# +# wheic.parse_html("

hi

", {fragment: "body"}) +# +# return value is an array of Nodes, see "class Node" below. + +# This code is a work in progress, eg try search this file for "fixfull", +# "TODO" and "FIXME" -# stacks/lists +# Notes: stacks/lists # -# the spec uses a many different words do indicate which ends of lists/stacks -# they are talking about (and relative movement within the lists/stacks). This -# section splains. I'm implementing "lists" (afe and open_els) the same way -# (both as stacks) +# Jason was frequently confused by the terminology used to refer to different +# parts of the stacks and lists in the spec, so he made this chart to help keep +# his head straight: # # stacks grow downward (current element is index=0) # # example: open_els = [a, b, c, d, e, f, g] # -# "grows downwards" means it's visualized like this: (index: el, names) +# "grows downwards" means it's visualized like this: (index: el "names") # # 6: g "start of the list", "topmost", "first" # 5: f @@ -50,13 +75,13 @@ # 1: b # 0: a "end of the list", "current node", "bottommost", "last" - -# browser -# note: to get this to run outside a browser, you'll have to write a native -# implementation of decode_named_char_ref() -unless module?.exports? +if (typeof module) isnt 'undefined' and module.exports? + context = 'module' + exports = module.exports +else + context = 'browser' window.wheic = {} - module = exports: window.wheic + exports = window.wheic from_code_point = (x) -> if String.fromCodePoint? @@ -578,27 +603,31 @@ adjust_foreign_attributes = (t) -> # decode_named_char_ref() # -# The list of named character references is _huge_ so ask the browser to decode -# for us instead of wasting bandwidth/space on including the table here. 
-# -# Pass without the "&" but with the ";" examples: -# for "&" pass "amp;" -# for "′" pass "x2032;" -g_dncr = { - cache: {} - textarea: document.createElement('textarea') -} -# TODO test this in IE8 +# The list of named character references is _huge_ so if we're running in a +# browser, we get the browser to decode them, rather than increasing the code +# size to include the table. +if context is 'module' + _decode_named_char_ref = require './html5-named-entities.coffee' +else + # TODO test this in IE8 + decode_named_char_ref_el = document.createElement('textarea') + _decode_named_char_ref = (txt) -> + txt = "&#{txt};" + decode_named_char_ref_el.innerHTML = txt + decoded = decode_named_char_ref_el.value + return null if decoded is txt + return decoded +# Pass the name of a named entity _that has a terminating semicolon_ +# Entities without terminating semicolons should use legacy_char_refs[] +# Do not include the "&" or ";" in your argument, eg pass "alpha" +decode_named_char_ref_cache = {} decode_named_char_ref = (txt) -> - txt = "&#{txt}" - decoded = g_dncr.cache[txt] + decoded = decode_named_char_ref_cache[txt] return decoded if decoded? - g_dncr.textarea.innerHTML = txt - decoded = g_dncr.textarea.value - return null if decoded is txt - return g_dncr.cache[txt] = decoded + decoded = _decode_named_char_ref txt + return decode_named_char_ref_cache[txt] = decoded -parse_html = (args) -> +parse_html = (args_html, args = {}) -> txt = null cur = null # index of next char in txt to be parsed # declare doc and tokenizer variables so they're in scope below @@ -628,8 +657,6 @@ parse_html = (args) -> parse_error = -> if args.error_cb? 
args.error_cb cur - else - console.log "Parse error at character #{cur} of #{txt.length}" return # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements @@ -4534,35 +4561,33 @@ parse_html = (args) -> # exit early, because parse_error() below needs at least one alnum return '&' if txt.charAt(cur + i) is ';' - i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) + i += 1 # scan past the ';' (after, so we don't pass it to decode) if decoded? cur += i return decoded - parse_error() - return '&' - else - # no ';' terminator (only legacy char refs) - max = i - for i in [2..max] # no prefix matches, so ok to check shortest first - c = legacy_char_refs[txt.substr(cur, i)] - if c? - if in_attr - if txt.charAt(cur + i) is '=' - # "because some legacy user agents will - # misinterpret the markup in those cases" - parse_error() - return '&' - if alnum.indexOf(txt.charAt(cur + i)) > -1 - # this makes attributes forgiving about url args - return '&' - # ok, and besides the weird exceptions for attributes... - # return the matching char - cur += i # consume entity chars - parse_error() # because no terminating ";" - return c - parse_error() - return '&' + # else FALL THROUGH (check for match without last char(s) or ";") + # no ';' terminator (only legacy char refs) + max = i + for i in [2..max] # no prefix matches, so ok to check shortest first + c = legacy_char_refs[txt.substr(cur, i)] + if c? + if in_attr + if txt.charAt(cur + i) is '=' + # "because some legacy user agents will + # misinterpret the markup in those cases" + parse_error() + return '&' + if alnum.indexOf(txt.charAt(cur + i)) > -1 + # this makes attributes forgiving about url args + return '&' + # ok, and besides the weird exceptions for attributes... 
+ # return the matching char + cur += i # consume entity chars + parse_error() # because no terminating ";" + return c + parse_error() + return '&' return # never reached eat_next_token_if_newline = -> @@ -4585,7 +4610,7 @@ parse_html = (args) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data - txt = args.html + txt = args_html cur = 0 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this @@ -4706,16 +4731,16 @@ parse_html = (args) -> return fragment_root.children return doc.children -module.exports.parse_html = parse_html -module.exports.debug_log_reset = debug_log_reset -module.exports.debug_log_each = debug_log_each -module.exports.TYPE_TAG = TYPE_TAG -module.exports.TYPE_TEXT = TYPE_TEXT -module.exports.TYPE_COMMENT = TYPE_COMMENT -module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE -module.exports.NS_HTML = NS_HTML -module.exports.NS_MATHML = NS_MATHML -module.exports.NS_SVG = NS_SVG -module.exports.QUIRKS_NO = QUIRKS_NO -module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED -module.exports.QUIRKS_YES = QUIRKS_YES +exports.parse_html = parse_html +exports.debug_log_reset = debug_log_reset +exports.debug_log_each = debug_log_each +exports.TYPE_TAG = TYPE_TAG +exports.TYPE_TEXT = TYPE_TEXT +exports.TYPE_COMMENT = TYPE_COMMENT +exports.TYPE_DOCTYPE = TYPE_DOCTYPE +exports.NS_HTML = NS_HTML +exports.NS_MATHML = NS_MATHML +exports.NS_SVG = NS_SVG +exports.QUIRKS_NO = QUIRKS_NO +exports.QUIRKS_LIMITED = QUIRKS_LIMITED +exports.QUIRKS_YES = QUIRKS_YES