X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=362240fdb8f33512c7b7a279d39f40dec6de8861;hb=071a77b51a7f9ba5da51b3fc43885e63771bc566;hp=a4beb3dbda24aecf0618b662cfe56cfc85fdf2c5;hpb=9ecc7f55f96de835055fa7c82f66d08b7b884a36;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index a4beb3d..362240f 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -15,7 +15,7 @@ # along with this program. If not, see . -# This file implements a parser for html snippets, meant to be used by a +# This file implements a thorough parser for html5, meant to be used by a # WYSIWYG editor. # The implementation is a pretty direct implementation of the parsing algorithm @@ -36,26 +36,20 @@ ## how to use this code ################################## # -# See README.md for how to pre-compile this file, or compile it in the browser. +# See README.md for how to run this file in the browser or in node.js. # -# This file exports a single useful function: parse_tml -# -# Once you include this file in a page (see index.html for an example) you'll -# have window.wheic +# This file exports a single useful function: parse_tml, and some constants +# (see the bottom of this file for those.) # # Call it like this: # -# wheic.parse_html({html: "

hi

"}) +# wheic.parse_html("

hi

") # # Or, if you don't want /etc, do this: # -# wheic.parse_html({fragment: "body", html: "

hi

"}) +# wheic.parse_html("

hi

", {fragment: "body"}) # -# This code can _almost_ run outside the browser (eg under node.js). To get it -# to run without the browser would require native implementation of -# decode_named_char_ref(). The current implementation of that function uses the -# browser's DOM api, to save space (the list of valid named characters is -# massive.) +# return value is an array of Nodes, see "class Node" below. # This code is a work in progress, eg try search this file for "fixfull", # "TODO" and "FIXME" @@ -71,7 +65,7 @@ # # example: open_els = [a, b, c, d, e, f, g] # -# "grows downwards" means it's visualized like this: (index: el, names) +# "grows downwards" means it's visualized like this: (index: el "names") # # 6: g "start of the list", "topmost", "first" # 5: f @@ -81,9 +75,13 @@ # 1: b # 0: a "end of the list", "current node", "bottommost", "last" -unless module?.exports? +if (typeof module) isnt 'undefined' and module.exports? + context = 'module' + exports = module.exports +else + context = 'browser' window.wheic = {} - module = exports: window.wheic + exports = window.wheic from_code_point = (x) -> if String.fromCodePoint? @@ -605,27 +603,31 @@ adjust_foreign_attributes = (t) -> # decode_named_char_ref() # -# The list of named character references is _huge_ so ask the browser to decode -# for us instead of wasting bandwidth/space on including the table here. -# -# Pass without the "&" but with the ";" examples: -# for "&" pass "amp;" -# for "′" pass "x2032;" -g_dncr = { - cache: {} - textarea: document.createElement('textarea') -} -# TODO test this in IE8 +# The list of named character references is _huge_ so if we're running in a +# browser, we get the browser to decode them, rather than increasing the code +# size to include the table. +if context is 'module' + _decode_named_char_ref = require './html5-named-entities.coffee' +else + # TODO test this in IE8 + decode_named_char_ref_el = document.createElement('textarea') + _decode_named_char_ref = (txt) -> + txt = "&#{txt};" + decode_named_char_ref_el.innerHTML = txt + decoded = decode_named_char_ref_el.value + return null if decoded is txt + return decoded +# Pass the name of a named entity _that has a terminating semicolon_ +# Entities without terminating semicolons should use legacy_char_refs[] +# Do not include the "&" or ";" in your argument, eg pass "alpha" +decode_named_char_ref_cache = {} decode_named_char_ref = (txt) -> - txt = "&#{txt}" - decoded = g_dncr.cache[txt] + decoded = decode_named_char_ref_cache[txt] return decoded if decoded? - g_dncr.textarea.innerHTML = txt - decoded = g_dncr.textarea.value - return null if decoded is txt - return g_dncr.cache[txt] = decoded + decoded = _decode_named_char_ref txt + return decode_named_char_ref_cache[txt] = decoded -parse_html = (args) -> +parse_html = (args_html, args = {}) -> txt = null cur = null # index of next char in txt to be parsed # declare doc and tokenizer variables so they're in scope below @@ -655,8 +657,6 @@ parse_html = (args) -> parse_error = -> if args.error_cb? args.error_cb cur - else - console.log "Parse error at character #{cur} of #{txt.length}" return # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements @@ -4561,35 +4561,33 @@ parse_html = (args) -> # exit early, because parse_error() below needs at least one alnum return '&' if txt.charAt(cur + i) is ';' - i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) + i += 1 # scan past the ';' (after, so we dno't pass it to decode) if decoded? cur += i return decoded - parse_error() - return '&' - else - # no ';' terminator (only legacy char refs) - max = i - for i in [2..max] # no prefix matches, so ok to check shortest first - c = legacy_char_refs[txt.substr(cur, i)] - if c? - if in_attr - if txt.charAt(cur + i) is '=' - # "because some legacy user agents will - # misinterpret the markup in those cases" - parse_error() - return '&' - if alnum.indexOf(txt.charAt(cur + i)) > -1 - # this makes attributes forgiving about url args - return '&' - # ok, and besides the weird exceptions for attributes... - # return the matching char - cur += i # consume entity chars - parse_error() # because no terminating ";" - return c - parse_error() - return '&' + # else FALL THROUGH (check for match without last char(s) or ";") + # no ';' terminator (only legacy char refs) + max = i + for i in [2..max] # no prefix matches, so ok to check shortest first + c = legacy_char_refs[txt.substr(cur, i)] + if c? + if in_attr + if txt.charAt(cur + i) is '=' + # "because some legacy user agents will + # misinterpret the markup in those cases" + parse_error() + return '&' + if alnum.indexOf(txt.charAt(cur + i)) > -1 + # this makes attributes forgiving about url args + return '&' + # ok, and besides the weird exceptions for attributes... + # return the matching char + cur += i # consume entity chars + parse_error() # because no terminating ";" + return c + parse_error() + return '&' return # never reached eat_next_token_if_newline = -> @@ -4612,7 +4610,7 @@ parse_html = (args) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data - txt = args.html + txt = args_html cur = 0 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this @@ -4733,16 +4731,16 @@ parse_html = (args) -> return fragment_root.children return doc.children -module.exports.parse_html = parse_html -module.exports.debug_log_reset = debug_log_reset -module.exports.debug_log_each = debug_log_each -module.exports.TYPE_TAG = TYPE_TAG -module.exports.TYPE_TEXT = TYPE_TEXT -module.exports.TYPE_COMMENT = TYPE_COMMENT -module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE -module.exports.NS_HTML = NS_HTML -module.exports.NS_MATHML = NS_MATHML -module.exports.NS_SVG = NS_SVG -module.exports.QUIRKS_NO = QUIRKS_NO -module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED -module.exports.QUIRKS_YES = QUIRKS_YES +exports.parse_html = parse_html +exports.debug_log_reset = debug_log_reset +exports.debug_log_each = debug_log_each +exports.TYPE_TAG = TYPE_TAG +exports.TYPE_TEXT = TYPE_TEXT +exports.TYPE_COMMENT = TYPE_COMMENT +exports.TYPE_DOCTYPE = TYPE_DOCTYPE +exports.NS_HTML = NS_HTML +exports.NS_MATHML = NS_MATHML +exports.NS_SVG = NS_SVG +exports.QUIRKS_NO = QUIRKS_NO +exports.QUIRKS_LIMITED = QUIRKS_LIMITED +exports.QUIRKS_YES = QUIRKS_YES