# along with this program. If not, see <http://www.gnu.org/licenses/>.
-# This file implements a parser for html snippets, meant to be used by a
+# This file implements a thorough parser for html5, meant to be used by a
# WYSIWYG editor.
# The implementation is a pretty direct implementation of the parsing algorithm
# described here:
-# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
#
-# Deviations from that spec:
+# http://www.w3.org/TR/html5/syntax.html
#
-# Purposeful: search this file for "WHATWG"
+# except for some places marked "WHATWG" that are implemented as described here:
#
-# Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
+# https://html.spec.whatwg.org/multipage/syntax.html
+#
+# This code passes all of the tests in the .dat files at:
+#
+# https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
+
+
+##################################
+## how to use this code
+##################################
+#
+# See README.md for how to run this file in the browser or in node.js.
+#
+# This file exports a single useful function: parse_html, and some constants
+# (see the bottom of this file for those.)
+#
+# Call it like this:
+#
+# wheic.parse_html("<p><b>hi</p>")
+#
+# Or, if you don't want <html><head><body>/etc, do this:
+#
+# wheic.parse_html("<p><b>hi</p>", {fragment: "body"})
+#
+# return value is an array of Nodes, see "class Node" below.
+# This code is a work in progress, e.g. try searching this file for "fixfull",
+# "TODO" and "FIXME"
-# stacks/lists
+
+# Notes: stacks/lists
#
-# the spec uses a many different words do indicate which ends of lists/stacks
-# they are talking about (and relative movement within the lists/stacks). This
-# section splains. I'm implementing "lists" (afe and open_els) the same way
-# (both as stacks)
+# Jason was frequently confused by the terminology used to refer to different
+# parts of the stacks and lists in the spec, so he made this chart to help keep
+# his head straight:
#
# stacks grow downward (current element is index=0)
#
# example: open_els = [a, b, c, d, e, f, g]
#
-# "grows downwards" means it's visualized like this: (index: el, names)
+# "grows downwards" means it's visualized like this: (index: el "names")
#
# 6: g "start of the list", "topmost", "first"
# 5: f
# 1: b
# 0: a "end of the list", "current node", "bottommost", "last"
-
-# browser
-# note: to get this to run outside a browser, you'll have to write a native
-# implementation of decode_named_char_ref()
-unless module?.exports?
+if (typeof module) isnt 'undefined' and module.exports?
+ context = 'module'
+ exports = module.exports
+else
+ context = 'browser'
window.wheic = {}
- module = exports: window.wheic
+ exports = window.wheic
from_code_point = (x) ->
if String.fromCodePoint?
# decode_named_char_ref()
#
-# The list of named character references is _huge_ so ask the browser to decode
-# for us instead of wasting bandwidth/space on including the table here.
-#
-# Pass without the "&" but with the ";" examples:
-# for "&" pass "amp;"
-# for "′" pass "x2032;"
-g_dncr = {
- cache: {}
- textarea: document.createElement('textarea')
-}
-# TODO test this in IE8
+# The list of named character references is _huge_ so if we're running in a
+# browser, we get the browser to decode them, rather than increasing the code
+# size to include the table.
+if context is 'module'
+ _decode_named_char_ref = require './html5-named-entities.coffee'
+else
+ # TODO test this in IE8
+ decode_named_char_ref_el = document.createElement('textarea')
+ _decode_named_char_ref = (txt) ->
+ txt = "&#{txt};"
+ decode_named_char_ref_el.innerHTML = txt
+ decoded = decode_named_char_ref_el.value
+ return null if decoded is txt
+ return decoded
+# Pass the name of a named entity _that has a terminating semicolon_
+# Entities without terminating semicolons should use legacy_char_refs[]
+# Do not include the "&" or ";" in your argument, eg pass "alpha"
+decode_named_char_ref_cache = {}
decode_named_char_ref = (txt) ->
- txt = "&#{txt}"
- decoded = g_dncr.cache[txt]
+ decoded = decode_named_char_ref_cache[txt]
return decoded if decoded?
- g_dncr.textarea.innerHTML = txt
- decoded = g_dncr.textarea.value
- return null if decoded is txt
- return g_dncr.cache[txt] = decoded
+ decoded = _decode_named_char_ref txt
+ return decode_named_char_ref_cache[txt] = decoded
-parse_html = (args) ->
+parse_html = (args_html, args = {}) ->
txt = null
cur = null # index of next char in txt to be parsed
# declare doc and tokenizer variables so they're in scope below
# exit early, because parse_error() below needs at least one alnum
return '&'
if txt.charAt(cur + i) is ';'
- i += 1 # include ';' terminator in value
decoded = decode_named_char_ref txt.substr(cur, i)
+ i += 1 # scan past the ';' (after, so we don't pass it to decode)
if decoded?
cur += i
return decoded
- parse_error()
- return '&'
- else
- # no ';' terminator (only legacy char refs)
- max = i
- for i in [2..max] # no prefix matches, so ok to check shortest first
- c = legacy_char_refs[txt.substr(cur, i)]
- if c?
- if in_attr
- if txt.charAt(cur + i) is '='
- # "because some legacy user agents will
- # misinterpret the markup in those cases"
- parse_error()
- return '&'
- if alnum.indexOf(txt.charAt(cur + i)) > -1
- # this makes attributes forgiving about url args
- return '&'
- # ok, and besides the weird exceptions for attributes...
- # return the matching char
- cur += i # consume entity chars
- parse_error() # because no terminating ";"
- return c
- parse_error()
- return '&'
+ # else FALL THROUGH (check for match without last char(s) or ";")
+ # no ';' terminator (only legacy char refs)
+ max = i
+ for i in [2..max] # no prefix matches, so ok to check shortest first
+ c = legacy_char_refs[txt.substr(cur, i)]
+ if c?
+ if in_attr
+ if txt.charAt(cur + i) is '='
+ # "because some legacy user agents will
+ # misinterpret the markup in those cases"
+ parse_error()
+ return '&'
+ if alnum.indexOf(txt.charAt(cur + i)) > -1
+ # this makes attributes forgiving about url args
+ return '&'
+ # ok, and besides the weird exceptions for attributes...
+ # return the matching char
+ cur += i # consume entity chars
+ parse_error() # because no terminating ";"
+ return c
+ parse_error()
+ return '&'
return # never reached
eat_next_token_if_newline = ->
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
- txt = args.html
+ txt = args_html
cur = 0
doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
return fragment_root.children
return doc.children
-module.exports.parse_html = parse_html
-module.exports.debug_log_reset = debug_log_reset
-module.exports.debug_log_each = debug_log_each
-module.exports.TYPE_TAG = TYPE_TAG
-module.exports.TYPE_TEXT = TYPE_TEXT
-module.exports.TYPE_COMMENT = TYPE_COMMENT
-module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
-module.exports.NS_HTML = NS_HTML
-module.exports.NS_MATHML = NS_MATHML
-module.exports.NS_SVG = NS_SVG
-module.exports.QUIRKS_NO = QUIRKS_NO
-module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
-module.exports.QUIRKS_YES = QUIRKS_YES
+exports.parse_html = parse_html
+exports.debug_log_reset = debug_log_reset
+exports.debug_log_each = debug_log_each
+exports.TYPE_TAG = TYPE_TAG
+exports.TYPE_TEXT = TYPE_TEXT
+exports.TYPE_COMMENT = TYPE_COMMENT
+exports.TYPE_DOCTYPE = TYPE_DOCTYPE
+exports.NS_HTML = NS_HTML
+exports.NS_MATHML = NS_MATHML
+exports.NS_SVG = NS_SVG
+exports.QUIRKS_NO = QUIRKS_NO
+exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+exports.QUIRKS_YES = QUIRKS_YES