JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
tests pass running under node.js
[peach-html5-editor.git] / parse-html.coffee
index 46195f9..9729874 100644 (file)
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
-# This file implements a parser for html snippets, meant to be used by a
+# This file implements a thorough parser for html5, meant to be used by a
 # WYSIWYG editor.
 
 # The implementation is a pretty direct implementation of the parsing algorithm
 # described here:
-# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
 #
-# Deviations from that spec:
+#     http://www.w3.org/TR/html5/syntax.html
 #
-#   Purposeful: search this file for "WHATWG"
+# except for some places marked "WHATWG" that are implemented as described here:
 #
-#   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
+#     https://html.spec.whatwg.org/multipage/syntax.html
+#
+# This code passes all of the tests in the .dat files at:
+#
+#     https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
 
 
-# stacks/lists
+##################################
+## how to use this code
+##################################
+#
+# See README.md for how to pre-compile this file, or compile it in the browser.
+#
+# This file exports a single useful function: parse_html
+#
+# Once you include this file in a page (see index.html for an example) you'll
+# have window.wheic
+#
+# Call it like this:
 #
-# the spec uses a many different words do indicate which ends of lists/stacks
-# they are talking about (and relative movement within the lists/stacks). This
-# section splains. I'm implementing "lists" (afe and open_els) the same way
-# (both as stacks)
+#     wheic.parse_html({html: "<p><b>hi</p>"})
+#
+# Or, if you don't want <html><head><body>/etc, do this:
+#
+#     wheic.parse_html({fragment: "body", html: "<p><b>hi</p>"})
+#
+# This code runs in the browser and outside it (eg under node.js). In the
+# browser, decode_named_char_ref() uses the browser's DOM api, to save space
+# (the list of valid named characters is massive.) Outside the browser, the
+# decoder is loaded from html5-named-entities.coffee instead.
+
+# This code is a work in progress, eg try searching this file for "fixfull",
+# "TODO" and "FIXME"
+
+
+# Notes:  stacks/lists
+#
+# Jason was frequently confused by the terminology used to refer to different
+# parts of the stacks and lists in the spec, so he made this chart to help keep
+# his head straight:
 #
 # stacks grow downward (current element is index=0)
 #
 #   1: b
 #   0: a "end of the list", "current node", "bottommost", "last"
 
-
-# browser
-# note: to get this to run outside a browser, you'll have to write a native
-# implementation of decode_named_char_ref()
-unless module?.exports?
+if (typeof module) isnt 'undefined' and module.exports?
+       context = 'module'
+       exports = module.exports
+else
+       context = 'browser'
        window.wheic = {}
-       module = exports: window.wheic
+       exports = window.wheic
 
 from_code_point = (x) ->
        if String.fromCodePoint?
@@ -578,25 +609,29 @@ adjust_foreign_attributes = (t) ->
 
 # decode_named_char_ref()
 #
-# The list of named character references is _huge_ so ask the browser to decode
-# for us instead of wasting bandwidth/space on including the table here.
-#
-# Pass without the "&" but with the ";" examples:
-#    for "&amp" pass "amp;"
-#    for "&#x2032" pass "x2032;"
-g_dncr = {
-       cache: {}
-       textarea: document.createElement('textarea')
-}
-# TODO test this in IE8
+# The list of named character references is _huge_ so if we're running in a
+# browser, we get the browser to decode them, rather than increasing the code
+# size to include the table.
+if context is 'module'
+       _decode_named_char_ref = require './html5-named-entities.coffee'
+else
+       # TODO test this in IE8
+       decode_named_char_ref_el = document.createElement('textarea')
+       _decode_named_char_ref = (txt) ->
+               txt = "&#{txt};"
+               decode_named_char_ref_el.innerHTML = txt
+               decoded = decode_named_char_ref_el.value
+               return null if decoded is txt
+               return decoded
+# Pass the name of a named entity _that has a terminating semicolon_
+# Entities without terminating semicolons should use legacy_char_refs[]
+# Do not include the "&" or ";" in your argument, eg pass "alpha"
+decode_named_char_ref_cache = {}
 decode_named_char_ref = (txt) ->
-       txt = "&#{txt}"
-       decoded = g_dncr.cache[txt]
+       decoded = decode_named_char_ref_cache[txt]
        return decoded if decoded?
-       g_dncr.textarea.innerHTML = txt
-       decoded = g_dncr.textarea.value
-       return null if decoded is txt
-       return g_dncr.cache[txt] = decoded
+       decoded = _decode_named_char_ref txt
+       return decode_named_char_ref_cache[txt] = decoded
 
 parse_html = (args) ->
        txt = null
@@ -4534,35 +4569,33 @@ parse_html = (args) ->
                                        # exit early, because parse_error() below needs at least one alnum
                                        return '&'
                                if txt.charAt(cur + i) is ';'
-                                       i += 1 # include ';' terminator in value
                                        decoded = decode_named_char_ref txt.substr(cur, i)
+                                       i += 1 # scan past the ';' (after, so we don't pass it to decode)
                                        if decoded?
                                                cur += i
                                                return decoded
-                                       parse_error()
-                                       return '&'
-                               else
-                                       # no ';' terminator (only legacy char refs)
-                                       max = i
-                                       for i in [2..max] # no prefix matches, so ok to check shortest first
-                                               c = legacy_char_refs[txt.substr(cur, i)]
-                                               if c?
-                                                       if in_attr
-                                                               if txt.charAt(cur + i) is '='
-                                                                       # "because some legacy user agents will
-                                                                       # misinterpret the markup in those cases"
-                                                                       parse_error()
-                                                                       return '&'
-                                                               if alnum.indexOf(txt.charAt(cur + i)) > -1
-                                                                       # this makes attributes forgiving about url args
-                                                                       return '&'
-                                                       # ok, and besides the weird exceptions for attributes...
-                                                       # return the matching char
-                                                       cur += i # consume entity chars
-                                                       parse_error() # because no terminating ";"
-                                                       return c
-                                       parse_error()
-                                       return '&'
+                                       # else FALL THROUGH (check for match without last char(s) or ";")
+                               # no ';' terminator (only legacy char refs)
+                               max = i
+                               for i in [2..max] # no prefix matches, so ok to check shortest first
+                                       c = legacy_char_refs[txt.substr(cur, i)]
+                                       if c?
+                                               if in_attr
+                                                       if txt.charAt(cur + i) is '='
+                                                               # "because some legacy user agents will
+                                                               # misinterpret the markup in those cases"
+                                                               parse_error()
+                                                               return '&'
+                                                       if alnum.indexOf(txt.charAt(cur + i)) > -1
+                                                               # this makes attributes forgiving about url args
+                                                               return '&'
+                                               # ok, and besides the weird exceptions for attributes...
+                                               # return the matching char
+                                               cur += i # consume entity chars
+                                               parse_error() # because no terminating ";"
+                                               return c
+                               parse_error()
+                               return '&'
                return # never reached
 
        eat_next_token_if_newline = ->
@@ -4706,16 +4739,16 @@ parse_html = (args) ->
                return fragment_root.children
        return doc.children
 
-module.exports.parse_html = parse_html
-module.exports.debug_log_reset = debug_log_reset
-module.exports.debug_log_each = debug_log_each
-module.exports.TYPE_TAG = TYPE_TAG
-module.exports.TYPE_TEXT = TYPE_TEXT
-module.exports.TYPE_COMMENT = TYPE_COMMENT
-module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
-module.exports.NS_HTML = NS_HTML
-module.exports.NS_MATHML = NS_MATHML
-module.exports.NS_SVG = NS_SVG
-module.exports.QUIRKS_NO = QUIRKS_NO
-module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
-module.exports.QUIRKS_YES = QUIRKS_YES
+exports.parse_html = parse_html
+exports.debug_log_reset = debug_log_reset
+exports.debug_log_each = debug_log_each
+exports.TYPE_TAG = TYPE_TAG
+exports.TYPE_TEXT = TYPE_TEXT
+exports.TYPE_COMMENT = TYPE_COMMENT
+exports.TYPE_DOCTYPE = TYPE_DOCTYPE
+exports.NS_HTML = NS_HTML
+exports.NS_MATHML = NS_MATHML
+exports.NS_SVG = NS_SVG
+exports.QUIRKS_NO = QUIRKS_NO
+exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+exports.QUIRKS_YES = QUIRKS_YES