JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
update .gitignore
[peach-html5-editor.git] / parse-html.coffee
index 46195f9..a6d501f 100644 (file)
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
-# This file implements a parser for html snippets, meant to be used by a
+# This file implements a thorough parser for html5, meant to be used by a
 # WYSIWYG editor.
 
 # The implementation is a pretty direct implementation of the parsing algorithm
 # described here:
-# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
 #
-# Deviations from that spec:
+#     http://www.w3.org/TR/html5/syntax.html
 #
-#   Purposeful: search this file for "WHATWG"
+# except for some places marked "WHATWG" that are implemented as described here:
 #
-#   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
+#     https://html.spec.whatwg.org/multipage/syntax.html
+#
+# This code passes all of the tests in the .dat files at:
+#
+#     https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
+
+
+##################################
+## how to use this code
+##################################
+#
+# See README.md for how to run this file in the browser or in node.js.
+#
+# This file exports a single useful function: parse_html, and some constants
+# (see the bottom of this file for those.)
+#
+# Call it like this:
+#
+#     wheic.parse_html("<p><b>hi</p>")
+#
+# Or, if you don't want <html><head><body>/etc, do this:
+#
+#     wheic.parse_html("<p><b>hi</p>", {fragment: "body"})
+#
+# return value is an array of Nodes, see "class Node" below.
 
+# This code is a work in progress; e.g. try searching this file for "fixfull",
+# "TODO" and "FIXME"
 
-# stacks/lists
+
+# Notes:  stacks/lists
 #
-# the spec uses a many different words do indicate which ends of lists/stacks
-# they are talking about (and relative movement within the lists/stacks). This
-# section splains. I'm implementing "lists" (afe and open_els) the same way
-# (both as stacks)
+# Jason was frequently confused by the terminology used to refer to different
+# parts of the stacks and lists in the spec, so he made this chart to help keep
+# his head straight:
 #
 # stacks grow downward (current element is index=0)
 #
 # example: open_els = [a, b, c, d, e, f, g]
 #
-# "grows downwards" means it's visualized like this: (index: el, names)
+# "grows downwards" means it's visualized like this: (index: el "names")
 #
 #   6: g "start of the list", "topmost", "first"
 #   5: f
 #   1: b
 #   0: a "end of the list", "current node", "bottommost", "last"
 
-
-# browser
-# note: to get this to run outside a browser, you'll have to write a native
-# implementation of decode_named_char_ref()
-unless module?.exports?
+if (typeof module) isnt 'undefined' and module.exports?
+       context = 'module'
+       exports = module.exports
+else
+       context = 'browser'
        window.wheic = {}
-       module = exports: window.wheic
+       exports = window.wheic
 
 from_code_point = (x) ->
        if String.fromCodePoint?
@@ -578,27 +603,31 @@ adjust_foreign_attributes = (t) ->
 
 # decode_named_char_ref()
 #
-# The list of named character references is _huge_ so ask the browser to decode
-# for us instead of wasting bandwidth/space on including the table here.
-#
-# Pass without the "&" but with the ";" examples:
-#    for "&amp" pass "amp;"
-#    for "&#x2032" pass "x2032;"
-g_dncr = {
-       cache: {}
-       textarea: document.createElement('textarea')
-}
-# TODO test this in IE8
+# The list of named character references is _huge_ so if we're running in a
+# browser, we get the browser to decode them, rather than increasing the code
+# size to include the table.
+if context is 'module'
+       _decode_named_char_ref = require './html5-named-entities.coffee'
+else
+       # TODO test this in IE8
+       decode_named_char_ref_el = document.createElement('textarea')
+       _decode_named_char_ref = (txt) ->
+               txt = "&#{txt};"
+               decode_named_char_ref_el.innerHTML = txt
+               decoded = decode_named_char_ref_el.value
+               return null if decoded is txt
+               return decoded
+# Pass the name of a named entity _that has a terminating semicolon_
+# Entities without terminating semicolons should use legacy_char_refs[]
+# Do not include the "&" or ";" in your argument, eg pass "alpha"
+decode_named_char_ref_cache = {}
 decode_named_char_ref = (txt) ->
-       txt = "&#{txt}"
-       decoded = g_dncr.cache[txt]
+       decoded = decode_named_char_ref_cache[txt]
        return decoded if decoded?
-       g_dncr.textarea.innerHTML = txt
-       decoded = g_dncr.textarea.value
-       return null if decoded is txt
-       return g_dncr.cache[txt] = decoded
+       decoded = _decode_named_char_ref txt
+       return decode_named_char_ref_cache[txt] = decoded
 
-parse_html = (args) ->
+parse_html = (args_html, args = {}) ->
        txt = null
        cur = null # index of next char in txt to be parsed
        # declare doc and tokenizer variables so they're in scope below
@@ -4534,35 +4563,33 @@ parse_html = (args) ->
                                        # exit early, because parse_error() below needs at least one alnum
                                        return '&'
                                if txt.charAt(cur + i) is ';'
-                                       i += 1 # include ';' terminator in value
                                        decoded = decode_named_char_ref txt.substr(cur, i)
+                                       i += 1 # scan past the ';' (after, so we don't pass it to decode)
                                        if decoded?
                                                cur += i
                                                return decoded
-                                       parse_error()
-                                       return '&'
-                               else
-                                       # no ';' terminator (only legacy char refs)
-                                       max = i
-                                       for i in [2..max] # no prefix matches, so ok to check shortest first
-                                               c = legacy_char_refs[txt.substr(cur, i)]
-                                               if c?
-                                                       if in_attr
-                                                               if txt.charAt(cur + i) is '='
-                                                                       # "because some legacy user agents will
-                                                                       # misinterpret the markup in those cases"
-                                                                       parse_error()
-                                                                       return '&'
-                                                               if alnum.indexOf(txt.charAt(cur + i)) > -1
-                                                                       # this makes attributes forgiving about url args
-                                                                       return '&'
-                                                       # ok, and besides the weird exceptions for attributes...
-                                                       # return the matching char
-                                                       cur += i # consume entity chars
-                                                       parse_error() # because no terminating ";"
-                                                       return c
-                                       parse_error()
-                                       return '&'
+                                       # else FALL THROUGH (check for match without last char(s) or ";")
+                               # no ';' terminator (only legacy char refs)
+                               max = i
+                               for i in [2..max] # no prefix matches, so ok to check shortest first
+                                       c = legacy_char_refs[txt.substr(cur, i)]
+                                       if c?
+                                               if in_attr
+                                                       if txt.charAt(cur + i) is '='
+                                                               # "because some legacy user agents will
+                                                               # misinterpret the markup in those cases"
+                                                               parse_error()
+                                                               return '&'
+                                                       if alnum.indexOf(txt.charAt(cur + i)) > -1
+                                                               # this makes attributes forgiving about url args
+                                                               return '&'
+                                               # ok, and besides the weird exceptions for attributes...
+                                               # return the matching char
+                                               cur += i # consume entity chars
+                                               parse_error() # because no terminating ";"
+                                               return c
+                               parse_error()
+                               return '&'
                return # never reached
 
        eat_next_token_if_newline = ->
@@ -4585,7 +4612,7 @@ parse_html = (args) ->
 
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
-       txt = args.html
+       txt = args_html
        cur = 0
        doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
        doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
@@ -4706,16 +4733,16 @@ parse_html = (args) ->
                return fragment_root.children
        return doc.children
 
-module.exports.parse_html = parse_html
-module.exports.debug_log_reset = debug_log_reset
-module.exports.debug_log_each = debug_log_each
-module.exports.TYPE_TAG = TYPE_TAG
-module.exports.TYPE_TEXT = TYPE_TEXT
-module.exports.TYPE_COMMENT = TYPE_COMMENT
-module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
-module.exports.NS_HTML = NS_HTML
-module.exports.NS_MATHML = NS_MATHML
-module.exports.NS_SVG = NS_SVG
-module.exports.QUIRKS_NO = QUIRKS_NO
-module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
-module.exports.QUIRKS_YES = QUIRKS_YES
+exports.parse_html = parse_html
+exports.debug_log_reset = debug_log_reset
+exports.debug_log_each = debug_log_each
+exports.TYPE_TAG = TYPE_TAG
+exports.TYPE_TEXT = TYPE_TEXT
+exports.TYPE_COMMENT = TYPE_COMMENT
+exports.TYPE_DOCTYPE = TYPE_DOCTYPE
+exports.NS_HTML = NS_HTML
+exports.NS_MATHML = NS_MATHML
+exports.NS_SVG = NS_SVG
+exports.QUIRKS_NO = QUIRKS_NO
+exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+exports.QUIRKS_YES = QUIRKS_YES