update aaa (adoption agency algorithm) to WHATWG version

[peach-html5-editor.git] / parse-html.coffee
diff --git a/parse-html.coffee b/parse-html.coffee

index 5dc05a9..f5437c9 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -24,7 +24,7 @@
  #
  # Deviations from that spec:
  #
-#   Purposeful: search this file for "WTAG"
+#   Purposeful: search this file for "WHATWG"
  #
  #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
  
@@ -112,7 +112,7 @@ class Node
                         @id = "#{++prev_node_id}"
         acknowledge_self_closing: ->
                 if @token?
-                       @token.flag 'did_self_close'
+                       @token.flag 'did_self_close', true
                 else
                         @flag 'did_self_close', true
         flag: (key, value = null) ->
@@ -342,7 +342,7 @@ special_elements = {
         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
  
-       menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
+       menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
  
         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
@@ -468,7 +468,7 @@ svg_attribute_fixes = {
         diffuseconstant: 'diffuseConstant'
         edgemode: 'edgeMode'
         externalresourcesrequired: 'externalResourcesRequired'
-       filterres: 'filterRes'
+       # WHATWG removes this: filterres: 'filterRes'
         filterunits: 'filterUnits'
         glyphref: 'glyphRef'
         gradienttransform: 'gradientTransform'
@@ -520,6 +520,20 @@ svg_attribute_fixes = {
         ychannelselector: 'yChannelSelector'
         zoomandpan: 'zoomAndPan'
  }
+foreign_attr_fixes = {
+       'xlink:actuate': 'xlink actuate'
+       'xlink:arcrole': 'xlink arcrole'
+       'xlink:href': 'xlink href'
+       'xlink:role': 'xlink role'
+       'xlink:show': 'xlink show'
+       'xlink:title': 'xlink title'
+       'xlink:type': 'xlink type'
+       'xml:base': 'xml base'
+       'xml:lang': 'xml lang'
+       'xml:space': 'xml space'
+       'xmlns': 'xmlns'
+       'xmlns:xlink': 'xmlns xlink'
+}
  adjust_mathml_attributes = (t) ->
         for a in t.attrs_a
                 if a[0] is 'definitionurl'
@@ -532,6 +546,9 @@ adjust_svg_attributes = (t) ->
         return
  adjust_foreign_attributes = (t) ->
         # fixfull
+       for a in t.attrs_a
+               if foreign_attr_fixes[a[0]]?
+                       a[0] = foreign_attr_fixes[a[0]]
         return
  
  # decode_named_char_ref()
@@ -632,10 +649,10 @@ parse_html = (args) ->
         standard_scopers = {
                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
-               template: NS_HTML, mi: NS_MATHML,
+               template: NS_HTML,
  
-               mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
-               'annotation-xml': NS_MATHML,
+               mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
+               mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
  
                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
         }
@@ -879,16 +896,46 @@ parse_html = (args) ->
                 debug_log "tree: #{serialize_els doc.children, false, true}"
                 debug_log "open_els: #{serialize_els open_els, true, true}"
                 debug_log "afe: #{serialize_els afe, true, true}"
+# this block implements the W3C spec
+#              # 1. If the current node is an HTML element whose tag name is subject,
+#              # then run these substeps:
+#              #
+#              # 1. Let element be the current node.
+#              #
+#              # 2. Pop element off the stack of open elements.
+#              #
+#              # 3. If element is also in the list of active formatting elements,
+#              # remove the element from the list.
+#              #
+#              # 4. Abort the adoption agency algorithm.
+#              if open_els[0].name is subject and open_els[0].namespace is NS_HTML
+#                      el = open_els.shift()
+#                      # remove it from the list of active formatting elements (if found)
+#                      for t, i in afe
+#                              if t is el
+#                                      afe.splice i, 1
+#                                      break
+#                      debug_log "aaa: starting off with subject on top of stack, exiting"
+#                      return
+# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
+               # If the current node is an HTML element whose tag name is subject, and
+               # the current node is not in the list of active formatting elements,
+               # then pop the current node off the stack of open elements, and abort
+               # these steps.
                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
-                       el = open_els[0]
-                       open_els.shift()
+                       debug_log "aaa: starting off with subject on top of stack, exiting"
                         # remove it from the list of active formatting elements (if found)
-                       for t, i in afe
-                               if t is el
-                                       afe.splice i, 1
+                       in_afe = false
+                       for el, i in afe
+                               if el is open_els[0]
+                                       in_afe = true
                                         break
-                       debug_log "aaa: starting off with subject on top of stack, exiting"
-                       return
+                       unless in_afe
+                               debug_log "aaa: ...and not in afe, aaa done"
+                               open_els.shift()
+                               return
+                       # fall through
+# END WHATWG
                 outer = 0
                 loop
                         if outer >= 8
@@ -1196,7 +1243,7 @@ parse_html = (args) ->
                         ins_mode t
                         return
                 if is_mathml_text_integration_point(acn)
-                       if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
+                       if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
                                 ins_mode t
                                 return
                         if t.type is TYPE_TEXT
@@ -1659,7 +1706,7 @@ parse_html = (args) ->
                         parse_error()
                         return if template_tag_is_open()
                         root_attrs = open_els[open_els.length - 1].attrs
-                       for a of t.attrs_a
+                       for a in t.attrs_a
                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
                         return
  
@@ -2121,7 +2168,7 @@ parse_html = (args) ->
  #                                      parse_error()
  #                      insert_html_element t
  #                      return
-# below implements the WATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
+# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
                         if is_in_scope 'ruby', NS_HTML
                                 generate_implied_end_tags()
@@ -2136,7 +2183,7 @@ parse_html = (args) ->
                                         parse_error()
                         insert_html_element t
                         return
-# end WATWG chunk
+# end WHATWG chunk
                 if t.type is TYPE_START_TAG and t.name is 'math'
                         reconstruct_afe()
                         adjust_mathml_attributes t
@@ -2709,7 +2756,8 @@ parse_html = (args) ->
                         ins_mode_in_body t
                         return
                 if t.type is TYPE_COMMENT
-                       insert_comment t, [open_els[0], open_els[0].children.length]
+                       first = open_els[open_els.length - 1]
+                       insert_comment t, [first, first.children.length]
                         return
                 if t.type is TYPE_DOCTYPE
                         parse_error()
@@ -2718,7 +2766,9 @@ parse_html = (args) ->
                         ins_mode_in_body t
                         return
                 if t.type is TYPE_END_TAG and t.name is 'html'
-                       # fixfull fragment case
+                       if flag_fragment_parsing
+                               parse_error()
+                               return
                         ins_mode = ins_mode_after_after_body
                         return
                 if t.type is TYPE_EOF
@@ -2786,7 +2836,7 @@ parse_html = (args) ->
                         ins_mode_in_body t
                         return
                 if t.type is TYPE_END_TAG and t.name is 'html'
-                       insert_mode = ins_mode_after_after_frameset
+                       ins_mode = ins_mode_after_after_frameset
                         return
                 if t.type is TYPE_START_TAG and t.name is 'noframes'
                         ins_mode_in_head t
@@ -2812,6 +2862,7 @@ parse_html = (args) ->
                 # Anything else
                 parse_error()
                 ins_mode = ins_mode_in_body
+               process_token t
                 return
  
         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
@@ -2856,6 +2907,7 @@ parse_html = (args) ->
                         if t.name is 'script'
                                 t.acknowledge_self_closing()
                                 in_foreign_content_end_script()
+                               # fixfull
                         else
                                 open_els.shift()
                                 t.acknowledge_self_closing()
@@ -2885,8 +2937,7 @@ parse_html = (args) ->
                                 return
                         loop # is this safe?
                                 open_els.shift()
-                               cn = open_els[0]
-                               if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
+                               if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
                                         break
                         process_token t
                         return
@@ -2897,9 +2948,11 @@ parse_html = (args) ->
                         in_foreign_content_end_script()
                         return
                 if t.type is TYPE_END_TAG
-                       if open_els[0].name.toLowerCase() isnt t.name
+                       i = 0
+                       node = open_els[i]
+                       if node.name.toLowerCase() isnt t.name
                                 parse_error()
-                       for node in open_els
+                       loop
                                 if node is open_els[open_els.length - 1]
                                         return
                                 if node.name.toLowerCase() is t.name
@@ -2907,6 +2960,8 @@ parse_html = (args) ->
                                                 el = open_els.shift()
                                                 if el is node
                                                         return
+                               i += 1
+                               node = open_els[i]
                                 if node.namespace is NS_HTML
                                         break
                         ins_mode t # explicitly call HTML insertion mode
@@ -2995,50 +3050,55 @@ parse_html = (args) ->
  
         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
         tok_state_tag_open = ->
-               switch c = txt.charAt(cur++)
-                       when '!'
-                               tok_state = tok_state_markup_declaration_open
-                       when '/'
-                               tok_state = tok_state_end_tag_open
-                       when '?'
-                               parse_error()
-                               tok_cur_tag = new_comment_token '?'
-                               tok_state = tok_state_bogus_comment
-                       else
-                               if is_lc_alpha(c)
-                                       tok_cur_tag = new_open_tag c
-                                       tok_state = tok_state_tag_name
-                               else if is_uc_alpha(c)
-                                       tok_cur_tag = new_open_tag c.toLowerCase()
-                                       tok_state = tok_state_tag_name
-                               else
-                                       parse_error()
-                                       tok_state = tok_state_data
-                                       cur -= 1 # we didn't parse/handle the char after <
-                                       return new_text_node '<'
-               return null
+               c = txt.charAt(cur++)
+               if c is '!'
+                       tok_state = tok_state_markup_declaration_open
+                       return
+               if c is '/'
+                       tok_state = tok_state_end_tag_open
+                       return
+               if is_uc_alpha(c)
+                       tok_cur_tag = new_open_tag c.toLowerCase()
+                       tok_state = tok_state_tag_name
+                       return
+               if is_lc_alpha(c)
+                       tok_cur_tag = new_open_tag c
+                       tok_state = tok_state_tag_name
+                       return
+               if c is '?'
+                       parse_error()
+                       tok_cur_tag = new_comment_token '?' # FIXME right?
+                       tok_state = tok_state_bogus_comment
+                       return
+               # Anything else
+               parse_error()
+               tok_state = tok_state_data
+               cur -= 1 # we didn't parse/handle the char after <
+               return new_text_node '<'
  
         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
         tok_state_end_tag_open = ->
-               switch c = txt.charAt(cur++)
-                       when '>'
-                               parse_error()
-                               tok_state = tok_state_data
-                       when '' # EOF
-                               parse_error()
-                               tok_state = tok_state_data
-                               return new_text_node '</'
-                       else
-                               if is_uc_alpha(c)
-                                       tok_cur_tag = new_end_tag c.toLowerCase()
-                                       tok_state = tok_state_tag_name
-                               else if is_lc_alpha(c)
-                                       tok_cur_tag = new_end_tag c
-                                       tok_state = tok_state_tag_name
-                               else
-                                       parse_error()
-                                       tok_cur_tag = new_comment_token '/'
-                                       tok_state = tok_state_bogus_comment
+               c = txt.charAt(cur++)
+               if is_uc_alpha(c)
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+                       tok_state = tok_state_tag_name
+                       return
+               if is_lc_alpha(c)
+                       tok_cur_tag = new_end_tag c
+                       tok_state = tok_state_tag_name
+                       return
+               if c is '>'
+                       parse_error()
+                       tok_state = tok_state_data
+                       return
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       return new_text_node '</'
+               # Anything else
+               parse_error()
+               tok_cur_tag = new_comment_token c
+               tok_state = tok_state_bogus_comment
                 return null
  
         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
@@ -3368,7 +3428,7 @@ parse_html = (args) ->
                 # Anything else
                 tok_state = tok_state_script_data_escaped
                 cur -= 1 # Reconsume
-               return new_character_token c
+               return new_character_token '<'
  
         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
         tok_state_script_data_escaped_end_tag_open = ->
@@ -3746,7 +3806,7 @@ parse_html = (args) ->
         tok_state_self_closing_start_tag = ->
                 c = txt.charAt(cur++)
                 if c is '>'
-                       tok_cur_tag.flag 'self-closing'
+                       tok_cur_tag.flag 'self-closing', true
                         tok_state = tok_state_data
                         return tok_cur_tag
                 if c is ''
@@ -4477,6 +4537,7 @@ parse_html = (args) ->
         head_element_pointer = null
         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+       prev_node_id = 0 # just for debugging
  
         # tokenizer initialization
         tok_state = tok_state_data
@@ -4487,7 +4548,7 @@ parse_html = (args) ->
         txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
         txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
  
-       if args.name is "plain-text-unsafe.dat #4"
+       if args.name is "tests18.dat #17"
                 console.log "hi"
         # proccess input
         # http://www.w3.org/TR/html5/syntax.html#tree-construction