net/html/comment_test.go

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"strings"
	"testing"
)

// TestComments exhaustively tests that every 'interesting' N-byte string is
// correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen
// inclusive, where 4 is the length of the "<!--" prefix that starts an HTML
// comment.
//
// 'Interesting' means that the N-4 byte suffix is made up entirely of bytes
// drawn from the interestingCommentBytes array, below. Those bytes cover all
// of the possible state transitions out of the comment-related parser states
// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
// and subsequent sections).
//
// The spec is written as an explicit state machine that, as a side effect,
// accumulates "the comment token's data" to a separate buffer.
// Tokenizer.readComment in this package does not have an explicit state
// machine and usually returns the comment text as a sub-slice of the input,
// between the opening '<' and closing '>' or EOF. This test confirms that the
// two algorithms match.
func TestComments(t *testing.T) {
	const (
		prefix       = "<!--"
		maxSuffixLen = 6
	)
	// Start with just the prefix, but reserve capacity for the longest suffix
	// that testAllComments is going to append.
	seed := make([]byte, len(prefix), len(prefix)+maxSuffixLen)
	copy(seed, prefix)
	testAllComments(t, seed)
}

// interestingCommentBytes lists the bytes that testAllComments draws from when
// building comment suffixes.
//
// NUL is deliberately left out, even though HTML spec sections 13.2.5.43 -
// 13.2.5.52 mention it: in terms of state transitions it behaves like any
// other non-interesting byte (other than being replaced by U+FFFD REPLACEMENT
// CHARACTER).
//
// EOF is left out as well. The HTML spec treats EOF as "an input character",
// but testOneComment below breaks out of its loop instead.
//
// 'x' stands in for every other "non-interesting" comment byte.
var interestingCommentBytes = [...]byte{'!', '-', '<', '>', 'x'}

// testAllComments extends buffer by one byte, for each entry of
// interestingCommentBytes in turn, and runs testOneComment on the result.
// While spare capacity remains it recurses, so that every combination of
// interesting bytes that fits in buffer[len(buffer):cap(buffer)] is checked
// against the HTML spec.
//
// Precondition: len(buffer) < cap(buffer)
// Precondition: string(buffer[:4]) == "<!--"
func testAllComments(t *testing.T, buffer []byte) {
	for k := range interestingCommentBytes {
		extended := append(buffer, interestingCommentBytes[k])
		testOneComment(t, extended)
		if len(extended) == cap(extended) {
			// No room left for a longer suffix.
			continue
		}
		testAllComments(t, extended)
	}
}

// testOneComment checks a single input b in two ways.
//
// First, b is tokenized with NewTokenizer and the resulting comment text
// (z.Text()) plus the part of b after the raw token (z.Raw()) are compared
// with the values computed by the loop below. That loop is a transcription of
// the comment states in HTML spec sections 13.2.5.43 - 13.2.5.52; the state
// variable holds the NN of the current 13.2.5.NN section. A mismatch is
// reported with t.Errorf and the function returns.
//
// Second, a CommentToken whose Data is b minus its 4-byte prefix is rendered
// with Token.String and tokenized again, and the re-parsed text must equal
// that Data.
//
// A first token that is not a CommentToken, in either step, aborts the test
// via t.Fatalf.
//
// Precondition: string(b[:4]) == "<!--"
func testOneComment(t *testing.T, b []byte) {
	// Actual result: what this package's Tokenizer makes of b.
	z := NewTokenizer(bytes.NewReader(b))
	if next := z.Next(); next != CommentToken {
		t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
	}
	// len(z.Raw()) is used as the count of bytes of b that belong to this
	// token; whatever follows is the remainder.
	gotRemainder := string(b[len(z.Raw()):])
	gotComment := string(z.Text())

	// Expected result: step through b, starting just after the "<!--" prefix.
	// i indexes the next unread byte and wantBuffer accumulates "the comment
	// token's data".
	i := len("<!--")
	wantBuffer := []byte(nil)
loop:
	for state := 43; ; {
		// Consume the next input character, handling EOF.
		if i >= len(b) {
			// EOF: stop with whatever is in wantBuffer so far. This break is
			// not inside the switch, so it leaves the for loop.
			break
		}
		nextInputCharacter := b[i]
		i++

		// Within the switch, "break loop" means the comment token is complete
		// (i is already past the closing '>'), and "i--" un-reads the byte so
		// that the new state sees it again.
		switch state {
		case 43: // 13.2.5.43 Comment start state.
			switch nextInputCharacter {
			case '-':
				state = 44
			case '>':
				break loop
			default:
				i-- // Reconsume.
				state = 45
			}

		case 44: // 13.2.5.44 Comment start dash state.
			switch nextInputCharacter {
			case '-':
				state = 51
			case '>':
				break loop
			default:
				// The dash seen in state 43 turned out to be comment data.
				wantBuffer = append(wantBuffer, '-')
				i-- // Reconsume.
				state = 45
			}

		case 45: // 13.2.5.45 Comment state.
			switch nextInputCharacter {
			case '-':
				state = 50
			case '<':
				wantBuffer = append(wantBuffer, '<')
				state = 46
			default:
				wantBuffer = append(wantBuffer, nextInputCharacter)
			}

		case 46: // 13.2.5.46 Comment less-than sign state.
			switch nextInputCharacter {
			case '!':
				wantBuffer = append(wantBuffer, '!')
				state = 47
			case '<':
				wantBuffer = append(wantBuffer, '<')
				state = 46
			default:
				i-- // Reconsume.
				state = 45
			}

		case 47: // 13.2.5.47 Comment less-than sign bang state.
			switch nextInputCharacter {
			case '-':
				state = 48
			default:
				i-- // Reconsume.
				state = 45
			}

		case 48: // 13.2.5.48 Comment less-than sign bang dash state.
			switch nextInputCharacter {
			case '-':
				state = 49
			default:
				i-- // Reconsume.
				state = 50
			}

		case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
			switch nextInputCharacter {
			case '>':
				break loop
			default:
				i-- // Reconsume.
				state = 51
			}

		case 50: // 13.2.5.50 Comment end dash state.
			switch nextInputCharacter {
			case '-':
				state = 51
			default:
				// The single pending dash did not start a "--", so it is data.
				wantBuffer = append(wantBuffer, '-')
				i-- // Reconsume.
				state = 45
			}

		case 51: // 13.2.5.51 Comment end state.
			switch nextInputCharacter {
			case '!':
				state = 52
			case '-':
				// A third (or later) dash: one dash becomes data and the state
				// is unchanged, so two dashes stay pending.
				wantBuffer = append(wantBuffer, '-')
			case '>':
				break loop
			default:
				// The pending "--" was not followed by '>', so it is data.
				wantBuffer = append(wantBuffer, "--"...)
				i-- // Reconsume.
				state = 45
			}

		case 52: // 13.2.5.52 Comment end bang state.
			switch nextInputCharacter {
			case '-':
				wantBuffer = append(wantBuffer, "--!"...)
				state = 50
			case '>':
				break loop
			default:
				wantBuffer = append(wantBuffer, "--!"...)
				i-- // Reconsume.
				state = 45
			}

		default:
			t.Fatalf("input=%q: unexpected state %d", b, state)
		}
	}

	// Any bytes of b that the state machine did not consume are expected to be
	// left over after the comment token.
	wantRemainder := ""
	if i < len(b) {
		wantRemainder = string(b[i:])
	}
	wantComment := string(wantBuffer)
	if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
		t.Errorf("input=%q\ngot:  %q + %q\nwant: %q + %q",
			b, gotComment, gotRemainder, wantComment, wantRemainder)
		return
	}

	// suffix is the "N-4 byte suffix" per the TestComments comment.
	suffix := string(b[4:])

	// Test that a round trip, rendering (escaped) and re-parsing, of a comment
	// token (with that suffix as the Token.Data) preserves that string.
	tok := Token{
		Type: CommentToken,
		Data: suffix,
	}
	z2 := NewTokenizer(strings.NewReader(tok.String()))
	if next := z2.Next(); next != CommentToken {
		t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken)
	}
	gotComment2 := string(z2.Text())
	if gotComment2 != suffix {
		t.Errorf("round-trip\ngot:  %q\nwant: %q", gotComment2, suffix)
		return
	}
}

// This table below summarizes the HTML-comment-related state machine from
// 13.2.5.43 "Comment start state" and subsequent sections.
// https://html.spec.whatwg.org/#comment-start-state
//
// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
// initial 13.2.5.1 "Data state":
//   - "<"  moves to 13.2.5.6  "Tag open state",
//   - "!"  moves to 13.2.5.42 "Markup declaration open state",
//   - "--" moves to 13.2.5.43 "Comment start state".
// Each of these transitions is the only way to get to the 6/42/43 states.
//
// State   !         -         <         >         NUL       EOF       default   HTML spec section
// 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
// 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
// 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
// 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
// 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
// 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
// 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
// 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
// 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
// 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
//
// State 43 is the "Comment start state" meaning that we've only seen "<!--"
// and nothing else. Similarly, state 44 means that we've only seen "<!---",
// with three dashes, and nothing else. For the other states, we deduce
// (working backwards) that the immediate prior input must be:
//   - 45  something that's not '-'
//   - 46  "<"
//   - 47  "<!"
//   - 48  "<!-"
//   - 49  "<!--"  not including the opening "<!--"
//   - 50  "-"     not including the opening "<!--" and also not "--"
//   - 51  "--"    not including the opening "<!--"
//   - 52  "--!"
//
// The table cell actions:
//   - ...   do the default action
//   - A!    append "!"      to the comment token's data.
//   - A-    append "-"      to the comment token's data.
//   - A--   append "--"     to the comment token's data.
//   - A--!  append "--!"    to the comment token's data.
//   - A<    append "<"      to the comment token's data.
//   - A?    append "\uFFFD" to the comment token's data.
//   - Ax    append the current input character to the comment token's data.
//   - E0    parse error (abrupt-closing-of-empty-comment).
//   - E1    parse error (eof-in-comment).
//   - E2    parse error (unexpected-null-character).
//   - E3    parse error (nested-comment).
//   - E4    parse error (incorrectly-closed-comment).
//   - T     emit the current comment token.
//   - Z     emit an end-of-file token.
//   - rNN   reconsume in the 13.2.5.NN     state (after any A* or E* operations).
//   - s01   switch to the    13.2.5.1 Data state (after any A* or E* operations).
//   - sNN   switch to the    13.2.5.NN     state (after any A* or E* operations).
//   - tNN   stay in the      13.2.5.NN     state (after any A* or E* operations).
//
// The E* actions are called errors in the HTML spec but they are not fatal
// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
// the parser"). They are warnings that, in practice, browsers simply ignore.