// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

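// For example, "<a>b" tokenizes as a start tag followed by a text node, so a
// hypothetical table entry exercising it would be written as:
//
//	{"example", "<a>b", "<a>$b"},
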
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
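	// For instance (a representative entry; the double space in "foo  bar"
	// must survive tokenization unchanged):
	{
		"text",
		"foo  bar",
		"foo  bar",
	},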
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
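	// For instance (a representative entry; <a>...</e> is mismatched, but the
	// tokenizer emits all three tags regardless):
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},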
	// Angle brackets that aren't a tag.
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	// EOF in a tag name.
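	// For instance (a representative entry; hitting EOF inside a tag name
	// yields no token at all, hence the empty golden string):
	{
		"tag name eof #0",
		"<a",
		"",
	},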
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
202 "broken script end tag",
203 "<SCRIPT>a</SCR ipt>",
204 "<script>$a</SCR ipt>",
207 "EOF in script end tag",
209 "<script>$a</SCRipt",
213 "<SCRIPT>a</SCRiptx",
214 "<script>$a</SCRiptx",
217 "' ' completes script end tag",
218 "<SCRIPT>a</SCRipt ",
222 "'>' completes script end tag",
223 "<SCRIPT>a</SCRipt>",
224 "<script>$a$</script>",
227 "self-closing script end tag",
228 "<SCRIPT>a</SCRipt/>",
229 "<script>$a$</script>",
233 "<SCRIPT>a</SCRipt<script>",
234 "<script>$a</SCRipt<script>",
237 "script end tag after unfinished",
238 "<SCRIPT>a</SCRipt</script>",
239 "<script>$a</SCRipt$</script>",
242 "script/style mismatched tags",
244 "<script>$a</style>",
247 "style element with entity",
249 "<style>$&apos;",
253 "<textarea><div></textarea>",
254 "<textarea>$<div>$</textarea>",
257 "title with tag and entity",
258 "<title><b>K&R C</b></title>",
259 "<title>$<b>K&R C</b>$</title>",
268 "DOCTYPE with no space",
273 "DOCTYPE with two spaces",
278 "looks like DOCTYPE but isn't",
280 "<!--DOCUMENT html-->",
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	// An attribute with a backslash.
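	// For instance (a representative entry; HTML has no backslash escapes, so
	// the `\"` ends the attribute value and `b"` becomes a second attribute):
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},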
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
385 "entity without semicolon",
386 `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
387 `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
390 "entity with digits",
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
407 "Unquoted attribute value",
408 `<input value=yes FOO=BAR>`,
409 `<input value="yes" foo="BAR">`,
412 "Unquoted attribute value, spaces",
413 `<input value = yes FOO = BAR>`,
414 `<input value="yes" foo="BAR">`,
417 "Unquoted attribute value, trailing space",
418 `<input value=yes FOO=BAR >`,
419 `<input value="yes" foo="BAR">`,
422 "Single-quoted attribute value",
423 `<input value='yes' FOO='BAR'>`,
424 `<input value="yes" foo="BAR">`,
427 "Single-quoted attribute value, trailing space",
428 `<input value='yes' FOO='BAR' >`,
429 `<input value="yes" foo="BAR">`,
432 "Double-quoted attribute value",
433 `<input value="I'm an attribute" FOO="BAR">`,
434 `<input value="I'm an attribute" foo="BAR">`,
437 "Attribute name characters",
438 `<meta http-equiv="content-type">`,
439 `<meta http-equiv="content-type">`,
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
447 "Attributes with a solitary single quote",
448 `<p id=can't><p id=won't>`,
449 `<p id="can't">$<p id="won't">`,
func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Anything tokenized, along with untokenized input and any data
			// left in the reader, should reassemble into the original input.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the
			// largest maxBuf that generates ErrBufferExceeded, so continue to
			// the next test.
			if z.Err() == io.EOF {
				break
			}
		}
	}
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := result.String()
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }