src/pkg/html/token.go | 58 +++++++++++++++++++++++++++++++++-------------------- src/pkg/html/token_test.go | 17 ++++++++++++++++- diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index d280ff225609c145e1231d383b599b7cc9723eaf..fddc922d6016d9903fbfb9fc2c2cf69fdd74b4c0 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -276,13 +276,12 @@ c := z.readByte() if z.err != nil { return } - var tt TokenType switch { case c == '/': - tt = EndTagToken + z.tt = EndTagToken // Lower-cased characters are more common in tag names, so we check for them first. case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': - tt = StartTagToken + z.tt = StartTagToken case c == '!': z.nextMarkupDeclaration() return @@ -305,8 +304,7 @@ if z.err != nil { return } case '>': - z.tt = tt - if z.buf[z.p1-2] == '/' && tt == StartTagToken { + if z.buf[z.p1-2] == '/' && z.tt == StartTagToken { z.tt = SelfClosingTagToken } return @@ -379,37 +377,53 @@ } return k } -// word finds the largest alphabetic [0-9A-Za-z]* word at the start -// of z.buf[i:] and returns that word (optionally lower-cased), as -// well as the trimmed cursor location after that word. -func (z *Tokenizer) word(i int, lower bool) ([]byte, int) { +// tagName finds the tag name at the start of z.buf[i:] and returns that name +// lower-cased, as well as the trimmed cursor location afterwards. +func (z *Tokenizer) tagName(i int) ([]byte, int) { i0 := i loop: for ; i < z.p1; i++ { c := z.buf[i] - switch { - case '0' <= c && c <= '9': - // No-op. - case 'A' <= c && c <= 'Z': - if lower { - z.buf[i] = c + 'a' - 'A' - } - case 'a' <= c && c <= 'z': - // No-op. - default: + switch c { + case ' ', '\n', '\t', '\f', '/', '>': break loop } + if 'A' <= c && c <= 'Z' { + z.buf[i] = c + 'a' - 'A' + } + } + return z.buf[i0:i], z.trim(i) +} + +// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:] +// and returns that value, as well as the trimmed cursor location afterwards. +func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) { + i0 := i +loop: + for ; i < z.p1; i++ { + switch z.buf[i] { + case ' ', '\n', '\t', '\f', '>': + break loop + case '&': + // TODO: unescape the entity. + } } return z.buf[i0:i], z.trim(i) } // attrName finds the largest attribute name at the start // of z.buf[i:] and returns it lower-cased, as well -// as the trimmed cursor location after that word. +// as the trimmed cursor location after that name. // // http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name // TODO: unicode characters func (z *Tokenizer) attrName(i int) ([]byte, int) { + for z.buf[i] == '/' { + i++ + if z.buf[i] == '>' { + return nil, z.trim(i) + } + } i0 := i loop: for ; i < z.p1; i++ { @@ -469,7 +483,7 @@ } if z.buf[i] == '/' { i++ } - name, z.p0 = z.word(i, true) + name, z.p0 = z.tagName(i) hasAttr = z.p0 != z.p1 return } @@ -496,7 +510,7 @@ return } closeQuote := z.buf[i] if closeQuote != '\'' && closeQuote != '"' { - val, z.p0 = z.word(i, false) + val, z.p0 = z.unquotedAttrVal(i) moreAttr = z.p0 != z.p1 return } diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index c8dcc88648def6a23c9ed825bc28b70bc4a8ee3d..1330f3247a15ecef2fc30c1a03f57572ee32ac68 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -41,6 +41,22 @@ "tags", "bd", "$b$$d$", }, + // Some malformed tags that are missing a '>'. + { + "malformed tag #0", + ``, + ``, + }, + { + "malformed tag #1", + `

`, + `

`, + }, + { + "malformed tag #2", + `

`, + }, // Comments. { "comment0", @@ -117,7 +133,6 @@ "entity with digits", "½", "½", }, - // Attribute tests: // http://dev.w3.org/html5/spec/Overview.html#attributes-0 {