Ver Fonte

go.net/html/charset: implement the encoding sniffing algorithm

R=nigeltao
CC=golang-dev
https://golang.org/cl/31220043
Andrew Balholm há 12 anos atrás
pai
commit
74213743f3

+ 183 - 0
html/charset/charset.go

@@ -5,9 +5,14 @@
 package charset
 package charset
 
 
 import (
 import (
+	"bytes"
+	"mime"
 	"strings"
 	"strings"
+	"unicode/utf8"
 
 
+	"code.google.com/p/go.net/html"
 	"code.google.com/p/go.text/encoding"
 	"code.google.com/p/go.text/encoding"
+	"code.google.com/p/go.text/encoding/charmap"
 )
 )
 
 
 // Lookup returns the encoding with the specified label, and its canonical
 // Lookup returns the encoding with the specified label, and its canonical
@@ -19,3 +24,181 @@ func Lookup(label string) (e encoding.Encoding, name string) {
 	enc := encodings[label]
 	enc := encodings[label]
 	return enc.e, enc.name
 	return enc.e, enc.name
 }
 }
+
+// DetermineEncoding determines the encoding of an HTML document by examining
+// up to the first 1024 bytes of content and the declared Content-Type.
+//
+// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
+func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
+	if len(content) > 1024 {
+		content = content[:1024]
+	}
+
+	for _, b := range boms {
+		if bytes.HasPrefix(content, b.bom) {
+			e, name = Lookup(b.enc)
+			return e, name, true
+		}
+	}
+
+	if _, params, err := mime.ParseMediaType(contentType); err == nil {
+		if cs, ok := params["charset"]; ok {
+			if e, name = Lookup(cs); e != nil {
+				return e, name, true
+			}
+		}
+	}
+
+	if len(content) > 0 {
+		e, name = prescan(content)
+		if e != nil {
+			return e, name, false
+		}
+	}
+
+	// Try to detect UTF-8.
+	// First eliminate any partial rune at the end.
+	for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
+		b := content[i]
+		if b < 0x80 {
+			break
+		}
+		if utf8.RuneStart(b) {
+			content = content[:i]
+			break
+		}
+	}
+	hasHighBit := false
+	for _, c := range content {
+		if c >= 0x80 {
+			hasHighBit = true
+			break
+		}
+	}
+	if hasHighBit && utf8.Valid(content) {
+		return encoding.Nop, "utf-8", false
+	}
+
+	// TODO: change default depending on user's locale?
+	return charmap.Windows1252, "windows-1252", false
+}
+
+func prescan(content []byte) (e encoding.Encoding, name string) {
+	z := html.NewTokenizer(bytes.NewReader(content))
+	for {
+		switch z.Next() {
+		case html.ErrorToken:
+			return nil, ""
+
+		case html.StartTagToken, html.SelfClosingTagToken:
+			tagName, hasAttr := z.TagName()
+			if !bytes.Equal(tagName, []byte("meta")) {
+				continue
+			}
+			attrList := make(map[string]bool)
+			gotPragma := false
+
+			const (
+				dontKnow = iota
+				doNeedPragma
+				doNotNeedPragma
+			)
+			needPragma := dontKnow
+
+			name = ""
+			e = nil
+			for hasAttr {
+				var key, val []byte
+				key, val, hasAttr = z.TagAttr()
+				ks := string(key)
+				if attrList[ks] {
+					continue
+				}
+				attrList[ks] = true
+				for i, c := range val {
+					if 'A' <= c && c <= 'Z' {
+						val[i] = c + 0x20
+					}
+				}
+
+				switch ks {
+				case "http-equiv":
+					if bytes.Equal(val, []byte("content-type")) {
+						gotPragma = true
+					}
+
+				case "content":
+					if e == nil {
+						name = fromMetaElement(string(val))
+						if name != "" {
+							e, name = Lookup(name)
+							if e != nil {
+								needPragma = doNeedPragma
+							}
+						}
+					}
+
+				case "charset":
+					e, name = Lookup(string(val))
+					needPragma = doNotNeedPragma
+				}
+			}
+
+			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
+				continue
+			}
+
+			if strings.HasPrefix(name, "utf-16") {
+				name = "utf-8"
+				e = encoding.Nop
+			}
+
+			if e != nil {
+				return e, name
+			}
+		}
+	}
+}
+
+func fromMetaElement(s string) string {
+	for s != "" {
+		csLoc := strings.Index(s, "charset")
+		if csLoc == -1 {
+			return ""
+		}
+		s = s[csLoc+len("charset"):]
+		s = strings.TrimLeft(s, " \t\n\f\r")
+		if !strings.HasPrefix(s, "=") {
+			continue
+		}
+		s = s[1:]
+		s = strings.TrimLeft(s, " \t\n\f\r")
+		if s == "" {
+			return ""
+		}
+		if q := s[0]; q == '"' || q == '\'' {
+			s = s[1:]
+			closeQuote := strings.IndexRune(s, rune(q))
+			if closeQuote == -1 {
+				return ""
+			}
+			return s[:closeQuote]
+		}
+
+		end := strings.IndexAny(s, "; \t\n\f\r")
+		if end == -1 {
+			end = len(s)
+		}
+		return s[:end]
+	}
+	return ""
+}
+
+var boms = []struct {
+	bom []byte
+	enc string
+}{
+	{[]byte{0xfe, 0xff}, "utf-16be"},
+	{[]byte{0xff, 0xfe}, "utf-16le"},
+	{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
+}

+ 53 - 0
html/charset/charset_test.go

@@ -110,3 +110,56 @@ func TestNames(t *testing.T) {
 		}
 		}
 	}
 	}
 }
 }
+
+var sniffTestCases = []struct {
+	filename, declared, want string
+}{
+	{"HTTP-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
+	{"UTF-16LE-BOM.html", "", "utf-16le"},
+	{"UTF-16BE-BOM.html", "", "utf-16be"},
+	{"meta-content-attribute.html", "text/html", "iso-8859-15"},
+	{"meta-charset-attribute.html", "text/html", "iso-8859-15"},
+	{"No-encoding-declaration.html", "text/html", "utf-8"},
+	{"HTTP-vs-UTF-8-BOM.html", "text/html; charset=iso-8859-15", "utf-8"},
+	{"HTTP-vs-meta-content.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
+	{"HTTP-vs-meta-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
+	{"UTF-8-BOM-vs-meta-content.html", "text/html", "utf-8"},
+	{"UTF-8-BOM-vs-meta-charset.html", "text/html", "utf-8"},
+}
+
+func TestSniff(t *testing.T) {
+	for _, tc := range sniffTestCases {
+		content, err := ioutil.ReadFile("testdata/" + tc.filename)
+		if err != nil {
+			t.Errorf("%s: error reading file: %v", tc.filename, err)
+			continue
+		}
+
+		_, name, _ := DetermineEncoding(content, tc.declared)
+		if name != tc.want {
+			t.Errorf("%s: got %q, want %q", tc.filename, name, tc.want)
+			continue
+		}
+	}
+}
+
+var metaTestCases = []struct {
+	meta, want string
+}{
+	{"", ""},
+	{"text/html", ""},
+	{"text/html; charset utf-8", ""},
+	{"text/html; charset=latin-2", "latin-2"},
+	{"text/html; charset; charset = utf-8", "utf-8"},
+	{`charset="big5"`, "big5"},
+	{"charset='shift_jis'", "shift_jis"},
+}
+
+func TestFromMeta(t *testing.T) {
+	for _, tc := range metaTestCases {
+		got := fromMetaElement(tc.meta)
+		if got != tc.want {
+			t.Errorf("%q: got %q, want %q", tc.meta, got, tc.want)
+		}
+	}
+}

+ 48 - 0
html/charset/testdata/HTTP-charset.html

@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+  <title>HTTP charset</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="The character encoding of a page can be set using the HTTP header charset declaration.">
+<style type='text/css'>
+.test div { width: 50px; }</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
+</head>
+<body>
+<p class='title'>HTTP charset</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">The character encoding of a page can be set using the HTTP header charset declaration.</p>
+<div class="notes"><p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p><p>The only character encoding declaration for this HTML file is in the HTTP header, which sets the encoding to ISO 8859-15.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-003">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-001<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-001" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 48 - 0
html/charset/testdata/HTTP-vs-UTF-8-BOM.html

@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+  <title>HTTP vs UTF-8 BOM</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.">
+<style type='text/css'>
+.test div { width: 50px; }</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
+</head>
+<body>
+<p class='title'>HTTP vs UTF-8 BOM</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.</p>
+<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p><p>If the test is unsuccessful, the characters &#x00EF;&#x00BB;&#x00BF; should appear at the top of the page.  These represent the bytes that make up the UTF-8 signature when encountered in the ISO 8859-15 encoding.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-022">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-034<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-034" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 49 - 0
html/charset/testdata/HTTP-vs-meta-charset.html

@@ -0,0 +1,49 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+ <meta charset="iso-8859-1" > <title>HTTP vs meta charset</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.">
+<style type='text/css'>
+.test div { width: 50px; }.test div { width: 90px; }
+</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
+</head>
+<body>
+<p class='title'>HTTP vs meta charset</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.</p>
+<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-1.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-037">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-018<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-018" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 49 - 0
html/charset/testdata/HTTP-vs-meta-content.html

@@ -0,0 +1,49 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" > <title>HTTP vs meta content</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.">
+<style type='text/css'>
+.test div { width: 50px; }.test div { width: 90px; }
+</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
+</head>
+<body>
+<p class='title'>HTTP vs meta content</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.</p>
+<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-1.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-018">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-016<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-016" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 47 - 0
html/charset/testdata/No-encoding-declaration.html

@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+  <title>No encoding declaration</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.">
+<style type='text/css'>
+.test div { width: 50px; }</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
+</head>
+<body>
+<p class='title'>No encoding declaration</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.</p>
+<div class="notes"><p><p>The test on this page contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-034">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-015<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-015" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 1 - 0
html/charset/testdata/README

@@ -0,0 +1 @@
+These test cases come from http://www.w3.org/International/tests/html5/the-input-byte-stream/results-basics

BIN
html/charset/testdata/UTF-16BE-BOM.html


BIN
html/charset/testdata/UTF-16LE-BOM.html


+ 49 - 0
html/charset/testdata/UTF-8-BOM-vs-meta-charset.html

@@ -0,0 +1,49 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+ <meta charset="iso-8859-15"> <title>UTF-8 BOM vs meta charset</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.">
+<style type='text/css'>
+.test div { width: 50px; }.test div { width: 90px; }
+</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
+</head>
+<body>
+<p class='title'>UTF-8 BOM vs meta charset</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.</p>
+<div class="notes"><p><p>The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-024">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-038<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-038" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 48 - 0
html/charset/testdata/UTF-8-BOM-vs-meta-content.html

@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+ <meta http-equiv="content-type" content="text/html; charset=iso-8859-15"> <title>UTF-8 BOM vs meta content</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.">
+<style type='text/css'>
+.test div { width: 50px; }</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
+</head>
+<body>
+<p class='title'>UTF-8 BOM vs meta content</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.</p>
+<div class="notes"><p><p>The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-038">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-037<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-037" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 48 - 0
html/charset/testdata/meta-charset-attribute.html

@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+ <meta charset="iso-8859-15"> <title>meta charset attribute</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="The character encoding of the page can be set by a meta element with charset attribute.">
+<style type='text/css'>
+.test div { width: 50px; }</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
+</head>
+<body>
+<p class='title'>meta charset attribute</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">The character encoding of the page can be set by a meta element with charset attribute.</p>
+<div class="notes"><p><p>The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-015">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-009<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-009" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+

+ 48 - 0
html/charset/testdata/meta-content-attribute.html

@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html  lang="en" >
+<head>
+ <meta http-equiv="content-type" content="text/html; charset=iso-8859-15"> <title>meta content attribute</title>
+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
+<script src="http://w3c-test.org/resources/testharness.js"></script>
+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
+<meta name='flags' content='http'>
+<meta name="assert" content="The character encoding of the page can be set by a meta element with http-equiv and content attributes.">
+<style type='text/css'>
+.test div { width: 50px; }</style>
+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
+</head>
+<body>
+<p class='title'>meta content attribute</p>
+
+
+<div id='log'></div>
+
+
+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
+
+
+
+
+
+<div class='description'>
+<p class="assertion" title="Assertion">The character encoding of the page can be set by a meta element with http-equiv and content attributes.</p>
+<div class="notes"><p><p>The only character encoding declaration for this HTML file is in the content attribute of the meta element, which declares the encoding to be ISO 8859-15.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
+</div>
+</div>
+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-009">Next test</a></div><div class="doctype">HTML5</div>
+<p class="jump">the-input-byte-stream-007<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-007" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
+				<li>The test is read from a server that supports HTTP.</li></ul></div>
+</div>
+<script>
+test(function() {
+assert_equals(document.getElementById('box').offsetWidth, 100);
+}, " ");
+</script>
+
+</body>
+</html>
+
+