12 سال پیش · 74213743f3
--- a/html/charset/charset.go
+++ b/html/charset/charset.go
@@ -5,9 +5,14 @@
 
															 package charset
														
 
															 import (
														
 
															+	"bytes"
														
 
															+	"mime"
														
 
															 	"strings"
														
 
															+	"unicode/utf8"
														
 
															+	"code.google.com/p/go.net/html"
														
 
															 	"code.google.com/p/go.text/encoding"
														
 
															+	"code.google.com/p/go.text/encoding/charmap"
														
 
															 )
														
 
															 // Lookup returns the encoding with the specified label, and its canonical
														
@@ -19,3 +24,181 @@ func Lookup(label string) (e encoding.Encoding, name string) {
 
															 	enc := encodings[label]
														
 
															 	return enc.e, enc.name
														
 
															 }
														
 
															+
														
 
															+// DetermineEncoding determines the encoding of an HTML document by examining
														
 
															+// up to the first 1024 bytes of content and the declared Content-Type.
														
 
															+//
														
 
															+// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
														
 
															+func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
														
 
															+	if len(content) > 1024 {
														
 
															+		content = content[:1024]
														
 
															+	}
														
 
															+
														
 
															+	for _, b := range boms {
														
 
															+		if bytes.HasPrefix(content, b.bom) {
														
 
															+			e, name = Lookup(b.enc)
														
 
															+			return e, name, true
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	if _, params, err := mime.ParseMediaType(contentType); err == nil {
														
 
															+		if cs, ok := params["charset"]; ok {
														
 
															+			if e, name = Lookup(cs); e != nil {
														
 
															+				return e, name, true
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	if len(content) > 0 {
														
 
															+		e, name = prescan(content)
														
 
															+		if e != nil {
														
 
															+			return e, name, false
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	// Try to detect UTF-8.
														
 
															+	// First eliminate any partial rune at the end.
														
 
															+	for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
														
 
															+		b := content[i]
														
 
															+		if b < 0x80 {
														
 
															+			break
														
 
															+		}
														
 
															+		if utf8.RuneStart(b) {
														
 
															+			content = content[:i]
														
 
															+			break
														
 
															+		}
														
 
															+	}
														
 
															+	hasHighBit := false
														
 
															+	for _, c := range content {
														
 
															+		if c >= 0x80 {
														
 
															+			hasHighBit = true
														
 
															+			break
														
 
															+		}
														
 
															+	}
														
 
															+	if hasHighBit && utf8.Valid(content) {
														
 
															+		return encoding.Nop, "utf-8", false
														
 
															+	}
														
 
															+
														
 
															+	// TODO: change default depending on user's locale?
														
 
															+	return charmap.Windows1252, "windows-1252", false
														
 
															+}
														
 
															+
														
 
															+func prescan(content []byte) (e encoding.Encoding, name string) {
														
 
															+	z := html.NewTokenizer(bytes.NewReader(content))
														
 
															+	for {
														
 
															+		switch z.Next() {
														
 
															+		case html.ErrorToken:
														
 
															+			return nil, ""
														
 
															+
														
 
															+		case html.StartTagToken, html.SelfClosingTagToken:
														
 
															+			tagName, hasAttr := z.TagName()
														
 
															+			if !bytes.Equal(tagName, []byte("meta")) {
														
 
															+				continue
														
 
															+			}
														
 
															+			attrList := make(map[string]bool)
														
 
															+			gotPragma := false
														
 
															+
														
 
															+			const (
														
 
															+				dontKnow = iota
														
 
															+				doNeedPragma
														
 
															+				doNotNeedPragma
														
 
															+			)
														
 
															+			needPragma := dontKnow
														
 
															+
														
 
															+			name = ""
														
 
															+			e = nil
														
 
															+			for hasAttr {
														
 
															+				var key, val []byte
														
 
															+				key, val, hasAttr = z.TagAttr()
														
 
															+				ks := string(key)
														
 
															+				if attrList[ks] {
														
 
															+					continue
														
 
															+				}
														
 
															+				attrList[ks] = true
														
 
															+				for i, c := range val {
														
 
															+					if 'A' <= c && c <= 'Z' {
														
 
															+						val[i] = c + 0x20
														
 
															+					}
														
 
															+				}
														
 
															+
														
 
															+				switch ks {
														
 
															+				case "http-equiv":
														
 
															+					if bytes.Equal(val, []byte("content-type")) {
														
 
															+						gotPragma = true
														
 
															+					}
														
 
															+
														
 
															+				case "content":
														
 
															+					if e == nil {
														
 
															+						name = fromMetaElement(string(val))
														
 
															+						if name != "" {
														
 
															+							e, name = Lookup(name)
														
 
															+							if e != nil {
														
 
															+								needPragma = doNeedPragma
														
 
															+							}
														
 
															+						}
														
 
															+					}
														
 
															+
														
 
															+				case "charset":
														
 
															+					e, name = Lookup(string(val))
														
 
															+					needPragma = doNotNeedPragma
														
 
															+				}
														
 
															+			}
														
 
															+
														
 
															+			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
														
 
															+				continue
														
 
															+			}
														
 
															+
														
 
															+			if strings.HasPrefix(name, "utf-16") {
														
 
															+				name = "utf-8"
														
 
															+				e = encoding.Nop
														
 
															+			}
														
 
															+
														
 
															+			if e != nil {
														
 
															+				return e, name
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+func fromMetaElement(s string) string {
														
 
															+	for s != "" {
														
 
															+		csLoc := strings.Index(s, "charset")
														
 
															+		if csLoc == -1 {
														
 
															+			return ""
														
 
															+		}
														
 
															+		s = s[csLoc+len("charset"):]
														
 
															+		s = strings.TrimLeft(s, " \t\n\f\r")
														
 
															+		if !strings.HasPrefix(s, "=") {
														
 
															+			continue
														
 
															+		}
														
 
															+		s = s[1:]
														
 
															+		s = strings.TrimLeft(s, " \t\n\f\r")
														
 
															+		if s == "" {
														
 
															+			return ""
														
 
															+		}
														
 
															+		if q := s[0]; q == '"' || q == '\'' {
														
 
															+			s = s[1:]
														
 
															+			closeQuote := strings.IndexRune(s, rune(q))
														
 
															+			if closeQuote == -1 {
														
 
															+				return ""
														
 
															+			}
														
 
															+			return s[:closeQuote]
														
 
															+		}
														
 
															+
														
 
															+		end := strings.IndexAny(s, "; \t\n\f\r")
														
 
															+		if end == -1 {
														
 
															+			end = len(s)
														
 
															+		}
														
 
															+		return s[:end]
														
 
															+	}
														
 
															+	return ""
														
 
															+}
														
 
															+
														
 
															+var boms = []struct {
														
 
															+	bom []byte
														
 
															+	enc string
														
 
															+}{
														
 
															+	{[]byte{0xfe, 0xff}, "utf-16be"},
														
 
															+	{[]byte{0xff, 0xfe}, "utf-16le"},
														
 
															+	{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
														
 
															+}
														
--- a/html/charset/charset_test.go
+++ b/html/charset/charset_test.go
@@ -110,3 +110,56 @@ func TestNames(t *testing.T) {
 
															 		}
														
 
															 	}
														
 
															 }
														
 
															+
														
 
															+var sniffTestCases = []struct {
														
 
															+	filename, declared, want string
														
 
															+}{
														
 
															+	{"HTTP-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
														
 
															+	{"UTF-16LE-BOM.html", "", "utf-16le"},
														
 
															+	{"UTF-16BE-BOM.html", "", "utf-16be"},
														
 
															+	{"meta-content-attribute.html", "text/html", "iso-8859-15"},
														
 
															+	{"meta-charset-attribute.html", "text/html", "iso-8859-15"},
														
 
															+	{"No-encoding-declaration.html", "text/html", "utf-8"},
														
 
															+	{"HTTP-vs-UTF-8-BOM.html", "text/html; charset=iso-8859-15", "utf-8"},
														
 
															+	{"HTTP-vs-meta-content.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
														
 
															+	{"HTTP-vs-meta-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
														
 
															+	{"UTF-8-BOM-vs-meta-content.html", "text/html", "utf-8"},
														
 
															+	{"UTF-8-BOM-vs-meta-charset.html", "text/html", "utf-8"},
														
 
															+}
														
 
															+
														
 
															+func TestSniff(t *testing.T) {
														
 
															+	for _, tc := range sniffTestCases {
														
 
															+		content, err := ioutil.ReadFile("testdata/" + tc.filename)
														
 
															+		if err != nil {
														
 
															+			t.Errorf("%s: error reading file: %v", tc.filename, err)
														
 
															+			continue
														
 
															+		}
														
 
															+
														
 
															+		_, name, _ := DetermineEncoding(content, tc.declared)
														
 
															+		if name != tc.want {
														
 
															+			t.Errorf("%s: got %q, want %q", tc.filename, name, tc.want)
														
 
															+			continue
														
 
															+		}
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+var metaTestCases = []struct {
														
 
															+	meta, want string
														
 
															+}{
														
 
															+	{"", ""},
														
 
															+	{"text/html", ""},
														
 
															+	{"text/html; charset utf-8", ""},
														
 
															+	{"text/html; charset=latin-2", "latin-2"},
														
 
															+	{"text/html; charset; charset = utf-8", "utf-8"},
														
 
															+	{`charset="big5"`, "big5"},
														
 
															+	{"charset='shift_jis'", "shift_jis"},
														
 
															+}
														
 
															+
														
 
															+func TestFromMeta(t *testing.T) {
														
 
															+	for _, tc := range metaTestCases {
														
 
															+		got := fromMetaElement(tc.meta)
														
 
															+		if got != tc.want {
														
 
															+			t.Errorf("%q: got %q, want %q", tc.meta, got, tc.want)
														
 
															+		}
														
 
															+	}
														
 
															+}
														
--- a/html/charset/testdata/HTTP-charset.html
+++ b/html/charset/testdata/HTTP-charset.html
@@ -0,0 +1,48 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+  <title>HTTP charset</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="The character encoding of a page can be set using the HTTP header charset declaration.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>HTTP charset</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">The character encoding of a page can be set using the HTTP header charset declaration.</p>
														
 
															+<div class="notes"><p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p><p>The only character encoding declaration for this HTML file is in the HTTP header, which sets the encoding to ISO 8859-15.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-003">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-001<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-001" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/HTTP-vs-UTF-8-BOM.html
+++ b/html/charset/testdata/HTTP-vs-UTF-8-BOM.html
@@ -0,0 +1,48 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+  <title>HTTP vs UTF-8 BOM</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>HTTP vs UTF-8 BOM</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.</p>
														
 
															+<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p><p>If the test is unsuccessful, the characters &#x00EF;&#x00BB;&#x00BF; should appear at the top of the page.  These represent the bytes that make up the UTF-8 signature when encountered in the ISO 8859-15 encoding.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-022">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-034<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-034" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/HTTP-vs-meta-charset.html
+++ b/html/charset/testdata/HTTP-vs-meta-charset.html
@@ -0,0 +1,49 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+ <meta charset="iso-8859-1" > <title>HTTP vs meta charset</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }.test div { width: 90px; }
														
 
															+</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>HTTP vs meta charset</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.</p>
														
 
															+<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-1.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-037">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-018<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-018" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/HTTP-vs-meta-content.html
+++ b/html/charset/testdata/HTTP-vs-meta-content.html
@@ -0,0 +1,49 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" > <title>HTTP vs meta content</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }.test div { width: 90px; }
														
 
															+</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>HTTP vs meta content</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.</p>
														
 
															+<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-1.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-018">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-016<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-016" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/No-encoding-declaration.html
+++ b/html/charset/testdata/No-encoding-declaration.html
@@ -0,0 +1,47 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+  <title>No encoding declaration</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>No encoding declaration</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.</p>
														
 
															+<div class="notes"><p><p>The test on this page contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-034">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-015<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-015" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/README
+++ b/html/charset/testdata/README
@@ -0,0 +1 @@
 
															+These test cases come from http://www.w3.org/International/tests/html5/the-input-byte-stream/results-basics
														
--- a/html/charset/testdata/UTF-16BE-BOM.html
+++ b/html/charset/testdata/UTF-16BE-BOM.html
--- a/html/charset/testdata/UTF-16LE-BOM.html
+++ b/html/charset/testdata/UTF-16LE-BOM.html
--- a/html/charset/testdata/UTF-8-BOM-vs-meta-charset.html
+++ b/html/charset/testdata/UTF-8-BOM-vs-meta-charset.html
@@ -0,0 +1,49 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+ <meta charset="iso-8859-15"> <title>UTF-8 BOM vs meta charset</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }.test div { width: 90px; }
														
 
															+</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>UTF-8 BOM vs meta charset</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.</p>
														
 
															+<div class="notes"><p><p>The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-024">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-038<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-038" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/UTF-8-BOM-vs-meta-content.html
+++ b/html/charset/testdata/UTF-8-BOM-vs-meta-content.html
@@ -0,0 +1,48 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+ <meta http-equiv="content-type" content="text/html; charset=iso-8859-15"> <title>UTF-8 BOM vs meta content</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>UTF-8 BOM vs meta content</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.</p>
														
 
															+<div class="notes"><p><p>The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-038">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-037<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-037" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/meta-charset-attribute.html
+++ b/html/charset/testdata/meta-charset-attribute.html
@@ -0,0 +1,48 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+ <meta charset="iso-8859-15"> <title>meta charset attribute</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="The character encoding of the page can be set by a meta element with charset attribute.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>meta charset attribute</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">The character encoding of the page can be set by a meta element with charset attribute.</p>
														
 
															+<div class="notes"><p><p>The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-015">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-009<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-009" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
														
--- a/html/charset/testdata/meta-content-attribute.html
+++ b/html/charset/testdata/meta-content-attribute.html
@@ -0,0 +1,48 @@
 
															+<!DOCTYPE html>
														
 
															+<html  lang="en" >
														
 
															+<head>
														
 
															+ <meta http-equiv="content-type" content="text/html; charset=iso-8859-15"> <title>meta content attribute</title>
														
 
															+<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
														
 
															+<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
														
 
															+<link rel="stylesheet" type="text/css" href="./generatedtests.css">
														
 
															+<script src="http://w3c-test.org/resources/testharness.js"></script>
														
 
															+<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
														
 
															+<meta name='flags' content='http'>
														
 
															+<meta name="assert" content="The character encoding of the page can be set by a meta element with http-equiv and content attributes.">
														
 
															+<style type='text/css'>
														
 
															+.test div { width: 50px; }</style>
														
 
															+<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
														
 
															+</head>
														
 
															+<body>
														
 
															+<p class='title'>meta content attribute</p>
														
 
															+
														
 
															+
														
 
															+<div id='log'></div>
														
 
															+
														
 
															+
														
 
															+<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+<div class='description'>
														
 
															+<p class="assertion" title="Assertion">The character encoding of the page can be set by a meta element with http-equiv and content attributes.</p>
														
 
															+<div class="notes"><p><p>The only character encoding declaration for this HTML file is in the content attribute of the meta element, which declares the encoding to be ISO 8859-15.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
														
 
															+</div>
														
 
															+</div>
														
 
															+<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-009">Next test</a></div><div class="doctype">HTML5</div>
														
 
															+<p class="jump">the-input-byte-stream-007<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-007" target="_blank">Detailed results for this test</a><br/>	<a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
														
 
															+<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
														
 
															+				<li>The test is read from a server that supports HTTP.</li></ul></div>
														
 
															+</div>
														
 
															+<script>
														
 
															+test(function() {
														
 
															+assert_equals(document.getElementById('box').offsetWidth, 100);
														
 
															+}, " ");
														
 
															+</script>
														
 
															+
														
 
															+</body>
														
 
															+</html>
														
 
															+
														
 
															+
	`@@ -0,0 +1 @@`
			`+These test cases come from http://www.w3.org/International/tests/html5/the-input-byte-stream/results-basics`