فهرست منبع

go.net/html/charset: add NewReader

NewReader is a convenience function for finding the encoding of
an io.Reader and making a UTF-8 version of that Reader.

R=nigeltao
CC=golang-dev
https://golang.org/cl/43510043
Andrew Balholm 12 سال پیش
والد
کامیت
3f04d1ffd7
2 فایل تغییر یافته به همراه 58 افزوده شده و 0 حذف شده
  1. 23 0
      html/charset/charset.go
  2. 35 0
      html/charset/charset_test.go

+ 23 - 0
html/charset/charset.go

@@ -6,6 +6,7 @@ package charset
 
 import (
 	"bytes"
+	"io"
 	"mime"
 	"strings"
 	"unicode/utf8"
@@ -13,6 +14,7 @@ import (
 	"code.google.com/p/go.net/html"
 	"code.google.com/p/go.text/encoding"
 	"code.google.com/p/go.text/encoding/charmap"
+	"code.google.com/p/go.text/transform"
 )
 
 // Lookup returns the encoding with the specified label, and its canonical
@@ -83,6 +85,27 @@ func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding,
 	return charmap.Windows1252, "windows-1252", false
 }
 
+// NewReader returns an io.Reader that converts the content of r to UTF-8.
+// It calls DetermineEncoding on a preview of at most the first 1024 bytes of r, together with contentType, to find out what r's encoding is.
+func NewReader(r io.Reader, contentType string) (io.Reader, error) {
+	preview := make([]byte, 1024) // sniffing buffer: only this prefix of r is inspected
+	n, err := io.ReadFull(r, preview)
+	switch {
+	case err == io.ErrUnexpectedEOF: // r ended before the buffer was full
+		preview = preview[:n]
+		r = bytes.NewReader(preview) // r is exhausted, so the preview is the whole content
+	case err != nil: // NOTE(review): per io.ReadFull, an empty r yields io.EOF and lands here — confirm that is intended
+		return nil, err
+	default:
+		r = io.MultiReader(bytes.NewReader(preview), r) // replay the consumed preview ahead of the unread remainder
+	}
+
+	if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop { // for encoding.Nop, r is returned without a decoding wrapper
+		r = transform.NewReader(r, e.NewDecoder())
+	}
+	return r, nil
+}
+
 func prescan(content []byte) (e encoding.Encoding, name string) {
 	z := html.NewTokenizer(bytes.NewReader(content))
 	for {

+ 35 - 0
html/charset/charset_test.go

@@ -1,6 +1,7 @@
 package charset
 
 import (
+	"bytes"
 	"io/ioutil"
 	"strings"
 	"testing"
@@ -143,6 +144,40 @@ func TestSniff(t *testing.T) {
 	}
 }
 
+func TestReader(t *testing.T) { // checks NewReader's output for every entry of sniffTestCases
+	for _, tc := range sniffTestCases {
+		content, err := ioutil.ReadFile("testdata/" + tc.filename)
+		if err != nil {
+			t.Errorf("%s: error reading file: %v", tc.filename, err)
+			continue
+		}
+
+		r, err := NewReader(bytes.NewReader(content), tc.declared) // tc.declared is passed as the contentType argument
+		if err != nil {
+			t.Errorf("%s: error creating reader: %v", tc.filename, err)
+			continue
+		}
+
+		got, err := ioutil.ReadAll(r)
+		if err != nil {
+			t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err)
+			continue
+		}
+
+		e, _ := Lookup(tc.want) // NOTE(review): second result is discarded and e is used unchecked — presumably tc.want is always a known label; confirm
+		want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder())) // reference output: decode the same bytes with the expected encoding
+		if err != nil {
+			t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err)
+			continue
+		}
+
+		if !bytes.Equal(got, want) {
+			t.Errorf("%s: got %q, want %q", tc.filename, got, want)
+			continue // last statement of the loop body, so this has no additional effect
+		}
+	}
+}
+
 var metaTestCases = []struct {
 	meta, want string
 }{