Browse Source

idna: updated to Unicode 10.0.0

Generated from x/text.

Significant changes in the interpretation
of the Bidi rule as well as sharpening of
the leading dot rules, among other things.

Issue golang/go#21471

Change-Id: I8649a4090e2bc530aad4412210a3de344fb2eab6
Reviewed-on: https://go-review.googlesource.com/63951
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Marcel van Lohuizen 8 years ago
parent
commit
4b14673ba3
3 changed files with 1501 additions and 1426 deletions
  1. 73 36
      idna/idna.go
  2. 1417 1384
      idna/tables.go
  3. 11 6
      idna/trieval.go

+ 73 - 36
idna/idna.go

@@ -21,6 +21,7 @@ import (
 	"unicode/utf8"
 
 	"golang.org/x/text/secure/bidirule"
+	"golang.org/x/text/unicode/bidi"
 	"golang.org/x/text/unicode/norm"
 )
 
@@ -68,7 +69,7 @@ func VerifyDNSLength(verify bool) Option {
 }
 
 // RemoveLeadingDots removes leading label separators. Leading runes that map to
-// dots, such as U+3002, are removed as well.
+// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well.
 //
 // This is the behavior suggested by the UTS #46 and is adopted by some
 // browsers.
@@ -92,7 +93,7 @@ func ValidateLabels(enable bool) Option {
 	}
 }
 
-// StrictDomainName limits the set of permissable ASCII characters to those
+// StrictDomainName limits the set of permissible ASCII characters to those
 // allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the
 // hyphen). This is set by default for MapForLookup and ValidateForRegistration.
 //
@@ -142,7 +143,6 @@ func MapForLookup() Option {
 		o.mapping = validateAndMap
 		StrictDomainName(true)(o)
 		ValidateLabels(true)(o)
-		RemoveLeadingDots(true)(o)
 	}
 }
 
@@ -160,7 +160,7 @@ type options struct {
 
 	// mapping implements a validation and mapping step as defined in RFC 5895
 	// or UTS 46, tailored to, for example, domain registration or lookup.
-	mapping func(p *Profile, s string) (string, error)
+	mapping func(p *Profile, s string) (mapped string, isBidi bool, err error)
 
 	// bidirule, if specified, checks whether s conforms to the Bidi Rule
 	// defined in RFC 5893.
@@ -251,23 +251,21 @@ var (
 
 	punycode = &Profile{}
 	lookup   = &Profile{options{
-		transitional:      true,
-		useSTD3Rules:      true,
-		validateLabels:    true,
-		removeLeadingDots: true,
-		trie:              trie,
-		fromPuny:          validateFromPunycode,
-		mapping:           validateAndMap,
-		bidirule:          bidirule.ValidString,
+		transitional:   true,
+		useSTD3Rules:   true,
+		validateLabels: true,
+		trie:           trie,
+		fromPuny:       validateFromPunycode,
+		mapping:        validateAndMap,
+		bidirule:       bidirule.ValidString,
 	}}
 	display = &Profile{options{
-		useSTD3Rules:      true,
-		validateLabels:    true,
-		removeLeadingDots: true,
-		trie:              trie,
-		fromPuny:          validateFromPunycode,
-		mapping:           validateAndMap,
-		bidirule:          bidirule.ValidString,
+		useSTD3Rules:   true,
+		validateLabels: true,
+		trie:           trie,
+		fromPuny:       validateFromPunycode,
+		mapping:        validateAndMap,
+		bidirule:       bidirule.ValidString,
 	}}
 	registration = &Profile{options{
 		useSTD3Rules:    true,
@@ -302,14 +300,16 @@ func (e runeError) Error() string {
 // see http://www.unicode.org/reports/tr46.
 func (p *Profile) process(s string, toASCII bool) (string, error) {
 	var err error
+	var isBidi bool
 	if p.mapping != nil {
-		s, err = p.mapping(p, s)
+		s, isBidi, err = p.mapping(p, s)
 	}
 	// Remove leading empty labels.
 	if p.removeLeadingDots {
 		for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
 		}
 	}
+	// TODO: allow for a quick check of the tables data.
 	// It seems like we should only create this error on ToASCII, but the
 	// UTS 46 conformance tests suggests we should always check this.
 	if err == nil && p.verifyDNSLength && s == "" {
@@ -335,6 +335,7 @@ func (p *Profile) process(s string, toASCII bool) (string, error) {
 				// Spec says keep the old label.
 				continue
 			}
+			isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight
 			labels.set(u)
 			if err == nil && p.validateLabels {
 				err = p.fromPuny(p, u)
@@ -349,6 +350,14 @@ func (p *Profile) process(s string, toASCII bool) (string, error) {
 			err = p.validateLabel(label)
 		}
 	}
+	if isBidi && p.bidirule != nil && err == nil {
+		for labels.reset(); !labels.done(); labels.next() {
+			if !p.bidirule(labels.label()) {
+				err = &labelError{s, "B"}
+				break
+			}
+		}
+	}
 	if toASCII {
 		for labels.reset(); !labels.done(); labels.next() {
 			label := labels.label()
@@ -380,16 +389,23 @@ func (p *Profile) process(s string, toASCII bool) (string, error) {
 	return s, err
 }
 
-func normalize(p *Profile, s string) (string, error) {
-	return norm.NFC.String(s), nil
+func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) {
+	// TODO: consider first doing a quick check to see if any of these checks
+	// need to be done. This will make it slower in the general case, but
+	// faster in the common case.
+	mapped = norm.NFC.String(s)
+	isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft
+	return mapped, isBidi, nil
 }
 
-func validateRegistration(p *Profile, s string) (string, error) {
+func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) {
+	// TODO: filter need for normalization in loop below.
 	if !norm.NFC.IsNormalString(s) {
-		return s, &labelError{s, "V1"}
+		return s, false, &labelError{s, "V1"}
 	}
 	for i := 0; i < len(s); {
 		v, sz := trie.lookupString(s[i:])
+		bidi = bidi || info(v).isBidi(s[i:])
 		// Copy bytes not copied so far.
 		switch p.simplify(info(v).category()) {
 		// TODO: handle the NV8 defined in the Unicode idna data set to allow
@@ -397,21 +413,41 @@ func validateRegistration(p *Profile, s string) (string, error) {
 		case valid, deviation:
 		case disallowed, mapped, unknown, ignored:
 			r, _ := utf8.DecodeRuneInString(s[i:])
-			return s, runeError(r)
+			return s, bidi, runeError(r)
 		}
 		i += sz
 	}
-	return s, nil
+	return s, bidi, nil
 }
 
-func validateAndMap(p *Profile, s string) (string, error) {
+func (c info) isBidi(s string) bool {
+	if !c.isMapped() {
+		return c&attributesMask == rtl
+	}
+	// TODO: also store bidi info for mapped data. This is possible, but a bit
+	// cumbersome and not for the common case.
+	p, _ := bidi.LookupString(s)
+	switch p.Class() {
+	case bidi.R, bidi.AL, bidi.AN:
+		return true
+	}
+	return false
+}
+
+func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) {
 	var (
-		err error
-		b   []byte
-		k   int
+		b []byte
+		k int
 	)
+	// combinedInfoBits contains the or-ed bits of all runes. We use this
+	// to derive the mayNeedNorm bit later. This may trigger normalization
+	// overeagerly, but it will not do so in the common case. The end result
+	// is another 10% saving on BenchmarkProfile for the common case.
+	var combinedInfoBits info
 	for i := 0; i < len(s); {
 		v, sz := trie.lookupString(s[i:])
+		combinedInfoBits |= info(v)
+		bidi = bidi || info(v).isBidi(s[i:])
 		start := i
 		i += sz
 		// Copy bytes not copied so far.
@@ -438,7 +474,9 @@ func validateAndMap(p *Profile, s string) (string, error) {
 	}
 	if k == 0 {
 		// No changes so far.
-		s = norm.NFC.String(s)
+		if combinedInfoBits&mayNeedNorm != 0 {
+			s = norm.NFC.String(s)
+		}
 	} else {
 		b = append(b, s[k:]...)
 		if norm.NFC.QuickSpan(b) != len(b) {
@@ -447,7 +485,7 @@ func validateAndMap(p *Profile, s string) (string, error) {
 		// TODO: the punycode converters require strings as input.
 		s = string(b)
 	}
-	return s, err
+	return s, bidi, err
 }
 
 // A labelIter allows iterating over domain name labels.
@@ -542,6 +580,8 @@ func validateFromPunycode(p *Profile, s string) error {
 	if !norm.NFC.IsNormalString(s) {
 		return &labelError{s, "V1"}
 	}
+	// TODO: detect whether string may have to be normalized in the following
+	// loop.
 	for i := 0; i < len(s); {
 		v, sz := trie.lookupString(s[i:])
 		if c := p.simplify(info(v).category()); c != valid && c != deviation {
@@ -616,16 +656,13 @@ var joinStates = [][numJoinTypes]joinState{
 
 // validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are
 // already implicitly satisfied by the overall implementation.
-func (p *Profile) validateLabel(s string) error {
+func (p *Profile) validateLabel(s string) (err error) {
 	if s == "" {
 		if p.verifyDNSLength {
 			return &labelError{s, "A4"}
 		}
 		return nil
 	}
-	if p.bidirule != nil && !p.bidirule(s) {
-		return &labelError{s, "B"}
-	}
 	if !p.validateLabels {
 		return nil
 	}

File diff suppressed because it is too large
+ 1417 - 1384
idna/tables.go


+ 11 - 6
idna/trieval.go

@@ -26,9 +26,9 @@ package idna
 //       15..3  index into xor or mapping table
 //     }
 //   } else {
-//       15..13 unused
-//           12 modifier (including virama)
-//           11 virama modifier
+//       15..14 unused
+//       13     mayNeedNorm
+//       12..11 attributes
 //       10..8  joining type
 //        7..3  category type
 //   }
@@ -49,15 +49,20 @@ const (
 	joinShift = 8
 	joinMask  = 0x07
 
-	viramaModifier = 0x0800
+	// Attributes
+	attributesMask = 0x1800
+	viramaModifier = 0x1800
 	modifier       = 0x1000
+	rtl            = 0x0800
+
+	mayNeedNorm = 0x2000
 )
 
 // A category corresponds to a category defined in the IDNA mapping table.
 type category uint16
 
 const (
-	unknown              category = 0 // not defined currently in unicode.
+	unknown              category = 0 // not currently defined in unicode.
 	mapped               category = 1
 	disallowedSTD3Mapped category = 2
 	deviation            category = 3
@@ -110,5 +115,5 @@ func (c info) isModifier() bool {
 }
 
 func (c info) isViramaModifier() bool {
-	return c&(viramaModifier|catSmallMask) == viramaModifier
+	return c&(attributesMask|catSmallMask) == viramaModifier
 }

Some files were not shown because too many files changed in this diff