parse.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "errors"
  7. "strconv"
  8. "strings"
  9. "golang.org/x/text/internal/language"
  10. )
  11. // ValueError is returned by any of the parsing functions when the
  12. // input is well-formed but the respective subtag is not recognized
  13. // as a valid value.
  14. type ValueError interface {
  15. error
  16. // Subtag returns the subtag for which the error occurred.
  17. Subtag() string
  18. }
  19. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  20. // failed it returns an error and any part of the tag that could be parsed.
  21. // If parsing succeeded but an unknown value was found, it returns
  22. // ValueError. The Tag returned in this case is just stripped of the unknown
  23. // value. All other values are preserved. It accepts tags in the BCP 47 format
  24. // and extensions to this standard defined in
  25. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  26. // The resulting tag is canonicalized using the default canonicalization type.
  27. func Parse(s string) (t Tag, err error) {
  28. return Default.Parse(s)
  29. }
  30. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  31. // failed it returns an error and any part of the tag that could be parsed.
  32. // If parsing succeeded but an unknown value was found, it returns
  33. // ValueError. The Tag returned in this case is just stripped of the unknown
  34. // value. All other values are preserved. It accepts tags in the BCP 47 format
  35. // and extensions to this standard defined in
  36. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  37. // The resulting tag is canonicalized using the canonicalization type c.
  38. func (c CanonType) Parse(s string) (t Tag, err error) {
  39. tt, err := language.Parse(s)
  40. if err != nil {
  41. return makeTag(tt), err
  42. }
  43. tt, changed := canonicalize(c, tt)
  44. if changed {
  45. tt.RemakeString()
  46. }
  47. return makeTag(tt), err
  48. }
  49. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  50. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  51. // Base, Script or Region or slice of type Variant or Extension is passed more
  52. // than once, the latter will overwrite the former. Variants and Extensions are
  53. // accumulated, but if two extensions of the same type are passed, the latter
  54. // will replace the former. For -u extensions, though, the key-type pairs are
  55. // added, where later values overwrite older ones. A Tag overwrites all former
  56. // values and typically only makes sense as the first argument. The resulting
  57. // tag is returned after canonicalizing using the Default CanonType. If one or
  58. // more errors are encountered, one of the errors is returned.
  59. func Compose(part ...interface{}) (t Tag, err error) {
  60. return Default.Compose(part...)
  61. }
  62. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  63. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  64. // Base, Script or Region or slice of type Variant or Extension is passed more
  65. // than once, the latter will overwrite the former. Variants and Extensions are
  66. // accumulated, but if two extensions of the same type are passed, the latter
  67. // will replace the former. For -u extensions, though, the key-type pairs are
  68. // added, where later values overwrite older ones. A Tag overwrites all former
  69. // values and typically only makes sense as the first argument. The resulting
  70. // tag is returned after canonicalizing using CanonType c. If one or more errors
  71. // are encountered, one of the errors is returned.
  72. func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
  73. var b language.Builder
  74. if err = update(&b, part...); err != nil {
  75. return und, err
  76. }
  77. b.Tag, _ = canonicalize(c, b.Tag)
  78. return makeTag(b.Make()), err
  79. }
  80. var errInvalidArgument = errors.New("invalid Extension or Variant")
  81. func update(b *language.Builder, part ...interface{}) (err error) {
  82. for _, x := range part {
  83. switch v := x.(type) {
  84. case Tag:
  85. b.SetTag(v.tag())
  86. case Base:
  87. b.Tag.LangID = v.langID
  88. case Script:
  89. b.Tag.ScriptID = v.scriptID
  90. case Region:
  91. b.Tag.RegionID = v.regionID
  92. case Variant:
  93. if v.variant == "" {
  94. err = errInvalidArgument
  95. break
  96. }
  97. b.AddVariant(v.variant)
  98. case Extension:
  99. if v.s == "" {
  100. err = errInvalidArgument
  101. break
  102. }
  103. b.SetExt(v.s)
  104. case []Variant:
  105. b.ClearVariants()
  106. for _, v := range v {
  107. b.AddVariant(v.variant)
  108. }
  109. case []Extension:
  110. b.ClearExtensions()
  111. for _, e := range v {
  112. b.SetExt(e.s)
  113. }
  114. // TODO: support parsing of raw strings based on morphology or just extensions?
  115. case error:
  116. if v != nil {
  117. err = v
  118. }
  119. }
  120. }
  121. return
  122. }
  123. var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
  124. // ParseAcceptLanguage parses the contents of an Accept-Language header as
  125. // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
  126. // a list of corresponding quality weights. It is more permissive than RFC 2616
  127. // and may return non-nil slices even if the input is not valid.
  128. // The Tags will be sorted by highest weight first and then by first occurrence.
  129. // Tags with a weight of zero will be dropped. An error will be returned if the
  130. // input could not be parsed.
  131. func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
  132. var entry string
  133. for s != "" {
  134. if entry, s = split(s, ','); entry == "" {
  135. continue
  136. }
  137. entry, weight := split(entry, ';')
  138. // Scan the language.
  139. t, err := Parse(entry)
  140. if err != nil {
  141. id, ok := acceptFallback[entry]
  142. if !ok {
  143. return nil, nil, err
  144. }
  145. t = makeTag(language.Tag{LangID: id})
  146. }
  147. // Scan the optional weight.
  148. w := 1.0
  149. if weight != "" {
  150. weight = consume(weight, 'q')
  151. weight = consume(weight, '=')
  152. // consume returns the empty string when a token could not be
  153. // consumed, resulting in an error for ParseFloat.
  154. if w, err = strconv.ParseFloat(weight, 32); err != nil {
  155. return nil, nil, errInvalidWeight
  156. }
  157. // Drop tags with a quality weight of 0.
  158. if w <= 0 {
  159. continue
  160. }
  161. }
  162. tag = append(tag, t)
  163. q = append(q, float32(w))
  164. }
  165. sortStable(&tagSort{tag, q})
  166. return tag, q, nil
  167. }
  168. // consume removes a leading token c from s and returns the result or the empty
  169. // string if there is no such token.
  170. func consume(s string, c byte) string {
  171. if s == "" || s[0] != c {
  172. return ""
  173. }
  174. return strings.TrimSpace(s[1:])
  175. }
  176. func split(s string, c byte) (head, tail string) {
  177. if i := strings.IndexByte(s, c); i >= 0 {
  178. return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
  179. }
  180. return strings.TrimSpace(s), ""
  181. }
  182. // Add hack mapping to deal with a small number of cases that occur
  183. // in Accept-Language (with reasonable frequency).
  184. var acceptFallback = map[string]language.Language{
  185. "english": _en,
  186. "deutsch": _de,
  187. "italian": _it,
  188. "french": _fr,
  189. "*": _mul, // defined in the spec to match all languages.
  190. }
  191. type tagSort struct {
  192. tag []Tag
  193. q []float32
  194. }
  195. func (s *tagSort) Len() int {
  196. return len(s.q)
  197. }
  198. func (s *tagSort) Less(i, j int) bool {
  199. return s.q[i] > s.q[j]
  200. }
  201. func (s *tagSort) Swap(i, j int) {
  202. s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
  203. s.q[i], s.q[j] = s.q[j], s.q[i]
  204. }