collate.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package cldr
  5. import (
  6. "bufio"
  7. "encoding/xml"
  8. "errors"
  9. "fmt"
  10. "strconv"
  11. "strings"
  12. "unicode"
  13. "unicode/utf8"
  14. )
// RuleProcessor can be passed to Collation's Process method, which
// parses the rules and calls the respective method for each rule found.
type RuleProcessor interface {
	// Reset is called for each reset rule. anchor is the reset position;
	// logical positions are passed formatted as "<name/>". before is the
	// level given by a "before" modifier, or 0 if there is none.
	Reset(anchor string, before int) error

	// Insert is called for each relation rule. level is 1 to 4 for the
	// relations '<' to '<<<<' and 5 for the identity relation. str is the
	// inserted string; context and extend are the optional context and
	// extension strings, empty when absent.
	Insert(level int, str, context, extend string) error

	// Index is called instead of Insert for a rule whose value starts with
	// the index marker U+FDD0. id is the value with the marker removed.
	Index(id string)
}
const (
	// cldrIndex is a Unicode-reserved sentinel value used to mark the start
	// of a grouping within an index.
	// A rule whose value starts with this rune is not inserted; the rest of
	// its value is reported through RuleProcessor.Index instead.
	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the fmt format in which to represent logical reset
	// positions, such as "first tertiary ignorable". Its single verb takes
	// the name of the position.
	specialAnchor = "<%s/>"
)
  32. // Process parses the rules for the tailorings of this collation
  33. // and calls the respective methods of p for each rule found.
  34. func (c Collation) Process(p RuleProcessor) (err error) {
  35. if len(c.Cr) > 0 {
  36. if len(c.Cr) > 1 {
  37. return fmt.Errorf("multiple cr elements, want 0 or 1")
  38. }
  39. return processRules(p, c.Cr[0].Data())
  40. }
  41. if c.Rules.Any != nil {
  42. return c.processXML(p)
  43. }
  44. return errors.New("no tailoring data")
  45. }
  46. // processRules parses rules in the Collation Rule Syntax defined in
  47. // https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
  48. func processRules(p RuleProcessor, s string) (err error) {
  49. chk := func(s string, e error) string {
  50. if err == nil {
  51. err = e
  52. }
  53. return s
  54. }
  55. i := 0 // Save the line number for use after the loop.
  56. scanner := bufio.NewScanner(strings.NewReader(s))
  57. for ; scanner.Scan() && err == nil; i++ {
  58. for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) {
  59. level := 5
  60. var ch byte
  61. switch ch, s = s[0], s[1:]; ch {
  62. case '&': // followed by <anchor> or '[' <key> ']'
  63. if s = skipSpace(s); consume(&s, '[') {
  64. s = chk(parseSpecialAnchor(p, s))
  65. } else {
  66. s = chk(parseAnchor(p, 0, s))
  67. }
  68. case '<': // sort relation '<'{1,4}, optionally followed by '*'.
  69. for level = 1; consume(&s, '<'); level++ {
  70. }
  71. if level > 4 {
  72. err = fmt.Errorf("level %d > 4", level)
  73. }
  74. fallthrough
  75. case '=': // identity relation, optionally followed by *.
  76. if consume(&s, '*') {
  77. s = chk(parseSequence(p, level, s))
  78. } else {
  79. s = chk(parseOrder(p, level, s))
  80. }
  81. default:
  82. chk("", fmt.Errorf("illegal operator %q", ch))
  83. break
  84. }
  85. }
  86. }
  87. if chk("", scanner.Err()); err != nil {
  88. return fmt.Errorf("%d: %v", i, err)
  89. }
  90. return nil
  91. }
  92. // parseSpecialAnchor parses the anchor syntax which is either of the form
  93. // ['before' <level>] <anchor>
  94. // or
  95. // [<label>]
  96. // The starting should already be consumed.
  97. func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
  98. i := strings.IndexByte(s, ']')
  99. if i == -1 {
  100. return "", errors.New("unmatched bracket")
  101. }
  102. a := strings.TrimSpace(s[:i])
  103. s = s[i+1:]
  104. if strings.HasPrefix(a, "before ") {
  105. l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3)
  106. if err != nil {
  107. return s, err
  108. }
  109. return parseAnchor(p, int(l), s)
  110. }
  111. return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0)
  112. }
  113. func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
  114. anchor, s, err := scanString(s)
  115. if err != nil {
  116. return s, err
  117. }
  118. return s, p.Reset(anchor, level)
  119. }
  120. func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
  121. var value, context, extend string
  122. if value, s, err = scanString(s); err != nil {
  123. return s, err
  124. }
  125. if strings.HasPrefix(value, cldrIndex) {
  126. p.Index(value[len(cldrIndex):])
  127. return
  128. }
  129. if consume(&s, '|') {
  130. if context, s, err = scanString(s); err != nil {
  131. return s, errors.New("missing string after context")
  132. }
  133. }
  134. if consume(&s, '/') {
  135. if extend, s, err = scanString(s); err != nil {
  136. return s, errors.New("missing string after extension")
  137. }
  138. }
  139. return s, p.Insert(level, value, context, extend)
  140. }
  141. // scanString scans a single input string.
  142. func scanString(s string) (str, tail string, err error) {
  143. if s = skipSpace(s); s == "" {
  144. return s, s, errors.New("missing string")
  145. }
  146. buf := [16]byte{} // small but enough to hold most cases.
  147. value := buf[:0]
  148. for s != "" {
  149. if consume(&s, '\'') {
  150. i := strings.IndexByte(s, '\'')
  151. if i == -1 {
  152. return "", "", errors.New(`unmatched single quote`)
  153. }
  154. if i == 0 {
  155. value = append(value, '\'')
  156. } else {
  157. value = append(value, s[:i]...)
  158. }
  159. s = s[i+1:]
  160. continue
  161. }
  162. r, sz := utf8.DecodeRuneInString(s)
  163. if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
  164. break
  165. }
  166. value = append(value, s[:sz]...)
  167. s = s[sz:]
  168. }
  169. return string(value), skipSpace(s), nil
  170. }
  171. func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
  172. if s = skipSpace(s); s == "" {
  173. return s, errors.New("empty sequence")
  174. }
  175. last := rune(0)
  176. for s != "" {
  177. r, sz := utf8.DecodeRuneInString(s)
  178. s = s[sz:]
  179. if r == '-' {
  180. // We have a range. The first element was already written.
  181. if last == 0 {
  182. return s, errors.New("range without starter value")
  183. }
  184. r, sz = utf8.DecodeRuneInString(s)
  185. s = s[sz:]
  186. if r == utf8.RuneError || r < last {
  187. return s, fmt.Errorf("invalid range %q-%q", last, r)
  188. }
  189. for i := last + 1; i <= r; i++ {
  190. if err := p.Insert(level, string(i), "", ""); err != nil {
  191. return s, err
  192. }
  193. }
  194. last = 0
  195. continue
  196. }
  197. if unicode.IsSpace(r) || unicode.IsPunct(r) {
  198. break
  199. }
  200. // normal case
  201. if err := p.Insert(level, string(r), "", ""); err != nil {
  202. return s, err
  203. }
  204. last = r
  205. }
  206. return s, nil
  207. }
  208. func skipSpace(s string) string {
  209. return strings.TrimLeftFunc(s, unicode.IsSpace)
  210. }
  211. // consumes returns whether the next byte is ch. If so, it gobbles it by
  212. // updating s.
  213. func consume(s *string, ch byte) (ok bool) {
  214. if *s == "" || (*s)[0] != ch {
  215. return false
  216. }
  217. *s = (*s)[1:]
  218. return true
  219. }
// The following code parses Collation rules of CLDR version 24 and before.

// lmap maps the first letter of an XML rule element name ("p", "s", "t",
// "i" and the variants "pc", "sc", "tc", "ic") to the level that is passed
// to RuleProcessor.Insert for that element.
var lmap = map[byte]int{
	'p': 1,
	's': 2,
	't': 3,
	'i': 5,
}
// rulesElem describes an XML fragment containing a <rules> element.
// Rules.Any collects every child element of <rules>, whatever its tag,
// together with the tag name in XMLName.
// NOTE(review): rulesElem is not referenced in this file; presumably it is
// decoded by code elsewhere in the package — confirm.
type rulesElem struct {
	Rules struct {
		Common
		Any []*struct {
			XMLName xml.Name
			rule
		} `xml:",any"`
	} `xml:"rules"`
}
// rule is the content of a single rule element of the XML rule format.
type rule struct {
	// Value is the character data of the element. The value method rewrites
	// it in place.
	Value string `xml:",chardata"`
	// Before is the "before" attribute; processXML reads it on reset
	// elements to determine the reset level.
	Before string `xml:"before,attr"`
	// Any holds the child elements of this element, whatever their tag,
	// together with the tag name in XMLName.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
  244. var emptyValueError = errors.New("cldr: empty rule value")
  245. func (r *rule) value() (string, error) {
  246. // Convert hexadecimal Unicode codepoint notation to a string.
  247. s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
  248. r.Value = s
  249. if s == "" {
  250. if len(r.Any) != 1 {
  251. return "", emptyValueError
  252. }
  253. r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
  254. r.Any = nil
  255. } else if len(r.Any) != 0 {
  256. return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
  257. }
  258. return r.Value, nil
  259. }
  260. func (r rule) process(p RuleProcessor, name, context, extend string) error {
  261. v, err := r.value()
  262. if err != nil {
  263. return err
  264. }
  265. switch name {
  266. case "p", "s", "t", "i":
  267. if strings.HasPrefix(v, cldrIndex) {
  268. p.Index(v[len(cldrIndex):])
  269. return nil
  270. }
  271. if err := p.Insert(lmap[name[0]], v, context, extend); err != nil {
  272. return err
  273. }
  274. case "pc", "sc", "tc", "ic":
  275. level := lmap[name[0]]
  276. for _, s := range v {
  277. if err := p.Insert(level, string(s), context, extend); err != nil {
  278. return err
  279. }
  280. }
  281. default:
  282. return fmt.Errorf("cldr: unsupported tag: %q", name)
  283. }
  284. return nil
  285. }
  286. // processXML parses the format of CLDR versions 24 and older.
  287. func (c Collation) processXML(p RuleProcessor) (err error) {
  288. // Collation is generated and defined in xml.go.
  289. var v string
  290. for _, r := range c.Rules.Any {
  291. switch r.XMLName.Local {
  292. case "reset":
  293. level := 0
  294. switch r.Before {
  295. case "primary", "1":
  296. level = 1
  297. case "secondary", "2":
  298. level = 2
  299. case "tertiary", "3":
  300. level = 3
  301. case "":
  302. default:
  303. return fmt.Errorf("cldr: unknown level %q", r.Before)
  304. }
  305. v, err = r.value()
  306. if err == nil {
  307. err = p.Reset(v, level)
  308. }
  309. case "x":
  310. var context, extend string
  311. for _, r1 := range r.Any {
  312. v, err = r1.value()
  313. switch r1.XMLName.Local {
  314. case "context":
  315. context = v
  316. case "extend":
  317. extend = v
  318. }
  319. }
  320. for _, r1 := range r.Any {
  321. if t := r1.XMLName.Local; t == "context" || t == "extend" {
  322. continue
  323. }
  324. r1.rule.process(p, r1.XMLName.Local, context, extend)
  325. }
  326. default:
  327. err = r.rule.process(p, r.XMLName.Local, "", "")
  328. }
  329. if err != nil {
  330. return err
  331. }
  332. }
  333. return nil
  334. }