// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
- package cldr
- import (
- "bufio"
- "encoding/xml"
- "errors"
- "fmt"
- "strconv"
- "strings"
- "unicode"
- "unicode/utf8"
- )
// RuleProcessor receives the collation rules found by Collation's Process
// method. Process parses the tailoring rules and invokes the matching method
// below for every rule it encounters.
type RuleProcessor interface {
	// Reset is called for a reset rule. anchor is the reset position and
	// before is the "before" level, or 0 if none was given.
	Reset(anchor string, before int) error

	// Insert is called for an ordering rule. level is the level of the
	// relation, str the tailored string, and context and extend the
	// optional context and extension strings (empty if absent).
	Insert(level int, str, context, extend string) error

	// Index is called for entries whose value starts with the CLDR index
	// sentinel; id is the value with that sentinel removed.
	Index(id string)
}
const (
	// cldrIndex is a Unicode-reserved sentinel value that marks the start
	// of a grouping within an index. A rule whose value starts with this
	// rune is not inserted as an ordering rule; the remainder of its value
	// is reported through RuleProcessor.Index instead.
	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the fmt format used to represent logical reset
	// positions, such as "first tertiary ignorable", as an anchor string.
	specialAnchor = "<%s/>"
)
- // Process parses the rules for the tailorings of this collation
- // and calls the respective methods of p for each rule found.
- func (c Collation) Process(p RuleProcessor) (err error) {
- if len(c.Cr) > 0 {
- if len(c.Cr) > 1 {
- return fmt.Errorf("multiple cr elements, want 0 or 1")
- }
- return processRules(p, c.Cr[0].Data())
- }
- if c.Rules.Any != nil {
- return c.processXML(p)
- }
- return errors.New("no tailoring data")
- }
- // processRules parses rules in the Collation Rule Syntax defined in
- // https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
- func processRules(p RuleProcessor, s string) (err error) {
- chk := func(s string, e error) string {
- if err == nil {
- err = e
- }
- return s
- }
- i := 0 // Save the line number for use after the loop.
- scanner := bufio.NewScanner(strings.NewReader(s))
- for ; scanner.Scan() && err == nil; i++ {
- for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) {
- level := 5
- var ch byte
- switch ch, s = s[0], s[1:]; ch {
- case '&': // followed by <anchor> or '[' <key> ']'
- if s = skipSpace(s); consume(&s, '[') {
- s = chk(parseSpecialAnchor(p, s))
- } else {
- s = chk(parseAnchor(p, 0, s))
- }
- case '<': // sort relation '<'{1,4}, optionally followed by '*'.
- for level = 1; consume(&s, '<'); level++ {
- }
- if level > 4 {
- err = fmt.Errorf("level %d > 4", level)
- }
- fallthrough
- case '=': // identity relation, optionally followed by *.
- if consume(&s, '*') {
- s = chk(parseSequence(p, level, s))
- } else {
- s = chk(parseOrder(p, level, s))
- }
- default:
- chk("", fmt.Errorf("illegal operator %q", ch))
- break
- }
- }
- }
- if chk("", scanner.Err()); err != nil {
- return fmt.Errorf("%d: %v", i, err)
- }
- return nil
- }
- // parseSpecialAnchor parses the anchor syntax which is either of the form
- // ['before' <level>] <anchor>
- // or
- // [<label>]
- // The starting should already be consumed.
- func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
- i := strings.IndexByte(s, ']')
- if i == -1 {
- return "", errors.New("unmatched bracket")
- }
- a := strings.TrimSpace(s[:i])
- s = s[i+1:]
- if strings.HasPrefix(a, "before ") {
- l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3)
- if err != nil {
- return s, err
- }
- return parseAnchor(p, int(l), s)
- }
- return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0)
- }
- func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
- anchor, s, err := scanString(s)
- if err != nil {
- return s, err
- }
- return s, p.Reset(anchor, level)
- }
- func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
- var value, context, extend string
- if value, s, err = scanString(s); err != nil {
- return s, err
- }
- if strings.HasPrefix(value, cldrIndex) {
- p.Index(value[len(cldrIndex):])
- return
- }
- if consume(&s, '|') {
- if context, s, err = scanString(s); err != nil {
- return s, errors.New("missing string after context")
- }
- }
- if consume(&s, '/') {
- if extend, s, err = scanString(s); err != nil {
- return s, errors.New("missing string after extension")
- }
- }
- return s, p.Insert(level, value, context, extend)
- }
- // scanString scans a single input string.
- func scanString(s string) (str, tail string, err error) {
- if s = skipSpace(s); s == "" {
- return s, s, errors.New("missing string")
- }
- buf := [16]byte{} // small but enough to hold most cases.
- value := buf[:0]
- for s != "" {
- if consume(&s, '\'') {
- i := strings.IndexByte(s, '\'')
- if i == -1 {
- return "", "", errors.New(`unmatched single quote`)
- }
- if i == 0 {
- value = append(value, '\'')
- } else {
- value = append(value, s[:i]...)
- }
- s = s[i+1:]
- continue
- }
- r, sz := utf8.DecodeRuneInString(s)
- if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
- break
- }
- value = append(value, s[:sz]...)
- s = s[sz:]
- }
- return string(value), skipSpace(s), nil
- }
- func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
- if s = skipSpace(s); s == "" {
- return s, errors.New("empty sequence")
- }
- last := rune(0)
- for s != "" {
- r, sz := utf8.DecodeRuneInString(s)
- s = s[sz:]
- if r == '-' {
- // We have a range. The first element was already written.
- if last == 0 {
- return s, errors.New("range without starter value")
- }
- r, sz = utf8.DecodeRuneInString(s)
- s = s[sz:]
- if r == utf8.RuneError || r < last {
- return s, fmt.Errorf("invalid range %q-%q", last, r)
- }
- for i := last + 1; i <= r; i++ {
- if err := p.Insert(level, string(i), "", ""); err != nil {
- return s, err
- }
- }
- last = 0
- continue
- }
- if unicode.IsSpace(r) || unicode.IsPunct(r) {
- break
- }
- // normal case
- if err := p.Insert(level, string(r), "", ""); err != nil {
- return s, err
- }
- last = r
- }
- return s, nil
- }
// skipSpace returns s without its leading white space, as defined by
// unicode.IsSpace. It returns "" if s consists of white space only.
func skipSpace(s string) string {
	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }
	if i := strings.IndexFunc(s, notSpace); i >= 0 {
		return s[i:]
	}
	return ""
}
// consume reports whether the first byte of *s is ch. If it is, consume
// gobbles that byte by updating *s to the remainder of the string; otherwise
// *s is left unchanged.
func consume(s *string, ch byte) (ok bool) {
	if str := *s; len(str) > 0 && str[0] == ch {
		*s = str[1:]
		return true
	}
	return false
}
// The following code parses Collation rules of CLDR version 24 and before.

// lmap maps the first letter of a legacy rule element name ("p", "s", "t",
// "i" and their "c"-suffixed variants) to the level that is passed to
// RuleProcessor.Insert.
var lmap = map[byte]int{
	'p': 1,
	's': 2,
	't': 3,
	'i': 5,
}
- type rulesElem struct {
- Rules struct {
- Common
- Any []*struct {
- XMLName xml.Name
- rule
- } `xml:",any"`
- } `xml:"rules"`
- }
// rule is the content of a single element of a legacy collation rule.
type rule struct {
	// Value is the character data of the element.
	Value string `xml:",chardata"`

	// Before is the value of the element's "before" attribute.
	Before string `xml:"before,attr"`

	// Any captures nested child elements, each with its element name in
	// XMLName and its own content in the embedded rule.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
- var emptyValueError = errors.New("cldr: empty rule value")
- func (r *rule) value() (string, error) {
- // Convert hexadecimal Unicode codepoint notation to a string.
- s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
- r.Value = s
- if s == "" {
- if len(r.Any) != 1 {
- return "", emptyValueError
- }
- r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
- r.Any = nil
- } else if len(r.Any) != 0 {
- return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
- }
- return r.Value, nil
- }
- func (r rule) process(p RuleProcessor, name, context, extend string) error {
- v, err := r.value()
- if err != nil {
- return err
- }
- switch name {
- case "p", "s", "t", "i":
- if strings.HasPrefix(v, cldrIndex) {
- p.Index(v[len(cldrIndex):])
- return nil
- }
- if err := p.Insert(lmap[name[0]], v, context, extend); err != nil {
- return err
- }
- case "pc", "sc", "tc", "ic":
- level := lmap[name[0]]
- for _, s := range v {
- if err := p.Insert(level, string(s), context, extend); err != nil {
- return err
- }
- }
- default:
- return fmt.Errorf("cldr: unsupported tag: %q", name)
- }
- return nil
- }
- // processXML parses the format of CLDR versions 24 and older.
- func (c Collation) processXML(p RuleProcessor) (err error) {
- // Collation is generated and defined in xml.go.
- var v string
- for _, r := range c.Rules.Any {
- switch r.XMLName.Local {
- case "reset":
- level := 0
- switch r.Before {
- case "primary", "1":
- level = 1
- case "secondary", "2":
- level = 2
- case "tertiary", "3":
- level = 3
- case "":
- default:
- return fmt.Errorf("cldr: unknown level %q", r.Before)
- }
- v, err = r.value()
- if err == nil {
- err = p.Reset(v, level)
- }
- case "x":
- var context, extend string
- for _, r1 := range r.Any {
- v, err = r1.value()
- switch r1.XMLName.Local {
- case "context":
- context = v
- case "extend":
- extend = v
- }
- }
- for _, r1 := range r.Any {
- if t := r1.XMLName.Local; t == "context" || t == "extend" {
- continue
- }
- r1.rule.process(p, r1.XMLName.Local, context, extend)
- }
- default:
- err = r.rule.process(p, r.XMLName.Local, "", "")
- }
- if err != nil {
- return err
- }
- }
- return nil
- }
|