123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- // Copyright 2014 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package collate
- import (
- "sort"
- "golang.org/x/text/internal/colltab"
- "golang.org/x/text/language"
- "golang.org/x/text/unicode/norm"
- )
- // newCollator creates a new collator with default options configured.
- func newCollator(t colltab.Weighter) *Collator {
- // Initialize a collator with default options.
- c := &Collator{
- options: options{
- ignore: [colltab.NumLevels]bool{
- colltab.Quaternary: true,
- colltab.Identity: true,
- },
- f: norm.NFD,
- t: t,
- },
- }
- // TODO: store vt in tags or remove.
- c.variableTop = t.Top()
- return c
- }
- // An Option is used to change the behavior of a Collator. Options override the
- // settings passed through the locale identifier.
- type Option struct {
- priority int
- f func(o *options)
- }
- type prioritizedOptions []Option
- func (p prioritizedOptions) Len() int {
- return len(p)
- }
- func (p prioritizedOptions) Swap(i, j int) {
- p[i], p[j] = p[j], p[i]
- }
- func (p prioritizedOptions) Less(i, j int) bool {
- return p[i].priority < p[j].priority
- }
- type options struct {
- // ignore specifies which levels to ignore.
- ignore [colltab.NumLevels]bool
- // caseLevel is true if there is an additional level of case matching
- // between the secondary and tertiary levels.
- caseLevel bool
- // backwards specifies the order of sorting at the secondary level.
- // This option exists predominantly to support reverse sorting of accents in French.
- backwards bool
- // numeric specifies whether any sequence of decimal digits (category is Nd)
- // is sorted at a primary level with its numeric value.
- // For example, "A-21" < "A-123".
- // This option is set by wrapping the main Weighter with NewNumericWeighter.
- numeric bool
- // alternate specifies an alternative handling of variables.
- alternate alternateHandling
- // variableTop is the largest primary value that is considered to be
- // variable.
- variableTop uint32
- t colltab.Weighter
- f norm.Form
- }
- func (o *options) setOptions(opts []Option) {
- sort.Sort(prioritizedOptions(opts))
- for _, x := range opts {
- x.f(o)
- }
- }
- // OptionsFromTag extracts the BCP47 collation options from the tag and
- // configures a collator accordingly. These options are set before any other
- // option.
- func OptionsFromTag(t language.Tag) Option {
- return Option{0, func(o *options) {
- o.setFromTag(t)
- }}
- }
- func (o *options) setFromTag(t language.Tag) {
- o.caseLevel = ldmlBool(t, o.caseLevel, "kc")
- o.backwards = ldmlBool(t, o.backwards, "kb")
- o.numeric = ldmlBool(t, o.numeric, "kn")
- // Extract settings from the BCP47 u extension.
- switch t.TypeForKey("ks") { // strength
- case "level1":
- o.ignore[colltab.Secondary] = true
- o.ignore[colltab.Tertiary] = true
- case "level2":
- o.ignore[colltab.Tertiary] = true
- case "level3", "":
- // The default.
- case "level4":
- o.ignore[colltab.Quaternary] = false
- case "identic":
- o.ignore[colltab.Quaternary] = false
- o.ignore[colltab.Identity] = false
- }
- switch t.TypeForKey("ka") {
- case "shifted":
- o.alternate = altShifted
- // The following two types are not official BCP47, but we support them to
- // give access to this otherwise hidden functionality. The name blanked is
- // derived from the LDML name blanked and posix reflects the main use of
- // the shift-trimmed option.
- case "blanked":
- o.alternate = altBlanked
- case "posix":
- o.alternate = altShiftTrimmed
- }
- // TODO: caseFirst ("kf"), reorder ("kr"), and maybe variableTop ("vt").
- // Not used:
- // - normalization ("kk", not necessary for this implementation)
- // - hiraganaQuatenary ("kh", obsolete)
- }
- func ldmlBool(t language.Tag, old bool, key string) bool {
- switch t.TypeForKey(key) {
- case "true":
- return true
- case "false":
- return false
- default:
- return old
- }
- }
- var (
- // IgnoreCase sets case-insensitive comparison.
- IgnoreCase Option = ignoreCase
- ignoreCase = Option{3, ignoreCaseF}
- // IgnoreDiacritics causes diacritical marks to be ignored. ("o" == "ö").
- IgnoreDiacritics Option = ignoreDiacritics
- ignoreDiacritics = Option{3, ignoreDiacriticsF}
- // IgnoreWidth causes full-width characters to match their half-width
- // equivalents.
- IgnoreWidth Option = ignoreWidth
- ignoreWidth = Option{2, ignoreWidthF}
- // Loose sets the collator to ignore diacritics, case and width.
- Loose Option = loose
- loose = Option{4, looseF}
- // Force ordering if strings are equivalent but not equal.
- Force Option = force
- force = Option{5, forceF}
- // Numeric specifies that numbers should sort numerically ("2" < "12").
- Numeric Option = numeric
- numeric = Option{5, numericF}
- )
- func ignoreWidthF(o *options) {
- o.ignore[colltab.Tertiary] = true
- o.caseLevel = true
- }
- func ignoreDiacriticsF(o *options) {
- o.ignore[colltab.Secondary] = true
- }
- func ignoreCaseF(o *options) {
- o.ignore[colltab.Tertiary] = true
- o.caseLevel = false
- }
- func looseF(o *options) {
- ignoreWidthF(o)
- ignoreDiacriticsF(o)
- ignoreCaseF(o)
- }
- func forceF(o *options) {
- o.ignore[colltab.Identity] = false
- }
- func numericF(o *options) { o.numeric = true }
- // Reorder overrides the pre-defined ordering of scripts and character sets.
- func Reorder(s ...string) Option {
- // TODO: need fractional weights to implement this.
- panic("TODO: implement")
- }
// TODO: consider making these public again. These options cannot be fully
// specified in BCP47, so an API interface seems warranted. Still, a
// higher-level interface would be nice (e.g. a POSIX option for enabling
// altShiftTrimmed).
// alternateHandling enumerates the ways in which variables can be handled.
// A variable is a rune whose primary weight is lower than the variable top.
// See https://www.unicode.org/reports/tr10/#Variable_Weighting for details.
type alternateHandling int
- const (
- // altNonIgnorable turns off special handling of variables.
- altNonIgnorable alternateHandling = iota
- // altBlanked sets variables and all subsequent primary ignorables to be
- // ignorable at all levels. This is identical to removing all variables
- // and subsequent primary ignorables from the input.
- altBlanked
- // altShifted sets variables to be ignorable for levels one through three and
- // adds a fourth level based on the values of the ignored levels.
- altShifted
- // altShiftTrimmed is a slight variant of altShifted that is used to
- // emulate POSIX.
- altShiftTrimmed
- )
|