search.go 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search
  5. // Package search provides language-specific search and string matching.
  6. //
  7. // Natural language matching can be intricate. For example, Danish will insist
  8. // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
  9. // the lack of a dot) in a case-insensitive match. This package handles such
  10. // language-specific details.
  11. //
  12. // Text passed to any of the calls in this message does not need to be
  13. // normalized.
  14. package search // import "golang.org/x/text/search"
  15. import (
  16. "strings"
  17. "golang.org/x/text/internal/colltab"
  18. "golang.org/x/text/language"
  19. )
  20. // An Option configures a Matcher.
  21. type Option func(*Matcher)
  22. var (
  23. // WholeWord restricts matches to complete words. The default is to match at
  24. // the character level.
  25. WholeWord Option = nil
  26. // Exact requires that two strings are their exact equivalent. For example
  27. // å would not match aa in Danish. It overrides any of the ignore options.
  28. Exact Option = nil
  29. // Loose causes case, diacritics and width to be ignored.
  30. Loose Option = loose
  31. // IgnoreCase enables case-insensitive search.
  32. IgnoreCase Option = ignoreCase
  33. // IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
  34. IgnoreDiacritics Option = ignoreDiacritics
  35. // IgnoreWidth equates narrow with wide variants.
  36. IgnoreWidth Option = ignoreWidth
  37. )
  38. func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
  39. func ignoreCase(m *Matcher) { m.ignoreCase = true }
  40. func ignoreWidth(m *Matcher) { m.ignoreWidth = true }
  41. func loose(m *Matcher) {
  42. ignoreDiacritics(m)
  43. ignoreCase(m)
  44. ignoreWidth(m)
  45. }
  46. var (
  47. // Supported lists the languages for which search differs from its parent.
  48. Supported language.Coverage
  49. tags []language.Tag
  50. )
  51. func init() {
  52. ids := strings.Split(availableLocales, ",")
  53. tags = make([]language.Tag, len(ids))
  54. for i, s := range ids {
  55. tags[i] = language.Raw.MustParse(s)
  56. }
  57. Supported = language.NewCoverage(tags)
  58. }
  59. // New returns a new Matcher for the given language and options.
  60. func New(t language.Tag, opts ...Option) *Matcher {
  61. m := &Matcher{
  62. w: getTable(locales[colltab.MatchLang(t, tags)]),
  63. }
  64. for _, f := range opts {
  65. f(m)
  66. }
  67. return m
  68. }
  69. // A Matcher implements language-specific string matching.
  70. type Matcher struct {
  71. w colltab.Weighter
  72. ignoreCase bool
  73. ignoreWidth bool
  74. ignoreDiacritics bool
  75. }
  76. // An IndexOption specifies how the Index methods of Pattern or Matcher should
  77. // match the input.
  78. type IndexOption byte
  79. const (
  80. // Anchor restricts the search to the start (or end for Backwards) of the
  81. // text.
  82. Anchor IndexOption = 1 << iota
  83. // Backwards starts the search from the end of the text.
  84. Backwards
  85. anchorBackwards = Anchor | Backwards
  86. )
  87. // Index reports the start and end position of the first occurrence of pat in b
  88. // or -1, -1 if pat is not present.
  89. func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
  90. // TODO: implement optimized version that does not use a pattern.
  91. return m.Compile(pat).Index(b, opts...)
  92. }
  93. // IndexString reports the start and end position of the first occurrence of pat
  94. // in s or -1, -1 if pat is not present.
  95. func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
  96. // TODO: implement optimized version that does not use a pattern.
  97. return m.CompileString(pat).IndexString(s, opts...)
  98. }
  99. // Equal reports whether a and b are equivalent.
  100. func (m *Matcher) Equal(a, b []byte) bool {
  101. _, end := m.Index(a, b, Anchor)
  102. return end == len(a)
  103. }
  104. // EqualString reports whether a and b are equivalent.
  105. func (m *Matcher) EqualString(a, b string) bool {
  106. _, end := m.IndexString(a, b, Anchor)
  107. return end == len(a)
  108. }
  109. // Compile compiles and returns a pattern that can be used for faster searching.
  110. func (m *Matcher) Compile(b []byte) *Pattern {
  111. p := &Pattern{m: m}
  112. iter := colltab.Iter{Weighter: m.w}
  113. for iter.SetInput(b); iter.Next(); {
  114. }
  115. p.ce = iter.Elems
  116. p.deleteEmptyElements()
  117. return p
  118. }
  119. // CompileString compiles and returns a pattern that can be used for faster
  120. // searching.
  121. func (m *Matcher) CompileString(s string) *Pattern {
  122. p := &Pattern{m: m}
  123. iter := colltab.Iter{Weighter: m.w}
  124. for iter.SetInputString(s); iter.Next(); {
  125. }
  126. p.ce = iter.Elems
  127. p.deleteEmptyElements()
  128. return p
  129. }
  130. // A Pattern is a compiled search string. It is safe for concurrent use.
  131. type Pattern struct {
  132. m *Matcher
  133. ce []colltab.Elem
  134. }
  135. // Design note (TODO remove):
  136. // The cost of retrieving collation elements for each rune, which is used for
  137. // search as well, is not trivial. Also, algorithms like Boyer-Moore and
  138. // Sunday require some additional precomputing.
  139. // Index reports the start and end position of the first occurrence of p in b
  140. // or -1, -1 if p is not present.
  141. func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
  142. // Pick a large enough buffer such that we likely do not need to allocate
  143. // and small enough to not cause too much overhead initializing.
  144. var buf [8]colltab.Elem
  145. it := &colltab.Iter{
  146. Weighter: p.m.w,
  147. Elems: buf[:0],
  148. }
  149. it.SetInput(b)
  150. var optMask IndexOption
  151. for _, o := range opts {
  152. optMask |= o
  153. }
  154. switch optMask {
  155. case 0:
  156. return p.forwardSearch(it)
  157. case Anchor:
  158. return p.anchoredForwardSearch(it)
  159. case Backwards, anchorBackwards:
  160. panic("TODO: implement")
  161. default:
  162. panic("unrecognized option")
  163. }
  164. }
  165. // IndexString reports the start and end position of the first occurrence of p
  166. // in s or -1, -1 if p is not present.
  167. func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
  168. // Pick a large enough buffer such that we likely do not need to allocate
  169. // and small enough to not cause too much overhead initializing.
  170. var buf [8]colltab.Elem
  171. it := &colltab.Iter{
  172. Weighter: p.m.w,
  173. Elems: buf[:0],
  174. }
  175. it.SetInputString(s)
  176. var optMask IndexOption
  177. for _, o := range opts {
  178. optMask |= o
  179. }
  180. switch optMask {
  181. case 0:
  182. return p.forwardSearch(it)
  183. case Anchor:
  184. return p.anchoredForwardSearch(it)
  185. case Backwards, anchorBackwards:
  186. panic("TODO: implement")
  187. default:
  188. panic("unrecognized option")
  189. }
  190. }
  191. // TODO:
  192. // - Maybe IndexAll methods (probably not necessary).
  193. // - Some way to match patterns in a Reader (a bit tricky).
  194. // - Some fold transformer that folds text to comparable text, based on the
  195. // search options. This is a common technique, though very different from the
  196. // collation-based design of this package. It has a somewhat different use
  197. // case, so probably makes sense to support both. Should probably be in a
  198. // different package, though, as it uses completely different kind of tables
  199. // (based on norm, cases, width and range tables.)