contract.go 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package build
  5. import (
  6. "fmt"
  7. "io"
  8. "reflect"
  9. "sort"
  10. "strings"
  11. "golang.org/x/text/internal/colltab"
  12. )
  13. // This file contains code for detecting contractions and generating
  14. // the necessary tables.
  15. // Any Unicode Collation Algorithm (UCA) table entry that has more than
  16. // one rune one the left-hand side is called a contraction.
  17. // See https://www.unicode.org/reports/tr10/#Contractions for more details.
  18. //
  19. // We define the following terms:
  20. // initial: a rune that appears as the first rune in a contraction.
  21. // suffix: a sequence of runes succeeding the initial rune
  22. // in a given contraction.
  23. // non-initial: a rune that appears in a suffix.
  24. //
  25. // A rune may be both an initial and a non-initial and may be so in
  26. // many contractions. An initial may typically also appear by itself.
  27. // In case of ambiguities, the UCA requires we match the longest
  28. // contraction.
  29. //
  30. // Many contraction rules share the same set of possible suffixes.
  31. // We store sets of suffixes in a trie that associates an index with
  32. // each suffix in the set. This index can be used to look up a
  33. // collation element associated with the (starter rune, suffix) pair.
  34. //
  35. // The trie is defined on a UTF-8 byte sequence.
  36. // The overall trie is represented as an array of ctEntries. Each node of the trie
  37. // is represented as a subsequence of ctEntries, where each entry corresponds to
  38. // a possible match of a next character in the search string. An entry
  39. // also includes the length and offset to the next sequence of entries
  40. // to check in case of a match.
  41. const (
  42. final = 0
  43. noIndex = 0xFF
  44. )
  45. // ctEntry associates to a matching byte an offset and/or next sequence of
  46. // bytes to check. A ctEntry c is called final if a match means that the
  47. // longest suffix has been found. An entry c is final if c.N == 0.
  48. // A single final entry can match a range of characters to an offset.
  49. // A non-final entry always matches a single byte. Note that a non-final
  50. // entry might still resemble a completed suffix.
  51. // Examples:
  52. // The suffix strings "ab" and "ac" can be represented as:
  53. // []ctEntry{
  54. // {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF.
  55. // {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
  56. // }
  57. //
  58. // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
  59. // []ctEntry{
  60. // {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
  61. // {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
  62. // {'d', 'd', final, 3}, // "abd" -> 3
  63. // {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
  64. // {'d', 'd', final, 4}, // "abcd" -> 4
  65. // }
  66. // See genStateTests in contract_test.go for more examples.
  67. type ctEntry struct {
  68. L uint8 // non-final: byte value to match; final: lowest match in range.
  69. H uint8 // non-final: relative index to next block; final: highest match in range.
  70. N uint8 // non-final: length of next block; final: final
  71. I uint8 // result offset. Will be noIndex if more bytes are needed to complete.
  72. }
  73. // contractTrieSet holds a set of contraction tries. The tries are stored
  74. // consecutively in the entry field.
  75. type contractTrieSet []struct{ l, h, n, i uint8 }
  76. // ctHandle is used to identify a trie in the trie set, consisting in an offset
  77. // in the array and the size of the first node.
  78. type ctHandle struct {
  79. index, n int
  80. }
  81. // appendTrie adds a new trie for the given suffixes to the trie set and returns
  82. // a handle to it. The handle will be invalid on error.
  83. func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
  84. es := make([]stridx, len(suffixes))
  85. for i, s := range suffixes {
  86. es[i].str = s
  87. }
  88. sort.Sort(offsetSort(es))
  89. for i := range es {
  90. es[i].index = i + 1
  91. }
  92. sort.Sort(genidxSort(es))
  93. i := len(*ct)
  94. n, err := genStates(ct, es)
  95. if err != nil {
  96. *ct = (*ct)[:i]
  97. return ctHandle{}, err
  98. }
  99. return ctHandle{i, n}, nil
  100. }
  101. // genStates generates ctEntries for a given suffix set and returns
  102. // the number of entries for the first node.
  103. func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
  104. if len(sis) == 0 {
  105. return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
  106. }
  107. start := len(*ct)
  108. // create entries for differing first bytes.
  109. for _, si := range sis {
  110. s := si.str
  111. if len(s) == 0 {
  112. continue
  113. }
  114. added := false
  115. c := s[0]
  116. if len(s) > 1 {
  117. for j := len(*ct) - 1; j >= start; j-- {
  118. if (*ct)[j].L == c {
  119. added = true
  120. break
  121. }
  122. }
  123. if !added {
  124. *ct = append(*ct, ctEntry{L: c, I: noIndex})
  125. }
  126. } else {
  127. for j := len(*ct) - 1; j >= start; j-- {
  128. // Update the offset for longer suffixes with the same byte.
  129. if (*ct)[j].L == c {
  130. (*ct)[j].I = uint8(si.index)
  131. added = true
  132. }
  133. // Extend range of final ctEntry, if possible.
  134. if (*ct)[j].H+1 == c {
  135. (*ct)[j].H = c
  136. added = true
  137. }
  138. }
  139. if !added {
  140. *ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
  141. }
  142. }
  143. }
  144. n := len(*ct) - start
  145. // Append nodes for the remainder of the suffixes for each ctEntry.
  146. sp := 0
  147. for i, end := start, len(*ct); i < end; i++ {
  148. fe := (*ct)[i]
  149. if fe.H == 0 { // uninitialized non-final
  150. ln := len(*ct) - start - n
  151. if ln > 0xFF {
  152. return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
  153. }
  154. fe.H = uint8(ln)
  155. // Find first non-final strings with same byte as current entry.
  156. for ; sis[sp].str[0] != fe.L; sp++ {
  157. }
  158. se := sp + 1
  159. for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
  160. }
  161. sl := sis[sp:se]
  162. sp = se
  163. for i, si := range sl {
  164. sl[i].str = si.str[1:]
  165. }
  166. nn, err := genStates(ct, sl)
  167. if err != nil {
  168. return 0, err
  169. }
  170. fe.N = uint8(nn)
  171. (*ct)[i] = fe
  172. }
  173. }
  174. sort.Sort(entrySort((*ct)[start : start+n]))
  175. return n, nil
  176. }
  177. // There may be both a final and non-final entry for a byte if the byte
  178. // is implied in a range of matches in the final entry.
  179. // We need to ensure that the non-final entry comes first in that case.
  180. type entrySort colltab.ContractTrieSet
  181. func (fe entrySort) Len() int { return len(fe) }
  182. func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
  183. func (fe entrySort) Less(i, j int) bool {
  184. return fe[i].L > fe[j].L
  185. }
  186. // stridx is used for sorting suffixes and their associated offsets.
  187. type stridx struct {
  188. str string
  189. index int
  190. }
  191. // For computing the offsets, we first sort by size, and then by string.
  192. // This ensures that strings that only differ in the last byte by 1
  193. // are sorted consecutively in increasing order such that they can
  194. // be packed as a range in a final ctEntry.
  195. type offsetSort []stridx
  196. func (si offsetSort) Len() int { return len(si) }
  197. func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
  198. func (si offsetSort) Less(i, j int) bool {
  199. if len(si[i].str) != len(si[j].str) {
  200. return len(si[i].str) > len(si[j].str)
  201. }
  202. return si[i].str < si[j].str
  203. }
  204. // For indexing, we want to ensure that strings are sorted in string order, where
  205. // for strings with the same prefix, we put longer strings before shorter ones.
  206. type genidxSort []stridx
  207. func (si genidxSort) Len() int { return len(si) }
  208. func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
  209. func (si genidxSort) Less(i, j int) bool {
  210. if strings.HasPrefix(si[j].str, si[i].str) {
  211. return false
  212. }
  213. if strings.HasPrefix(si[i].str, si[j].str) {
  214. return true
  215. }
  216. return si[i].str < si[j].str
  217. }
  218. // lookup matches the longest suffix in str and returns the associated offset
  219. // and the number of bytes consumed.
  220. func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
  221. states := (*ct)[h.index:]
  222. p := 0
  223. n := h.n
  224. for i := 0; i < n && p < len(str); {
  225. e := states[i]
  226. c := str[p]
  227. if c >= e.L {
  228. if e.L == c {
  229. p++
  230. if e.I != noIndex {
  231. index, ns = int(e.I), p
  232. }
  233. if e.N != final {
  234. // set to new state
  235. i, states, n = 0, states[int(e.H)+n:], int(e.N)
  236. } else {
  237. return
  238. }
  239. continue
  240. } else if e.N == final && c <= e.H {
  241. p++
  242. return int(c-e.L) + int(e.I), p
  243. }
  244. }
  245. i++
  246. }
  247. return
  248. }
  249. // print writes the contractTrieSet t as compilable Go code to w. It returns
  250. // the total number of bytes written and the size of the resulting data structure in bytes.
  251. func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
  252. update3 := func(nn, sz int, e error) {
  253. n += nn
  254. if err == nil {
  255. err = e
  256. }
  257. size += sz
  258. }
  259. update2 := func(nn int, e error) { update3(nn, 0, e) }
  260. update3(printArray(*t, w, name))
  261. update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
  262. update3(printStruct(*t, w, name))
  263. update2(fmt.Fprintln(w))
  264. return
  265. }
  266. func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
  267. p := func(f string, a ...interface{}) {
  268. nn, e := fmt.Fprintf(w, f, a...)
  269. n += nn
  270. if err == nil {
  271. err = e
  272. }
  273. }
  274. size = len(ct) * 4
  275. p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
  276. p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
  277. for _, fe := range ct {
  278. p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I)
  279. }
  280. p("}\n")
  281. return
  282. }
  283. func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
  284. n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
  285. size = int(reflect.TypeOf(ct).Size())
  286. return
  287. }