forminfo.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package norm
  5. import "encoding/binary"
  6. // This file contains Form-specific logic and wrappers for data in tables.go.
  7. // Rune info is stored in a separate trie per composing form. A composing form
  8. // and its corresponding decomposing form share the same trie. Each trie maps
  9. // a rune to a uint16. The values take two forms. For v >= 0x8000:
  10. // bits
  11. // 15: 1 (inverse of NFD_QC bit of qcInfo)
// 13..8: qcInfo (see below). isYesD is always true (no decomposition).
// 7..0: ccc (compressed CCC value).
  14. // For v < 0x8000, the respective rune has a decomposition and v is an index
  15. // into a byte array of UTF-8 decomposition sequences and additional info and
  16. // has the form:
  17. // <header> <decomp_byte>* [<tccc> [<lccc>]]
  18. // The header contains the number of bytes in the decomposition (excluding this
  19. // length byte). The two most significant bits of this length byte correspond
  20. // to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
  21. // The byte sequence is followed by a trailing and leading CCC if the values
  22. // for these are not zero. The value of v determines which ccc are appended
  23. // to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
  25. // there is an additional leading ccc. The value of tccc itself is the
  26. // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
  27. // are the number of trailing non-starters.
// Bit masks used when decoding trie values and decomposition header bytes.
const (
	// qcInfoMask keeps the six low bits (5..0) of a qcInfo and clears the rest.
	qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
	// headerLenMask keeps the six low bits of a decomposition header byte,
	// which hold the length of the decomposition in bytes.
	headerLenMask = 0x3F // extract the length value from the header byte
	// headerFlagsMask keeps the two high bits of a decomposition header byte;
	// compInfo shifts them right by 2 so they land on bits 5 and 4 of a qcInfo.
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
// Properties provides access to normalization properties of a rune.
//
// Values are produced by compInfo from a trie value and the size of the
// rune's UTF-8 encoding.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition); used as an index into the ccc table
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition); used as an index into the ccc table
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // offset of the decomposition header in decomps; 0 if there is no decomposition
}
// functions dispatchable per form

// lookupFunc is the signature shared by the per-form lookup functions
// lookupInfoNFC and lookupInfoNFKC: given an input b and an index i into it,
// it returns the Properties found there.
type lookupFunc func(b input, i int) Properties
// formInfo holds Form-specific functions and tables.
type formInfo struct {
	form                     Form       // the normalization form this entry describes
	composing, compatibility bool       // form type
	info                     lookupFunc // looks up Properties for this form
	nextMain                 iterFunc   // iteration function for this form (nextComposed or nextDecomposed in formTable)
}
// formTable lists one formInfo per normalization form, in the order
// NFC, NFD, NFKC, NFKD. A composing form and its decomposing counterpart use
// the same lookup function; they differ in the composing flag and in the
// iteration function.
// NOTE(review): presumably indexed by the Form value — confirm against callers.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
  77. // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
  78. // unexpected behavior for the user. For example, in NFD, there is a boundary
  79. // after 'a'. However, 'a' might combine with modifiers, so from the application's
  80. // perspective it is not a good boundary. We will therefore always use the
  81. // boundaries for the combining variants.
  82. // BoundaryBefore returns true if this rune starts a new segment and
  83. // cannot combine with any rune on the left.
  84. func (p Properties) BoundaryBefore() bool {
  85. if p.ccc == 0 && !p.combinesBackward() {
  86. return true
  87. }
  88. // We assume that the CCC of the first character in a decomposition
  89. // is always non-zero if different from info.ccc and that we can return
  90. // false at this point. This is verified by maketables.
  91. return false
  92. }
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation is conservative: it only reports a boundary
// when the rune is inert (see isInert).
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
// We pack quick check data in 6 bits:
// 5: Combines forward (0 == false, 1 == true)
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..0: Number of trailing non-starters.
//
// When all 6 bits are zero (and the rune's ccc is zero as well, see isInert),
// the character is inert, meaning it is never influenced by normalization.
type qcInfo uint8
// isYesC reports whether the NFC quick check value is Yes: bit 4 of the flags
// is clear (both No and Maybe set it).
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
// isYesD reports whether the NFD quick check value is Yes: bit 2 of the flags
// is clear.
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
// combinesForward reports whether the "combines forward" bit (bit 5) of the
// flags is set.
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
// combinesBackward reports whether bit 3 of the flags is set. Of the NFC
// quick check encodings only Maybe (11) sets that bit.
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
// hasDecomposition reports whether the NFD quick check bit (bit 2) of the
// flags is set; it is the negation of isYesD.
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
  113. func (p Properties) isInert() bool {
  114. return p.flags&qcInfoMask == 0 && p.ccc == 0
  115. }
  116. func (p Properties) multiSegment() bool {
  117. return p.index >= firstMulti && p.index < endMulti
  118. }
// nLeadingNonStarters returns the number of leading non-starters recorded
// in p.nLead.
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}
// nTrailingNonStarters returns the number of trailing non-starters, which is
// stored in the two least-significant bits of the quick check flags.
func (p Properties) nTrailingNonStarters() uint8 {
	return uint8(p.flags & 0x03)
}
  125. // Decomposition returns the decomposition for the underlying rune
  126. // or nil if there is none.
  127. func (p Properties) Decomposition() []byte {
  128. // TODO: create the decomposition for Hangul?
  129. if p.index == 0 {
  130. return nil
  131. }
  132. i := p.index
  133. n := decomps[i] & headerLenMask
  134. i++
  135. return decomps[i : i+uint16(n)]
  136. }
// Size returns the length of UTF-8 encoding of the rune, as stored in p.size
// and widened to int.
func (p Properties) Size() int {
	return int(p.size)
}
  141. // CCC returns the canonical combining class of the underlying rune.
  142. func (p Properties) CCC() uint8 {
  143. if p.index >= firstCCCZeroExcept {
  144. return 0
  145. }
  146. return ccc[p.ccc]
  147. }
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
//
// The value is read from the ccc table using the compressed value in p.ccc.
func (p Properties) LeadCCC() uint8 {
	return ccc[p.ccc]
}
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
//
// The value is read from the ccc table using the compressed value in p.tccc.
func (p Properties) TrailCCC() uint8 {
	return ccc[p.tccc]
}
  158. func buildRecompMap() {
  159. recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
  160. var buf [8]byte
  161. for i := 0; i < len(recompMapPacked); i += 8 {
  162. copy(buf[:], recompMapPacked[i:i+8])
  163. key := binary.BigEndian.Uint32(buf[:4])
  164. val := binary.BigEndian.Uint32(buf[4:])
  165. recompMap[key] = rune(val)
  166. }
  167. }
  168. // Recomposition
  169. // We use 32-bit keys instead of 64-bit for the two codepoint keys.
  170. // This clips off the bits of three entries, but we know this will not
  171. // result in a collision. In the unlikely event that changes to
  172. // UnicodeData.txt introduce collisions, the compiler will catch it.
  173. // Note that the recomposition map for NFC and NFKC are identical.
  174. // combine returns the combined rune or 0 if it doesn't exist.
  175. //
  176. // The caller is responsible for calling
  177. // recompMapOnce.Do(buildRecompMap) sometime before this is called.
  178. func combine(a, b rune) rune {
  179. key := uint32(uint16(a))<<16 + uint32(uint16(b))
  180. if recompMap == nil {
  181. panic("caller error") // see func comment
  182. }
  183. return recompMap[key]
  184. }
// lookupInfoNFC returns the Properties for index i of b, decoding the value
// and size reported by b.charinfoNFC with compInfo. formTable uses it for
// both NFC and NFD.
func lookupInfoNFC(b input, i int) Properties {
	v, sz := b.charinfoNFC(i)
	return compInfo(v, sz)
}
// lookupInfoNFKC returns the Properties for index i of b, decoding the value
// and size reported by b.charinfoNFKC with compInfo. formTable uses it for
// both NFKC and NFKD.
func lookupInfoNFKC(b input, i int) Properties {
	v, sz := b.charinfoNFKC(i)
	return compInfo(v, sz)
}
  193. // Properties returns properties for the first rune in s.
  194. func (f Form) Properties(s []byte) Properties {
  195. if f == NFC || f == NFD {
  196. return compInfo(nfcData.lookup(s))
  197. }
  198. return compInfo(nfkcData.lookup(s))
  199. }
  200. // PropertiesString returns properties for the first rune in s.
  201. func (f Form) PropertiesString(s string) Properties {
  202. if f == NFC || f == NFD {
  203. return compInfo(nfcData.lookupString(s))
  204. }
  205. return compInfo(nfkcData.lookupString(s))
  206. }
  207. // compInfo converts the information contained in v and sz
  208. // to a Properties. See the comment at the top of the file
  209. // for more information on the format.
  210. func compInfo(v uint16, sz int) Properties {
  211. if v == 0 {
  212. return Properties{size: uint8(sz)}
  213. } else if v >= 0x8000 {
  214. p := Properties{
  215. size: uint8(sz),
  216. ccc: uint8(v),
  217. tccc: uint8(v),
  218. flags: qcInfo(v >> 8),
  219. }
  220. if p.ccc > 0 || p.combinesBackward() {
  221. p.nLead = uint8(p.flags & 0x3)
  222. }
  223. return p
  224. }
  225. // has decomposition
  226. h := decomps[v]
  227. f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
  228. p := Properties{size: uint8(sz), flags: f, index: v}
  229. if v >= firstCCC {
  230. v += uint16(h&headerLenMask) + 1
  231. c := decomps[v]
  232. p.tccc = c >> 2
  233. p.flags |= qcInfo(c & 0x3)
  234. if v >= firstLeadingCCC {
  235. p.nLead = c & 0x3
  236. if v >= firstStarterWithNLead {
  237. // We were tricked. Remove the decomposition.
  238. p.flags &= 0x03
  239. p.index = 0
  240. return p
  241. }
  242. p.ccc = decomps[v+1]
  243. }
  244. }
  245. return p
  246. }