iter.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package norm
  5. import (
  6. "fmt"
  7. "unicode/utf8"
  8. )
  9. // MaxSegmentSize is the maximum size of a byte buffer needed to consider any
  10. // sequence of starter and non-starter runes for the purpose of normalization.
  11. const MaxSegmentSize = maxByteBufferSize
  12. // An Iter iterates over a string or byte slice, while normalizing it
  13. // to a given Form.
  14. type Iter struct {
  15. rb reorderBuffer
  16. buf [maxByteBufferSize]byte
  17. info Properties // first character saved from previous iteration
  18. next iterFunc // implementation of next depends on form
  19. asciiF iterFunc
  20. p int // current position in input source
  21. multiSeg []byte // remainder of multi-segment decomposition
  22. }
  23. type iterFunc func(*Iter) []byte
  24. // Init initializes i to iterate over src after normalizing it to Form f.
  25. func (i *Iter) Init(f Form, src []byte) {
  26. i.p = 0
  27. if len(src) == 0 {
  28. i.setDone()
  29. i.rb.nsrc = 0
  30. return
  31. }
  32. i.multiSeg = nil
  33. i.rb.init(f, src)
  34. i.next = i.rb.f.nextMain
  35. i.asciiF = nextASCIIBytes
  36. i.info = i.rb.f.info(i.rb.src, i.p)
  37. i.rb.ss.first(i.info)
  38. }
  39. // InitString initializes i to iterate over src after normalizing it to Form f.
  40. func (i *Iter) InitString(f Form, src string) {
  41. i.p = 0
  42. if len(src) == 0 {
  43. i.setDone()
  44. i.rb.nsrc = 0
  45. return
  46. }
  47. i.multiSeg = nil
  48. i.rb.initString(f, src)
  49. i.next = i.rb.f.nextMain
  50. i.asciiF = nextASCIIString
  51. i.info = i.rb.f.info(i.rb.src, i.p)
  52. i.rb.ss.first(i.info)
  53. }
  54. // Seek sets the segment to be returned by the next call to Next to start
  55. // at position p. It is the responsibility of the caller to set p to the
  56. // start of a segment.
  57. func (i *Iter) Seek(offset int64, whence int) (int64, error) {
  58. var abs int64
  59. switch whence {
  60. case 0:
  61. abs = offset
  62. case 1:
  63. abs = int64(i.p) + offset
  64. case 2:
  65. abs = int64(i.rb.nsrc) + offset
  66. default:
  67. return 0, fmt.Errorf("norm: invalid whence")
  68. }
  69. if abs < 0 {
  70. return 0, fmt.Errorf("norm: negative position")
  71. }
  72. if int(abs) >= i.rb.nsrc {
  73. i.setDone()
  74. return int64(i.p), nil
  75. }
  76. i.p = int(abs)
  77. i.multiSeg = nil
  78. i.next = i.rb.f.nextMain
  79. i.info = i.rb.f.info(i.rb.src, i.p)
  80. i.rb.ss.first(i.info)
  81. return abs, nil
  82. }
  83. // returnSlice returns a slice of the underlying input type as a byte slice.
  84. // If the underlying is of type []byte, it will simply return a slice.
  85. // If the underlying is of type string, it will copy the slice to the buffer
  86. // and return that.
  87. func (i *Iter) returnSlice(a, b int) []byte {
  88. if i.rb.src.bytes == nil {
  89. return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
  90. }
  91. return i.rb.src.bytes[a:b]
  92. }
  93. // Pos returns the byte position at which the next call to Next will commence processing.
  94. func (i *Iter) Pos() int {
  95. return i.p
  96. }
  97. func (i *Iter) setDone() {
  98. i.next = nextDone
  99. i.p = i.rb.nsrc
  100. }
  101. // Done returns true if there is no more input to process.
  102. func (i *Iter) Done() bool {
  103. return i.p >= i.rb.nsrc
  104. }
  105. // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
  106. // For any input a and b for which f(a) == f(b), subsequent calls
  107. // to Next will return the same segments.
  108. // Modifying runes are grouped together with the preceding starter, if such a starter exists.
  109. // Although not guaranteed, n will typically be the smallest possible n.
  110. func (i *Iter) Next() []byte {
  111. return i.next(i)
  112. }
  113. func nextASCIIBytes(i *Iter) []byte {
  114. p := i.p + 1
  115. if p >= i.rb.nsrc {
  116. p0 := i.p
  117. i.setDone()
  118. return i.rb.src.bytes[p0:p]
  119. }
  120. if i.rb.src.bytes[p] < utf8.RuneSelf {
  121. p0 := i.p
  122. i.p = p
  123. return i.rb.src.bytes[p0:p]
  124. }
  125. i.info = i.rb.f.info(i.rb.src, i.p)
  126. i.next = i.rb.f.nextMain
  127. return i.next(i)
  128. }
  129. func nextASCIIString(i *Iter) []byte {
  130. p := i.p + 1
  131. if p >= i.rb.nsrc {
  132. i.buf[0] = i.rb.src.str[i.p]
  133. i.setDone()
  134. return i.buf[:1]
  135. }
  136. if i.rb.src.str[p] < utf8.RuneSelf {
  137. i.buf[0] = i.rb.src.str[i.p]
  138. i.p = p
  139. return i.buf[:1]
  140. }
  141. i.info = i.rb.f.info(i.rb.src, i.p)
  142. i.next = i.rb.f.nextMain
  143. return i.next(i)
  144. }
  145. func nextHangul(i *Iter) []byte {
  146. p := i.p
  147. next := p + hangulUTF8Size
  148. if next >= i.rb.nsrc {
  149. i.setDone()
  150. } else if i.rb.src.hangul(next) == 0 {
  151. i.rb.ss.next(i.info)
  152. i.info = i.rb.f.info(i.rb.src, i.p)
  153. i.next = i.rb.f.nextMain
  154. return i.next(i)
  155. }
  156. i.p = next
  157. return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
  158. }
  159. func nextDone(i *Iter) []byte {
  160. return nil
  161. }
  162. // nextMulti is used for iterating over multi-segment decompositions
  163. // for decomposing normal forms.
  164. func nextMulti(i *Iter) []byte {
  165. j := 0
  166. d := i.multiSeg
  167. // skip first rune
  168. for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
  169. }
  170. for j < len(d) {
  171. info := i.rb.f.info(input{bytes: d}, j)
  172. if info.BoundaryBefore() {
  173. i.multiSeg = d[j:]
  174. return d[:j]
  175. }
  176. j += int(info.size)
  177. }
  178. // treat last segment as normal decomposition
  179. i.next = i.rb.f.nextMain
  180. return i.next(i)
  181. }
  182. // nextMultiNorm is used for iterating over multi-segment decompositions
  183. // for composing normal forms.
  184. func nextMultiNorm(i *Iter) []byte {
  185. j := 0
  186. d := i.multiSeg
  187. for j < len(d) {
  188. info := i.rb.f.info(input{bytes: d}, j)
  189. if info.BoundaryBefore() {
  190. i.rb.compose()
  191. seg := i.buf[:i.rb.flushCopy(i.buf[:])]
  192. i.rb.insertUnsafe(input{bytes: d}, j, info)
  193. i.multiSeg = d[j+int(info.size):]
  194. return seg
  195. }
  196. i.rb.insertUnsafe(input{bytes: d}, j, info)
  197. j += int(info.size)
  198. }
  199. i.multiSeg = nil
  200. i.next = nextComposed
  201. return doNormComposed(i)
  202. }
  203. // nextDecomposed is the implementation of Next for forms NFD and NFKD.
  204. func nextDecomposed(i *Iter) (next []byte) {
  205. outp := 0
  206. inCopyStart, outCopyStart := i.p, 0
  207. for {
  208. if sz := int(i.info.size); sz <= 1 {
  209. i.rb.ss = 0
  210. p := i.p
  211. i.p++ // ASCII or illegal byte. Either way, advance by 1.
  212. if i.p >= i.rb.nsrc {
  213. i.setDone()
  214. return i.returnSlice(p, i.p)
  215. } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
  216. i.next = i.asciiF
  217. return i.returnSlice(p, i.p)
  218. }
  219. outp++
  220. } else if d := i.info.Decomposition(); d != nil {
  221. // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
  222. // Case 1: there is a leftover to copy. In this case the decomposition
  223. // must begin with a modifier and should always be appended.
  224. // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
  225. p := outp + len(d)
  226. if outp > 0 {
  227. i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
  228. // TODO: this condition should not be possible, but we leave it
  229. // in for defensive purposes.
  230. if p > len(i.buf) {
  231. return i.buf[:outp]
  232. }
  233. } else if i.info.multiSegment() {
  234. // outp must be 0 as multi-segment decompositions always
  235. // start a new segment.
  236. if i.multiSeg == nil {
  237. i.multiSeg = d
  238. i.next = nextMulti
  239. return nextMulti(i)
  240. }
  241. // We are in the last segment. Treat as normal decomposition.
  242. d = i.multiSeg
  243. i.multiSeg = nil
  244. p = len(d)
  245. }
  246. prevCC := i.info.tccc
  247. if i.p += sz; i.p >= i.rb.nsrc {
  248. i.setDone()
  249. i.info = Properties{} // Force BoundaryBefore to succeed.
  250. } else {
  251. i.info = i.rb.f.info(i.rb.src, i.p)
  252. }
  253. switch i.rb.ss.next(i.info) {
  254. case ssOverflow:
  255. i.next = nextCGJDecompose
  256. fallthrough
  257. case ssStarter:
  258. if outp > 0 {
  259. copy(i.buf[outp:], d)
  260. return i.buf[:p]
  261. }
  262. return d
  263. }
  264. copy(i.buf[outp:], d)
  265. outp = p
  266. inCopyStart, outCopyStart = i.p, outp
  267. if i.info.ccc < prevCC {
  268. goto doNorm
  269. }
  270. continue
  271. } else if r := i.rb.src.hangul(i.p); r != 0 {
  272. outp = decomposeHangul(i.buf[:], r)
  273. i.p += hangulUTF8Size
  274. inCopyStart, outCopyStart = i.p, outp
  275. if i.p >= i.rb.nsrc {
  276. i.setDone()
  277. break
  278. } else if i.rb.src.hangul(i.p) != 0 {
  279. i.next = nextHangul
  280. return i.buf[:outp]
  281. }
  282. } else {
  283. p := outp + sz
  284. if p > len(i.buf) {
  285. break
  286. }
  287. outp = p
  288. i.p += sz
  289. }
  290. if i.p >= i.rb.nsrc {
  291. i.setDone()
  292. break
  293. }
  294. prevCC := i.info.tccc
  295. i.info = i.rb.f.info(i.rb.src, i.p)
  296. if v := i.rb.ss.next(i.info); v == ssStarter {
  297. break
  298. } else if v == ssOverflow {
  299. i.next = nextCGJDecompose
  300. break
  301. }
  302. if i.info.ccc < prevCC {
  303. goto doNorm
  304. }
  305. }
  306. if outCopyStart == 0 {
  307. return i.returnSlice(inCopyStart, i.p)
  308. } else if inCopyStart < i.p {
  309. i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
  310. }
  311. return i.buf[:outp]
  312. doNorm:
  313. // Insert what we have decomposed so far in the reorderBuffer.
  314. // As we will only reorder, there will always be enough room.
  315. i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
  316. i.rb.insertDecomposed(i.buf[0:outp])
  317. return doNormDecomposed(i)
  318. }
  319. func doNormDecomposed(i *Iter) []byte {
  320. for {
  321. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  322. if i.p += int(i.info.size); i.p >= i.rb.nsrc {
  323. i.setDone()
  324. break
  325. }
  326. i.info = i.rb.f.info(i.rb.src, i.p)
  327. if i.info.ccc == 0 {
  328. break
  329. }
  330. if s := i.rb.ss.next(i.info); s == ssOverflow {
  331. i.next = nextCGJDecompose
  332. break
  333. }
  334. }
  335. // new segment or too many combining characters: exit normalization
  336. return i.buf[:i.rb.flushCopy(i.buf[:])]
  337. }
  338. func nextCGJDecompose(i *Iter) []byte {
  339. i.rb.ss = 0
  340. i.rb.insertCGJ()
  341. i.next = nextDecomposed
  342. i.rb.ss.first(i.info)
  343. buf := doNormDecomposed(i)
  344. return buf
  345. }
  346. // nextComposed is the implementation of Next for forms NFC and NFKC.
  347. func nextComposed(i *Iter) []byte {
  348. outp, startp := 0, i.p
  349. var prevCC uint8
  350. for {
  351. if !i.info.isYesC() {
  352. goto doNorm
  353. }
  354. prevCC = i.info.tccc
  355. sz := int(i.info.size)
  356. if sz == 0 {
  357. sz = 1 // illegal rune: copy byte-by-byte
  358. }
  359. p := outp + sz
  360. if p > len(i.buf) {
  361. break
  362. }
  363. outp = p
  364. i.p += sz
  365. if i.p >= i.rb.nsrc {
  366. i.setDone()
  367. break
  368. } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
  369. i.rb.ss = 0
  370. i.next = i.asciiF
  371. break
  372. }
  373. i.info = i.rb.f.info(i.rb.src, i.p)
  374. if v := i.rb.ss.next(i.info); v == ssStarter {
  375. break
  376. } else if v == ssOverflow {
  377. i.next = nextCGJCompose
  378. break
  379. }
  380. if i.info.ccc < prevCC {
  381. goto doNorm
  382. }
  383. }
  384. return i.returnSlice(startp, i.p)
  385. doNorm:
  386. // reset to start position
  387. i.p = startp
  388. i.info = i.rb.f.info(i.rb.src, i.p)
  389. i.rb.ss.first(i.info)
  390. if i.info.multiSegment() {
  391. d := i.info.Decomposition()
  392. info := i.rb.f.info(input{bytes: d}, 0)
  393. i.rb.insertUnsafe(input{bytes: d}, 0, info)
  394. i.multiSeg = d[int(info.size):]
  395. i.next = nextMultiNorm
  396. return nextMultiNorm(i)
  397. }
  398. i.rb.ss.first(i.info)
  399. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  400. return doNormComposed(i)
  401. }
  402. func doNormComposed(i *Iter) []byte {
  403. // First rune should already be inserted.
  404. for {
  405. if i.p += int(i.info.size); i.p >= i.rb.nsrc {
  406. i.setDone()
  407. break
  408. }
  409. i.info = i.rb.f.info(i.rb.src, i.p)
  410. if s := i.rb.ss.next(i.info); s == ssStarter {
  411. break
  412. } else if s == ssOverflow {
  413. i.next = nextCGJCompose
  414. break
  415. }
  416. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  417. }
  418. i.rb.compose()
  419. seg := i.buf[:i.rb.flushCopy(i.buf[:])]
  420. return seg
  421. }
  422. func nextCGJCompose(i *Iter) []byte {
  423. i.rb.ss = 0 // instead of first
  424. i.rb.insertCGJ()
  425. i.next = nextComposed
  426. // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
  427. // even if they are not. This is particularly dubious for U+FF9E and UFF9A.
  428. // If we ever change that, insert a check here.
  429. i.rb.ss.first(i.info)
  430. i.rb.insertUnsafe(i.rb.src, i.p, i.info)
  431. return doNormComposed(i)
  432. }