icu.go 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build icu
  5. // +build icu
  6. package main
  7. /*
  8. #cgo LDFLAGS: -licui18n -licuuc
  9. #include <stdlib.h>
  10. #include <unicode/ucol.h>
  11. #include <unicode/uiter.h>
  12. #include <unicode/utypes.h>
  13. */
  14. import "C"
  15. import (
  16. "fmt"
  17. "log"
  18. "unicode/utf16"
  19. "unicode/utf8"
  20. "unsafe"
  21. )
  22. func init() {
  23. AddFactory(CollatorFactory{"icu", newUTF16,
  24. "Main ICU collator, using native strings."})
  25. AddFactory(CollatorFactory{"icu8", newUTF8iter,
  26. "ICU collator using ICU iterators to process UTF8."})
  27. AddFactory(CollatorFactory{"icu16", newUTF8conv,
  28. "ICU collation by first converting UTF8 to UTF16."})
  29. }
  30. func icuCharP(s []byte) *C.char {
  31. return (*C.char)(unsafe.Pointer(&s[0]))
  32. }
  33. func icuUInt8P(s []byte) *C.uint8_t {
  34. return (*C.uint8_t)(unsafe.Pointer(&s[0]))
  35. }
  36. func icuUCharP(s []uint16) *C.UChar {
  37. return (*C.UChar)(unsafe.Pointer(&s[0]))
  38. }
  39. func icuULen(s []uint16) C.int32_t {
  40. return C.int32_t(len(s))
  41. }
  42. func icuSLen(s []byte) C.int32_t {
  43. return C.int32_t(len(s))
  44. }
// icuCollator implements a Collator based on ICU.
// It carries the state shared by the collator variants in this file, each of
// which embeds it.
type icuCollator struct {
	// loc is the locale name as a C string; allocated by init, freed by Close.
	loc *C.char
	// col is the ICU collator handle; opened by init, closed by Close.
	col *C.UCollator
	// keyBuf is the buffer sort keys are carved out of: its length marks the
	// part already handed out, the remaining capacity is free space.
	keyBuf []byte
}
// growBufSize is the capacity in bytes (10 MiB) of every key buffer allocated
// for icuCollator.keyBuf. extendBuf aborts the program when a single key does
// not fit into an empty buffer of this size.
const growBufSize = 10 * 1024 * 1024
  52. func (c *icuCollator) init(locale string) error {
  53. err := C.UErrorCode(0)
  54. c.loc = C.CString(locale)
  55. c.col = C.ucol_open(c.loc, &err)
  56. if err > 0 {
  57. return fmt.Errorf("failed opening collator for %q", locale)
  58. } else if err < 0 {
  59. loc := C.ucol_getLocaleByType(c.col, 0, &err)
  60. fmt, ok := map[int]string{
  61. -127: "warning: using default collator: %s",
  62. -128: "warning: using fallback collator: %s",
  63. }[int(err)]
  64. if ok {
  65. log.Printf(fmt, C.GoString(loc))
  66. }
  67. }
  68. c.keyBuf = make([]byte, 0, growBufSize)
  69. return nil
  70. }
  71. func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
  72. if len(c.keyBuf) == cap(c.keyBuf) {
  73. c.keyBuf = make([]byte, 0, growBufSize)
  74. }
  75. b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)]
  76. return icuUInt8P(b), icuSLen(b)
  77. }
  78. func (c *icuCollator) extendBuf(n C.int32_t) []byte {
  79. end := len(c.keyBuf) + int(n)
  80. if end > cap(c.keyBuf) {
  81. if len(c.keyBuf) == 0 {
  82. log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
  83. }
  84. c.keyBuf = make([]byte, 0, growBufSize)
  85. return nil
  86. }
  87. b := c.keyBuf[len(c.keyBuf):end]
  88. c.keyBuf = c.keyBuf[:end]
  89. return b
  90. }
  91. func (c *icuCollator) Close() error {
  92. C.ucol_close(c.col)
  93. C.free(unsafe.Pointer(c.loc))
  94. return nil
  95. }
// icuUTF16 implements the Collator interface.
// Its methods pass the UTF16 field of each Input straight to ICU's
// ucol_strcoll and ucol_getSortKey.
type icuUTF16 struct {
	icuCollator
}
  100. func newUTF16(locale string) (Collator, error) {
  101. c := &icuUTF16{}
  102. return c, c.init(locale)
  103. }
  104. func (c *icuUTF16) Compare(a, b Input) int {
  105. return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16)))
  106. }
  107. func (c *icuUTF16) Key(s Input) []byte {
  108. bp, bn := c.buf()
  109. n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
  110. if b := c.extendBuf(n); b != nil {
  111. return b
  112. }
  113. return c.Key(s)
  114. }
// icuUTF8iter implements the Collator interface.
// This implementation wraps the UTF8 string in an ICU character iterator,
// which is then passed to the collator.
type icuUTF8iter struct {
	icuCollator
	// a and b are iterators re-pointed at the inputs on every call:
	// Compare uses both, Key uses only a.
	a, b C.UCharIterator
}
  122. func newUTF8iter(locale string) (Collator, error) {
  123. c := &icuUTF8iter{}
  124. return c, c.init(locale)
  125. }
  126. func (c *icuUTF8iter) Compare(a, b Input) int {
  127. err := C.UErrorCode(0)
  128. C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
  129. C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
  130. return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err))
  131. }
  132. func (c *icuUTF8iter) Key(s Input) []byte {
  133. err := C.UErrorCode(0)
  134. state := [2]C.uint32_t{}
  135. C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
  136. bp, bn := c.buf()
  137. n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err)
  138. if n >= bn {
  139. // Force failure.
  140. if c.extendBuf(n+1) != nil {
  141. log.Fatal("expected extension to fail")
  142. }
  143. return c.Key(s)
  144. }
  145. return c.extendBuf(n)
  146. }
// icuUTF8conv implements the Collator interface.
// This implementation first converts the given UTF8 string
// to UTF16 and then calls the main ICU collation function.
type icuUTF8conv struct {
	icuCollator
}
  153. func newUTF8conv(locale string) (Collator, error) {
  154. c := &icuUTF8conv{}
  155. return c, c.init(locale)
  156. }
  157. func (c *icuUTF8conv) Compare(sa, sb Input) int {
  158. a := encodeUTF16(sa.UTF8)
  159. b := encodeUTF16(sb.UTF8)
  160. return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b)))
  161. }
  162. func (c *icuUTF8conv) Key(s Input) []byte {
  163. a := encodeUTF16(s.UTF8)
  164. bp, bn := c.buf()
  165. n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
  166. if b := c.extendBuf(n); b != nil {
  167. return b
  168. }
  169. return c.Key(s)
  170. }
  171. func encodeUTF16(b []byte) []uint16 {
  172. a := []uint16{}
  173. for len(b) > 0 {
  174. r, sz := utf8.DecodeRune(b)
  175. b = b[sz:]
  176. r1, r2 := utf16.EncodeRune(r)
  177. if r1 != 0xFFFD {
  178. a = append(a, uint16(r1), uint16(r2))
  179. } else {
  180. a = append(a, uint16(r))
  181. }
  182. }
  183. return a
  184. }