width.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate stringer -type=Kind
  5. //go:generate go run gen.go gen_common.go gen_trieval.go
  6. // Package width provides functionality for handling different widths in text.
  7. //
  8. // Wide characters behave like ideographs; they tend to allow line breaks after
  9. // each character and remain upright in vertical text layout. Narrow characters
  10. // are kept together in words or runs that are rotated sideways in vertical text
  11. // layout.
  12. //
  13. // For more information, see https://unicode.org/reports/tr11/.
  14. package width // import "golang.org/x/text/width"
  15. import (
  16. "unicode/utf8"
  17. "golang.org/x/text/transform"
  18. )
  19. // TODO
  20. // 1) Reduce table size by compressing blocks.
  21. // 2) API proposition for computing display length
  22. // (approximation, fixed pitch only).
  23. // 3) Implement display length.
  24. // Kind indicates the type of width property as defined in https://unicode.org/reports/tr11/.
  25. type Kind int
  26. const (
  27. // Neutral characters do not occur in legacy East Asian character sets.
  28. Neutral Kind = iota
  29. // EastAsianAmbiguous characters that can be sometimes wide and sometimes
  30. // narrow and require additional information not contained in the character
  31. // code to further resolve their width.
  32. EastAsianAmbiguous
  33. // EastAsianWide characters are wide in its usual form. They occur only in
  34. // the context of East Asian typography. These runes may have explicit
  35. // halfwidth counterparts.
  36. EastAsianWide
  37. // EastAsianNarrow characters are narrow in its usual form. They often have
  38. // fullwidth counterparts.
  39. EastAsianNarrow
  40. // Note: there exist Narrow runes that do not have fullwidth or wide
  41. // counterparts, despite what the definition says (e.g. U+27E6).
  42. // EastAsianFullwidth characters have a compatibility decompositions of type
  43. // wide that map to a narrow counterpart.
  44. EastAsianFullwidth
  45. // EastAsianHalfwidth characters have a compatibility decomposition of type
  46. // narrow that map to a wide or ambiguous counterpart, plus U+20A9 ₩ WON
  47. // SIGN.
  48. EastAsianHalfwidth
  49. // Note: there exist runes that have a halfwidth counterparts but that are
  50. // classified as Ambiguous, rather than wide (e.g. U+2190).
  51. )
// trie is the package-level lookup table consulted by Lookup, LookupString
// and LookupRune to obtain the width value of a rune.
//
// TODO: the generated tries need to return size 1 for invalid runes for the
// width to be computed correctly (each byte should render width 1)
var trie = newWidthTrie(0)
  55. // Lookup reports the Properties of the first rune in b and the number of bytes
  56. // of its UTF-8 encoding.
  57. func Lookup(b []byte) (p Properties, size int) {
  58. v, sz := trie.lookup(b)
  59. return Properties{elem(v), b[sz-1]}, sz
  60. }
  61. // LookupString reports the Properties of the first rune in s and the number of
  62. // bytes of its UTF-8 encoding.
  63. func LookupString(s string) (p Properties, size int) {
  64. v, sz := trie.lookupString(s)
  65. return Properties{elem(v), s[sz-1]}, sz
  66. }
  67. // LookupRune reports the Properties of rune r.
  68. func LookupRune(r rune) Properties {
  69. var buf [4]byte
  70. n := utf8.EncodeRune(buf[:], r)
  71. v, _ := trie.lookup(buf[:n])
  72. last := byte(r)
  73. if r >= utf8.RuneSelf {
  74. last = 0x80 + byte(r&0x3f)
  75. }
  76. return Properties{elem(v), last}
  77. }
// Properties provides access to width properties of a rune.
type Properties struct {
	// elem is the value the trie returned for the rune. Its bits above
	// typeShift hold the Kind, its low byte indexes inverseData, and the
	// tagNeedsFold bit marks runes that have a folded variant.
	elem elem
	// last is the final byte of the rune's UTF-8 encoding. The mapping
	// methods XOR it into an inverseData entry to rebuild the variant rune.
	last byte
}
  83. func (e elem) kind() Kind {
  84. return Kind(e >> typeShift)
  85. }
  86. // Kind returns the Kind of a rune as defined in Unicode TR #11.
  87. // See https://unicode.org/reports/tr11/ for more details.
  88. func (p Properties) Kind() Kind {
  89. return p.elem.kind()
  90. }
  91. // Folded returns the folded variant of a rune or 0 if the rune is canonical.
  92. func (p Properties) Folded() rune {
  93. if p.elem&tagNeedsFold != 0 {
  94. buf := inverseData[byte(p.elem)]
  95. buf[buf[0]] ^= p.last
  96. r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
  97. return r
  98. }
  99. return 0
  100. }
  101. // Narrow returns the narrow variant of a rune or 0 if the rune is already
  102. // narrow or doesn't have a narrow variant.
  103. func (p Properties) Narrow() rune {
  104. if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) {
  105. buf := inverseData[byte(p.elem)]
  106. buf[buf[0]] ^= p.last
  107. r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
  108. return r
  109. }
  110. return 0
  111. }
  112. // Wide returns the wide variant of a rune or 0 if the rune is already
  113. // wide or doesn't have a wide variant.
  114. func (p Properties) Wide() rune {
  115. if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) {
  116. buf := inverseData[byte(p.elem)]
  117. buf[buf[0]] ^= p.last
  118. r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
  119. return r
  120. }
  121. return 0
  122. }
  123. // TODO for Properties:
  124. // - Add Fullwidth/Halfwidth or Inverted methods for computing variants
  125. // mapping.
  126. // - Add width information (including information on non-spacing runes).
// Transformer implements the transform.Transformer interface.
type Transformer struct {
	// t is the wrapped transformer that every method of Transformer
	// delegates to.
	t transform.SpanningTransformer
}
  131. // Reset implements the transform.Transformer interface.
  132. func (t Transformer) Reset() { t.t.Reset() }
  133. // Transform implements the transform.Transformer interface.
  134. func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  135. return t.t.Transform(dst, src, atEOF)
  136. }
  137. // Span implements the transform.SpanningTransformer interface.
  138. func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) {
  139. return t.t.Span(src, atEOF)
  140. }
  141. // Bytes returns a new byte slice with the result of applying t to b.
  142. func (t Transformer) Bytes(b []byte) []byte {
  143. b, _, _ = transform.Bytes(t, b)
  144. return b
  145. }
  146. // String returns a string with the result of applying t to s.
  147. func (t Transformer) String(s string) string {
  148. s, _, _ = transform.String(t, s)
  149. return s
  150. }
  151. var (
  152. // Fold is a transform that maps all runes to their canonical width.
  153. //
  154. // Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm
  155. // provide a more generic folding mechanism.
  156. Fold Transformer = Transformer{foldTransform{}}
  157. // Widen is a transform that maps runes to their wide variant, if
  158. // available.
  159. Widen Transformer = Transformer{wideTransform{}}
  160. // Narrow is a transform that maps runes to their narrow variant, if
  161. // available.
  162. Narrow Transformer = Transformer{narrowTransform{}}
  163. )
  164. // TODO: Consider the following options:
  165. // - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
  166. // generalized variant of this.
  167. // - Consider a wide Won character to be the default width (or some generalized
  168. // variant of this).
  169. // - Filter the set of characters that gets converted (the preferred approach is
  170. // to allow applying filters to transforms).