strings.go 11 KB


  1. // Copyright 2014 Richard Lehane. All rights reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package types
  15. import (
  16. "encoding/binary"
  17. "strings"
  18. "unicode/utf16"
  19. )
  // nullTerminated returns the part of s that precedes its first NUL ("\x00")
  // byte. When s contains no NUL byte it is returned unchanged.
  func nullTerminated(s string) string {
  	// A missing terminator is reported as -1; slicing with that index would
  	// panic, so only truncate when a terminator was actually found.
  	if end := strings.IndexByte(s, 0); end >= 0 {
  		return s[:end]
  	}
  	return s
  }
  23. type UnicodeString []uint16
  24. func (s UnicodeString) Type() string {
  25. return "UnicodeString"
  26. }
  27. func (s UnicodeString) Length() int {
  28. return 4 + len(s)*2
  29. }
  30. func (s UnicodeString) String() string {
  31. if len(s) == 0 {
  32. return ""
  33. }
  34. return nullTerminated(string(utf16.Decode(s)))
  35. }
  36. func MakeUnicode(b []byte) (Type, error) {
  37. if len(b) < 4 {
  38. return UnicodeString{}, ErrType
  39. }
  40. l := int(binary.LittleEndian.Uint32(b[:4]))
  41. if l == 0 {
  42. return UnicodeString{}, nil
  43. }
  44. if len(b) < l*2+4 {
  45. return UnicodeString{}, ErrType
  46. }
  47. s := make(UnicodeString, l)
  48. for i := range s {
  49. start := i*2 + 4
  50. s[i] = binary.LittleEndian.Uint16(b[start : start+2])
  51. }
  52. return s, nil
  53. }
  54. type CodeString struct {
  55. id CodePageID
  56. Chars []byte
  57. }
  58. func (s *CodeString) SetId(i CodePageID) {
  59. s.id = i
  60. }
  61. func (s *CodeString) Encoding() string {
  62. return CodePageIDs[s.id]
  63. }
  64. func (s *CodeString) Type() string {
  65. return "CodeString"
  66. }
  67. func (s *CodeString) Length() int {
  68. return 4 + len(s.Chars)
  69. }
  70. func (s *CodeString) String() string {
  71. if len(s.Chars) == 0 {
  72. return ""
  73. }
  74. if s.id == 1200 {
  75. chars := make([]uint16, len(s.Chars)/2)
  76. for i := range chars {
  77. chars[i] = binary.LittleEndian.Uint16(s.Chars[i*2 : i*2+2])
  78. }
  79. return nullTerminated(string(utf16.Decode(chars)))
  80. }
  81. return nullTerminated(string(s.Chars))
  82. }
  83. func MakeCodeString(b []byte) (Type, error) {
  84. if len(b) < 4 {
  85. return &CodeString{}, ErrType
  86. }
  87. s := &CodeString{}
  88. l := int(binary.LittleEndian.Uint32(b[:4]))
  89. if l == 0 {
  90. return s, nil
  91. }
  92. if len(b) < l+4 {
  93. return s, ErrType
  94. }
  95. s.Chars = make([]byte, l)
  96. copy(s.Chars, b[4:l+4])
  97. return s, nil
  98. }
  // CodePageID is a numeric code page identifier.
  type CodePageID uint16

  // CodePageIDs maps code page identifiers to a human-readable label. Most
  // labels have the form "<name> - <description>"; a few carry a description
  // only. Looking up an identifier that is not listed yields "".
  //
  // NOTE(review): the entries appear to follow Microsoft's "Code Page
  // Identifiers" table; confirm against that table before adding or changing
  // entries.
  var CodePageIDs = map[CodePageID]string{
  	37: "IBM037 - IBM EBCDIC US-Canada",
  	437: "IBM437 - OEM United States",
  	500: "IBM500 - IBM EBCDIC International",
  	708: "ASMO-708 - Arabic (ASMO 708)",
  	709: "Arabic (ASMO-449+, BCON V4)",
  	710: "Arabic - Transparent Arabic",
  	720: "DOS-720 - Arabic (Transparent ASMO); Arabic (DOS)",
  	737: "ibm737 - OEM Greek (formerly 437G); Greek (DOS)",
  	775: "ibm775 - OEM Baltic; Baltic (DOS)",
  	850: "ibm850 - OEM Multilingual Latin 1; Western European (DOS)",
  	852: "ibm852 - OEM Latin 2; Central European (DOS)",
  	855: "IBM855 - OEM Cyrillic (primarily Russian)",
  	857: "ibm857 - OEM Turkish; Turkish (DOS)",
  	858: "IBM00858 - OEM Multilingual Latin 1 + Euro symbol",
  	860: "IBM860 - OEM Portuguese; Portuguese (DOS)",
  	861: "ibm861 - OEM Icelandic; Icelandic (DOS)",
  	862: "DOS-862 - OEM Hebrew; Hebrew (DOS)",
  	863: "IBM863 - OEM French Canadian; French Canadian (DOS)",
  	864: "IBM864 - OEM Arabic; Arabic (864)",
  	865: "IBM865 - OEM Nordic; Nordic (DOS)",
  	866: "cp866 - OEM Russian; Cyrillic (DOS)",
  	869: "ibm869 - OEM Modern Greek; Greek, Modern (DOS)",
  	870: "IBM870 - IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2",
  	874: "windows-874 - ANSI/OEM Thai (ISO 8859-11); Thai (Windows)",
  	875: "cp875 - IBM EBCDIC Greek Modern",
  	932: "shift_jis - ANSI/OEM Japanese; Japanese (Shift-JIS)",
  	936: "gb2312 - ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)",
  	949: "ks_c_5601-1987 - ANSI/OEM Korean (Unified Hangul Code)",
  	950: "big5 - ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)",
  	1026: "IBM1026 - IBM EBCDIC Turkish (Latin 5)",
  	1047: "IBM01047 - IBM EBCDIC Latin 1/Open System",
  	1140: "IBM01140 - IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)",
  	1141: "IBM01141 - IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)",
  	1142: "IBM01142 - IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)",
  	1143: "IBM01143 - IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)",
  	1144: "IBM01144 - IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)",
  	1145: "IBM01145 - IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)",
  	1146: "IBM01146 - IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)",
  	1147: "IBM01147 - IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)",
  	1148: "IBM01148 - IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)",
  	1149: "IBM01149 - IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)",
  	1200: "utf-16 - Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications",
  	1201: "unicodeFFFE - Unicode UTF-16, big endian byte order; available only to managed applications",
  	1250: "windows-1250 - ANSI Central European; Central European (Windows)",
  	1251: "windows-1251 - ANSI Cyrillic; Cyrillic (Windows)",
  	1252: "windows-1252 - ANSI Latin 1; Western European (Windows)",
  	1253: "windows-1253 - ANSI Greek; Greek (Windows)",
  	1254: "windows-1254 - ANSI Turkish; Turkish (Windows)",
  	1255: "windows-1255 - ANSI Hebrew; Hebrew (Windows)",
  	1256: "windows-1256 - ANSI Arabic; Arabic (Windows)",
  	1257: "windows-1257 - ANSI Baltic; Baltic (Windows)",
  	1258: "windows-1258 - ANSI/OEM Vietnamese; Vietnamese (Windows)",
  	1361: "Johab - Korean (Johab)",
  	10000: "macintosh - MAC Roman; Western European (Mac)",
  	10001: "x-mac-japanese - Japanese (Mac)",
  	10002: "x-mac-chinesetrad - MAC Traditional Chinese (Big5); Chinese Traditional (Mac)",
  	10003: "x-mac-korean - Korean (Mac)",
  	10004: "x-mac-arabic - Arabic (Mac)",
  	10005: "x-mac-hebrew - Hebrew (Mac)",
  	10006: "x-mac-greek - Greek (Mac)",
  	10007: "x-mac-cyrillic - Cyrillic (Mac)",
  	10008: "x-mac-chinesesimp - MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)",
  	10010: "x-mac-romanian - Romanian (Mac)",
  	10017: "x-mac-ukrainian - Ukrainian (Mac)",
  	10021: "x-mac-thai - Thai (Mac)",
  	10029: "x-mac-ce - MAC Latin 2; Central European (Mac)",
  	10079: "x-mac-icelandic - Icelandic (Mac)",
  	10081: "x-mac-turkish - Turkish (Mac)",
  	10082: "x-mac-croatian - Croatian (Mac)",
  	12000: "utf-32 - Unicode UTF-32, little endian byte order; available only to managed applications",
  	12001: "utf-32BE - Unicode UTF-32, big endian byte order; available only to managed applications",
  	20000: "x-Chinese_CNS - CNS Taiwan; Chinese Traditional (CNS)",
  	20001: "x-cp20001 - TCA Taiwan",
  	20002: "x_Chinese-Eten - Eten Taiwan; Chinese Traditional (Eten)",
  	20003: "x-cp20003 - IBM5550 Taiwan",
  	20004: "x-cp20004 - TeleText Taiwan",
  	20005: "x-cp20005 - Wang Taiwan",
  	20105: "x-IA5 - IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)",
  	20106: "x-IA5-German - IA5 German (7-bit)",
  	20107: "x-IA5-Swedish - IA5 Swedish (7-bit)",
  	20108: "x-IA5-Norwegian - IA5 Norwegian (7-bit)",
  	20127: "us-ascii - US-ASCII (7-bit)",
  	20261: "x-cp20261 - T.61",
  	20269: "x-cp20269 - ISO 6937 Non-Spacing Accent",
  	20273: "IBM273 - IBM EBCDIC Germany",
  	20277: "IBM277 - IBM EBCDIC Denmark-Norway",
  	20278: "IBM278 - IBM EBCDIC Finland-Sweden",
  	20280: "IBM280 - IBM EBCDIC Italy",
  	20284: "IBM284 - IBM EBCDIC Latin America-Spain",
  	20285: "IBM285 - IBM EBCDIC United Kingdom",
  	20290: "IBM290 - IBM EBCDIC Japanese Katakana Extended",
  	20297: "IBM297 - IBM EBCDIC France",
  	20420: "IBM420 - IBM EBCDIC Arabic",
  	20423: "IBM423 - IBM EBCDIC Greek",
  	20424: "IBM424 - IBM EBCDIC Hebrew",
  	20833: "x-EBCDIC-KoreanExtended - IBM EBCDIC Korean Extended",
  	20838: "IBM-Thai - IBM EBCDIC Thai",
  	20866: "koi8-r - Russian (KOI8-R); Cyrillic (KOI8-R)",
  	20871: "IBM871 - IBM EBCDIC Icelandic",
  	20880: "IBM880 - IBM EBCDIC Cyrillic Russian",
  	20905: "IBM905 - IBM EBCDIC Turkish",
  	20924: "IBM00924 - IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)",
  	20932: "EUC-JP - Japanese (JIS 0208-1990 and 0212-1990)",
  	20936: "x-cp20936 - Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)",
  	20949: "x-cp20949 - Korean Wansung",
  	21025: "cp1025 - IBM EBCDIC Cyrillic Serbian-Bulgarian",
  	21027: "(deprecated)",
  	21866: "koi8-u - Ukrainian (KOI8-U); Cyrillic (KOI8-U)",
  	28591: "iso-8859-1 - ISO 8859-1 Latin 1; Western European (ISO)",
  	28592: "iso-8859-2 - ISO 8859-2 Central European; Central European (ISO)",
  	28593: "iso-8859-3 - ISO 8859-3 Latin 3",
  	28594: "iso-8859-4 - ISO 8859-4 Baltic",
  	28595: "iso-8859-5 - ISO 8859-5 Cyrillic",
  	28596: "iso-8859-6 - ISO 8859-6 Arabic",
  	28597: "iso-8859-7 - ISO 8859-7 Greek",
  	28598: "iso-8859-8 - ISO 8859-8 Hebrew; Hebrew (ISO-Visual)",
  	28599: "iso-8859-9 - ISO 8859-9 Turkish",
  	28603: "iso-8859-13 - ISO 8859-13 Estonian",
  	28605: "iso-8859-15 - ISO 8859-15 Latin 9",
  	29001: "x-Europa - Europa 3",
  	38598: "iso-8859-8-i - ISO 8859-8 Hebrew; Hebrew (ISO-Logical)",
  	50220: "iso-2022-jp - ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)",
  	50221: "csISO2022JP - ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)",
  	50222: "iso-2022-jp - ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)",
  	50225: "iso-2022-kr - ISO 2022 Korean",
  	50227: "x-cp50227 - ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)",
  	50229: "ISO 2022 - Traditional Chinese",
  	50930: "EBCDIC - Japanese (Katakana) Extended",
  	50931: "EBCDIC - US-Canada and Japanese",
  	50933: "EBCDIC - Korean Extended and Korean",
  	50935: "EBCDIC - Simplified Chinese Extended and Simplified Chinese",
  	50936: "EBCDIC - Simplified Chinese",
  	50937: "EBCDIC - US-Canada and Traditional Chinese",
  	50939: "EBCDIC - Japanese (Latin) Extended and Japanese",
  	51932: "euc-jp - EUC Japanese",
  	51936: "EUC-CN - EUC Simplified Chinese; Chinese Simplified (EUC)",
  	51949: "euc-kr - EUC Korean",
  	51950: "EUC - Traditional Chinese",
  	52936: "hz-gb-2312 - HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)",
  	54936: "GB18030 - Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)",
  	57002: "x-iscii-de - ISCII Devanagari",
  	57003: "x-iscii-be - ISCII Bengali",
  	57004: "x-iscii-ta - ISCII Tamil",
  	57005: "x-iscii-te - ISCII Telugu",
  	57006: "x-iscii-as - ISCII Assamese",
  	57007: "x-iscii-or - ISCII Oriya",
  	57008: "x-iscii-ka - ISCII Kannada",
  	57009: "x-iscii-ma - ISCII Malayalam",
  	57010: "x-iscii-gu - ISCII Gujarati",
  	57011: "x-iscii-pa - ISCII Punjabi",
  	65000: "utf-7 - Unicode (UTF-7)",
  	65001: "utf-8 - Unicode (UTF-8)",
  }