123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270 |
- // Copyright 2014 Richard Lehane. All rights reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package types
- import (
- "encoding/binary"
- "strings"
- "unicode/utf16"
- )
// nullTerminated returns the part of s that precedes its first NUL ("\x00")
// byte. A string that contains no NUL byte is returned unchanged, so callers
// decoding unterminated data get the whole text back instead of a slice
// bounds panic.
func nullTerminated(s string) string {
	// IndexByte reports -1 when no NUL is present; that value must never be
	// used as a slice bound.
	if i := strings.IndexByte(s, 0); i >= 0 {
		return s[:i]
	}
	return s
}
- type UnicodeString []uint16
- func (s UnicodeString) Type() string {
- return "UnicodeString"
- }
- func (s UnicodeString) Length() int {
- return 4 + len(s)*2
- }
- func (s UnicodeString) String() string {
- if len(s) == 0 {
- return ""
- }
- return nullTerminated(string(utf16.Decode(s)))
- }
- func MakeUnicode(b []byte) (Type, error) {
- if len(b) < 4 {
- return UnicodeString{}, ErrType
- }
- l := int(binary.LittleEndian.Uint32(b[:4]))
- if l == 0 {
- return UnicodeString{}, nil
- }
- if len(b) < l*2+4 {
- return UnicodeString{}, ErrType
- }
- s := make(UnicodeString, l)
- for i := range s {
- start := i*2 + 4
- s[i] = binary.LittleEndian.Uint16(b[start : start+2])
- }
- return s, nil
- }
- type CodeString struct {
- id CodePageID
- Chars []byte
- }
- func (s *CodeString) SetId(i CodePageID) {
- s.id = i
- }
- func (s *CodeString) Encoding() string {
- return CodePageIDs[s.id]
- }
- func (s *CodeString) Type() string {
- return "CodeString"
- }
- func (s *CodeString) Length() int {
- return 4 + len(s.Chars)
- }
- func (s *CodeString) String() string {
- if len(s.Chars) == 0 {
- return ""
- }
- if s.id == 1200 {
- chars := make([]uint16, len(s.Chars)/2)
- for i := range chars {
- chars[i] = binary.LittleEndian.Uint16(s.Chars[i*2 : i*2+2])
- }
- return nullTerminated(string(utf16.Decode(chars)))
- }
- return nullTerminated(string(s.Chars))
- }
- func MakeCodeString(b []byte) (Type, error) {
- if len(b) < 4 {
- return &CodeString{}, ErrType
- }
- s := &CodeString{}
- l := int(binary.LittleEndian.Uint32(b[:4]))
- if l == 0 {
- return s, nil
- }
- if len(b) < l+4 {
- return s, ErrType
- }
- s.Chars = make([]byte, l)
- copy(s.Chars, b[4:l+4])
- return s, nil
- }
// CodePageID is a numeric code page identifier.
type CodePageID uint16

// CodePageIDs maps code page identifiers to a human-readable description,
// which for most entries is the code page name followed by a short summary.
// Looking up an identifier that is not listed yields "".
//
// NOTE(review): the entries appear to follow Microsoft's "Code Page
// Identifiers" table - confirm against that table before editing.
var CodePageIDs = map[CodePageID]string{
	37:    "IBM037 - IBM EBCDIC US-Canada",
	437:   "IBM437 - OEM United States",
	500:   "IBM500 - IBM EBCDIC International",
	708:   "ASMO-708 - Arabic (ASMO 708)",
	709:   "Arabic (ASMO-449+, BCON V4)",
	710:   "Arabic - Transparent Arabic",
	720:   "DOS-720 - Arabic (Transparent ASMO); Arabic (DOS)",
	737:   "ibm737 - OEM Greek (formerly 437G); Greek (DOS)",
	775:   "ibm775 - OEM Baltic; Baltic (DOS)",
	850:   "ibm850 - OEM Multilingual Latin 1; Western European (DOS)",
	852:   "ibm852 - OEM Latin 2; Central European (DOS)",
	855:   "IBM855 - OEM Cyrillic (primarily Russian)",
	857:   "ibm857 - OEM Turkish; Turkish (DOS)",
	858:   "IBM00858 - OEM Multilingual Latin 1 + Euro symbol",
	860:   "IBM860 - OEM Portuguese; Portuguese (DOS)",
	861:   "ibm861 - OEM Icelandic; Icelandic (DOS)",
	862:   "DOS-862 - OEM Hebrew; Hebrew (DOS)",
	863:   "IBM863 - OEM French Canadian; French Canadian (DOS)",
	864:   "IBM864 - OEM Arabic; Arabic (864)",
	865:   "IBM865 - OEM Nordic; Nordic (DOS)",
	866:   "cp866 - OEM Russian; Cyrillic (DOS)",
	869:   "ibm869 - OEM Modern Greek; Greek, Modern (DOS)",
	870:   "IBM870 - IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2",
	874:   "windows-874 - ANSI/OEM Thai (ISO 8859-11); Thai (Windows)",
	875:   "cp875 - IBM EBCDIC Greek Modern",
	932:   "shift_jis - ANSI/OEM Japanese; Japanese (Shift-JIS)",
	936:   "gb2312 - ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)",
	949:   "ks_c_5601-1987 - ANSI/OEM Korean (Unified Hangul Code)",
	950:   "big5 - ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)",
	1026:  "IBM1026 - IBM EBCDIC Turkish (Latin 5)",
	1047:  "IBM01047 - IBM EBCDIC Latin 1/Open System",
	1140:  "IBM01140 - IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)",
	1141:  "IBM01141 - IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)",
	1142:  "IBM01142 - IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)",
	1143:  "IBM01143 - IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)",
	1144:  "IBM01144 - IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)",
	1145:  "IBM01145 - IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)",
	1146:  "IBM01146 - IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)",
	1147:  "IBM01147 - IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)",
	1148:  "IBM01148 - IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)",
	1149:  "IBM01149 - IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)",
	1200:  "utf-16 - Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications",
	1201:  "unicodeFFFE - Unicode UTF-16, big endian byte order; available only to managed applications",
	1250:  "windows-1250 - ANSI Central European; Central European (Windows)",
	1251:  "windows-1251 - ANSI Cyrillic; Cyrillic (Windows)",
	1252:  "windows-1252 - ANSI Latin 1; Western European (Windows)",
	1253:  "windows-1253 - ANSI Greek; Greek (Windows)",
	1254:  "windows-1254 - ANSI Turkish; Turkish (Windows)",
	1255:  "windows-1255 - ANSI Hebrew; Hebrew (Windows)",
	1256:  "windows-1256 - ANSI Arabic; Arabic (Windows)",
	1257:  "windows-1257 - ANSI Baltic; Baltic (Windows)",
	1258:  "windows-1258 - ANSI/OEM Vietnamese; Vietnamese (Windows)",
	1361:  "Johab - Korean (Johab)",
	10000: "macintosh - MAC Roman; Western European (Mac)",
	10001: "x-mac-japanese - Japanese (Mac)",
	10002: "x-mac-chinesetrad - MAC Traditional Chinese (Big5); Chinese Traditional (Mac)",
	10003: "x-mac-korean - Korean (Mac)",
	10004: "x-mac-arabic - Arabic (Mac)",
	10005: "x-mac-hebrew - Hebrew (Mac)",
	10006: "x-mac-greek - Greek (Mac)",
	10007: "x-mac-cyrillic - Cyrillic (Mac)",
	10008: "x-mac-chinesesimp - MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)",
	10010: "x-mac-romanian - Romanian (Mac)",
	10017: "x-mac-ukrainian - Ukrainian (Mac)",
	10021: "x-mac-thai - Thai (Mac)",
	10029: "x-mac-ce - MAC Latin 2; Central European (Mac)",
	10079: "x-mac-icelandic - Icelandic (Mac)",
	10081: "x-mac-turkish - Turkish (Mac)",
	10082: "x-mac-croatian - Croatian (Mac)",
	12000: "utf-32 - Unicode UTF-32, little endian byte order; available only to managed applications",
	12001: "utf-32BE - Unicode UTF-32, big endian byte order; available only to managed applications",
	20000: "x-Chinese_CNS - CNS Taiwan; Chinese Traditional (CNS)",
	20001: "x-cp20001 - TCA Taiwan",
	20002: "x_Chinese-Eten - Eten Taiwan; Chinese Traditional (Eten)",
	20003: "x-cp20003 - IBM5550 Taiwan",
	20004: "x-cp20004 - TeleText Taiwan",
	20005: "x-cp20005 - Wang Taiwan",
	20105: "x-IA5 - IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)",
	20106: "x-IA5-German - IA5 German (7-bit)",
	20107: "x-IA5-Swedish - IA5 Swedish (7-bit)",
	20108: "x-IA5-Norwegian - IA5 Norwegian (7-bit)",
	20127: "us-ascii - US-ASCII (7-bit)",
	20261: "x-cp20261 - T.61",
	20269: "x-cp20269 - ISO 6937 Non-Spacing Accent",
	20273: "IBM273 - IBM EBCDIC Germany",
	20277: "IBM277 - IBM EBCDIC Denmark-Norway",
	20278: "IBM278 - IBM EBCDIC Finland-Sweden",
	20280: "IBM280 - IBM EBCDIC Italy",
	20284: "IBM284 - IBM EBCDIC Latin America-Spain",
	20285: "IBM285 - IBM EBCDIC United Kingdom",
	20290: "IBM290 - IBM EBCDIC Japanese Katakana Extended",
	20297: "IBM297 - IBM EBCDIC France",
	20420: "IBM420 - IBM EBCDIC Arabic",
	20423: "IBM423 - IBM EBCDIC Greek",
	20424: "IBM424 - IBM EBCDIC Hebrew",
	20833: "x-EBCDIC-KoreanExtended - IBM EBCDIC Korean Extended",
	20838: "IBM-Thai - IBM EBCDIC Thai",
	20866: "koi8-r - Russian (KOI8-R); Cyrillic (KOI8-R)",
	20871: "IBM871 - IBM EBCDIC Icelandic",
	20880: "IBM880 - IBM EBCDIC Cyrillic Russian",
	20905: "IBM905 - IBM EBCDIC Turkish",
	20924: "IBM00924 - IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)",
	20932: "EUC-JP - Japanese (JIS 0208-1990 and 0212-1990)",
	20936: "x-cp20936 - Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)",
	20949: "x-cp20949 - Korean Wansung",
	21025: "cp1025 - IBM EBCDIC Cyrillic Serbian-Bulgarian",
	21027: "(deprecated)",
	21866: "koi8-u - Ukrainian (KOI8-U); Cyrillic (KOI8-U)",
	28591: "iso-8859-1 - ISO 8859-1 Latin 1; Western European (ISO)",
	28592: "iso-8859-2 - ISO 8859-2 Central European; Central European (ISO)",
	28593: "iso-8859-3 - ISO 8859-3 Latin 3",
	28594: "iso-8859-4 - ISO 8859-4 Baltic",
	28595: "iso-8859-5 - ISO 8859-5 Cyrillic",
	28596: "iso-8859-6 - ISO 8859-6 Arabic",
	28597: "iso-8859-7 - ISO 8859-7 Greek",
	28598: "iso-8859-8 - ISO 8859-8 Hebrew; Hebrew (ISO-Visual)",
	28599: "iso-8859-9 - ISO 8859-9 Turkish",
	28603: "iso-8859-13 - ISO 8859-13 Estonian",
	28605: "iso-8859-15 - ISO 8859-15 Latin 9",
	29001: "x-Europa - Europa 3",
	38598: "iso-8859-8-i - ISO 8859-8 Hebrew; Hebrew (ISO-Logical)",
	50220: "iso-2022-jp - ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)",
	50221: "csISO2022JP - ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)",
	50222: "iso-2022-jp - ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)",
	50225: "iso-2022-kr - ISO 2022 Korean",
	50227: "x-cp50227 - ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)",
	50229: "ISO 2022 - Traditional Chinese",
	50930: "EBCDIC - Japanese (Katakana) Extended",
	50931: "EBCDIC - US-Canada and Japanese",
	50933: "EBCDIC - Korean Extended and Korean",
	50935: "EBCDIC - Simplified Chinese Extended and Simplified Chinese",
	50936: "EBCDIC - Simplified Chinese",
	50937: "EBCDIC - US-Canada and Traditional Chinese",
	50939: "EBCDIC - Japanese (Latin) Extended and Japanese",
	51932: "euc-jp - EUC Japanese",
	51936: "EUC-CN - EUC Simplified Chinese; Chinese Simplified (EUC)",
	51949: "euc-kr - EUC Korean",
	51950: "EUC - Traditional Chinese",
	52936: "hz-gb-2312 - HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)",
	54936: "GB18030 - Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)",
	57002: "x-iscii-de - ISCII Devanagari",
	57003: "x-iscii-be - ISCII Bengali",
	57004: "x-iscii-ta - ISCII Tamil",
	57005: "x-iscii-te - ISCII Telugu",
	57006: "x-iscii-as - ISCII Assamese",
	57007: "x-iscii-or - ISCII Oriya",
	57008: "x-iscii-ka - ISCII Kannada",
	57009: "x-iscii-ma - ISCII Malayalam",
	57010: "x-iscii-gu - ISCII Gujarati",
	57011: "x-iscii-pa - ISCII Punjabi",
	65000: "utf-7 - Unicode (UTF-7)",
	65001: "utf-8 - Unicode (UTF-8)",
}
|