// Copyright 2014 Richard Lehane. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package types

import (
	"encoding/binary"
	"strings"
	"unicode/utf16"
)

// nullTerminated returns the part of s that precedes the first NUL ("\x00")
// character. When s contains no NUL it is returned unchanged; slicing with
// the -1 that an index search reports for "not found" would panic.
func nullTerminated(s string) string {
	if i := strings.IndexByte(s, 0); i >= 0 {
		return s[:i]
	}
	return s
}

// UnicodeString holds a string as a slice of 16-bit code units, which its
// String method decodes as UTF-16.
type UnicodeString []uint16

// Type reports the name of this type, "UnicodeString".
func (s UnicodeString) Type() string {
	const name = "UnicodeString"
	return name
}

// Length returns 4 plus two bytes for every code unit held in s.
func (s UnicodeString) Length() int {
	units := len(s)
	return 2*units + 4
}

// String decodes s as UTF-16 and returns the text that precedes the first
// NUL character. An empty s yields the empty string.
func (s UnicodeString) String() string {
	if len(s) > 0 {
		decoded := utf16.Decode(s)
		return nullTerminated(string(decoded))
	}
	return ""
}

// MakeUnicode parses a length-prefixed UTF-16 string from b.
//
// The first four bytes of b are read as a little-endian uint32 count of
// 16-bit code units; the code units follow, each little-endian. A count of
// zero yields an empty UnicodeString. ErrType is returned when b is shorter
// than four bytes or too short to hold the declared number of code units.
func MakeUnicode(b []byte) (Type, error) {
	if len(b) < 4 {
		return UnicodeString{}, ErrType
	}
	n := binary.LittleEndian.Uint32(b[:4])
	if n == 0 {
		return UnicodeString{}, nil
	}
	// Compare in uint64: where int is 32 bits wide, a huge declared count
	// would overflow or turn negative in int arithmetic, slip past the bounds
	// check and panic in make.
	if uint64(n)*2 > uint64(len(b)-4) {
		return UnicodeString{}, ErrType
	}
	s := make(UnicodeString, n)
	payload := b[4:]
	for i := range s {
		s[i] = binary.LittleEndian.Uint16(payload[i*2:])
	}
	return s, nil
}

// CodeString is a byte string paired with the identifier of the code page
// that its bytes are associated with.
type CodeString struct {
	id    CodePageID // code page identifier; assigned through SetId
	Chars []byte     // raw bytes of the string
}

// SetId stores i as the code page identifier of the string.
func (s *CodeString) SetId(i CodePageID) {
	s.id = i
}

// Encoding looks up the string's code page identifier in CodePageIDs and
// returns the matching description, or "" when the identifier is not listed.
func (s *CodeString) Encoding() string {
	description := CodePageIDs[s.id]
	return description
}

// Type reports the name of this type, "CodeString".
func (s *CodeString) Type() string {
	const name = "CodeString"
	return name
}

// Length returns 4 plus the number of bytes held in Chars.
func (s *CodeString) Length() int {
	return len(s.Chars) + 4
}

// String returns the text held in Chars up to the first NUL character.
//
// When the code page identifier is 1200 (listed in CodePageIDs as UTF-16,
// little endian) the bytes are first decoded as little-endian UTF-16; a
// trailing odd byte is ignored. For every other code page the bytes are
// converted to a string as they are, with no code page translation.
func (s *CodeString) String() string {
	switch {
	case len(s.Chars) == 0:
		return ""
	case s.id != 1200:
		return nullTerminated(string(s.Chars))
	}
	units := make([]uint16, 0, len(s.Chars)/2)
	for i := 0; i+1 < len(s.Chars); i += 2 {
		units = append(units, binary.LittleEndian.Uint16(s.Chars[i:i+2]))
	}
	return nullTerminated(string(utf16.Decode(units)))
}

// MakeCodeString parses a length-prefixed byte string from b.
//
// The first four bytes of b are read as a little-endian uint32 byte count;
// that many bytes following the prefix are copied into the result's Chars.
// A count of zero yields a CodeString with no Chars. ErrType is returned
// when b is shorter than four bytes or too short to hold the declared number
// of bytes. The code page identifier is left at its zero value.
func MakeCodeString(b []byte) (Type, error) {
	s := &CodeString{}
	if len(b) < 4 {
		return s, ErrType
	}
	n := binary.LittleEndian.Uint32(b[:4])
	if n == 0 {
		return s, nil
	}
	// Compare in uint64: where int is 32 bits wide, a huge declared count
	// would overflow or turn negative in int arithmetic, slip past the bounds
	// check and panic in make.
	if uint64(n) > uint64(len(b)-4) {
		return s, ErrType
	}
	s.Chars = make([]byte, n)
	// copy stops at len(s.Chars), so exactly n bytes after the prefix are taken.
	copy(s.Chars, b[4:])
	return s, nil
}

// CodePageID is a numeric code page identifier.
type CodePageID uint16

// CodePageIDs maps code page identifiers to a human-readable description,
// generally of the form "name - description". Looking up an identifier that
// is not listed yields the empty string.
//
// NOTE(review): the entries appear to follow Microsoft's "Code Page
// Identifiers" list — confirm against that list before adding entries.
var CodePageIDs = map[CodePageID]string{
	37:    "IBM037 - IBM EBCDIC US-Canada",
	437:   "IBM437 - OEM United States",
	500:   "IBM500 - IBM EBCDIC International",
	708:   "ASMO-708 - Arabic (ASMO 708)",
	709:   "Arabic (ASMO-449+, BCON V4)",
	710:   "Arabic - Transparent Arabic",
	720:   "DOS-720 - Arabic (Transparent ASMO); Arabic (DOS)",
	737:   "ibm737 - OEM Greek (formerly 437G); Greek (DOS)",
	775:   "ibm775 - OEM Baltic; Baltic (DOS)",
	850:   "ibm850 - OEM Multilingual Latin 1; Western European (DOS)",
	852:   "ibm852 - OEM Latin 2; Central European (DOS)",
	855:   "IBM855 - OEM Cyrillic (primarily Russian)",
	857:   "ibm857 - OEM Turkish; Turkish (DOS)",
	858:   "IBM00858 - OEM Multilingual Latin 1 + Euro symbol",
	860:   "IBM860 - OEM Portuguese; Portuguese (DOS)",
	861:   "ibm861 - OEM Icelandic; Icelandic (DOS)",
	862:   "DOS-862 - OEM Hebrew; Hebrew (DOS)",
	863:   "IBM863 - OEM French Canadian; French Canadian (DOS)",
	864:   "IBM864 - OEM Arabic; Arabic (864)",
	865:   "IBM865 - OEM Nordic; Nordic (DOS)",
	866:   "cp866 - OEM Russian; Cyrillic (DOS)",
	869:   "ibm869 - OEM Modern Greek; Greek, Modern (DOS)",
	870:   "IBM870 - IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2",
	874:   "windows-874 - ANSI/OEM Thai (ISO 8859-11); Thai (Windows)",
	875:   "cp875 - IBM EBCDIC Greek Modern",
	932:   "shift_jis - ANSI/OEM Japanese; Japanese (Shift-JIS)",
	936:   "gb2312 - ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)",
	949:   "ks_c_5601-1987 - ANSI/OEM Korean (Unified Hangul Code)",
	950:   "big5 - ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)",
	1026:  "IBM1026 - IBM EBCDIC Turkish (Latin 5)",
	1047:  "IBM01047 - IBM EBCDIC Latin 1/Open System",
	1140:  "IBM01140 - IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)",
	1141:  "IBM01141 - IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)",
	1142:  "IBM01142 - IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)",
	1143:  "IBM01143 - IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)",
	1144:  "IBM01144 - IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)",
	1145:  "IBM01145 - IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)",
	1146:  "IBM01146 - IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)",
	1147:  "IBM01147 - IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)",
	1148:  "IBM01148 - IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)",
	1149:  "IBM01149 - IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)",
	1200:  "utf-16 - Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications",
	1201:  "unicodeFFFE - Unicode UTF-16, big endian byte order; available only to managed applications",
	1250:  "windows-1250 - ANSI Central European; Central European (Windows)",
	1251:  "windows-1251 - ANSI Cyrillic; Cyrillic (Windows)",
	1252:  "windows-1252 - ANSI Latin 1; Western European (Windows)",
	1253:  "windows-1253 - ANSI Greek; Greek (Windows)",
	1254:  "windows-1254 - ANSI Turkish; Turkish (Windows)",
	1255:  "windows-1255 - ANSI Hebrew; Hebrew (Windows)",
	1256:  "windows-1256 - ANSI Arabic; Arabic (Windows)",
	1257:  "windows-1257 - ANSI Baltic; Baltic (Windows)",
	1258:  "windows-1258 - ANSI/OEM Vietnamese; Vietnamese (Windows)",
	1361:  "Johab - Korean (Johab)",
	10000: "macintosh - MAC Roman; Western European (Mac)",
	10001: "x-mac-japanese - Japanese (Mac)",
	10002: "x-mac-chinesetrad - MAC Traditional Chinese (Big5); Chinese Traditional (Mac)",
	10003: "x-mac-korean - Korean (Mac)",
	10004: "x-mac-arabic - Arabic (Mac)",
	10005: "x-mac-hebrew - Hebrew (Mac)",
	10006: "x-mac-greek - Greek (Mac)",
	10007: "x-mac-cyrillic - Cyrillic (Mac)",
	10008: "x-mac-chinesesimp - MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)",
	10010: "x-mac-romanian - Romanian (Mac)",
	10017: "x-mac-ukrainian - Ukrainian (Mac)",
	10021: "x-mac-thai - Thai (Mac)",
	10029: "x-mac-ce - MAC Latin 2; Central European (Mac)",
	10079: "x-mac-icelandic - Icelandic (Mac)",
	10081: "x-mac-turkish - Turkish (Mac)",
	10082: "x-mac-croatian - Croatian (Mac)",
	12000: "utf-32 - Unicode UTF-32, little endian byte order; available only to managed applications",
	12001: "utf-32BE - Unicode UTF-32, big endian byte order; available only to managed applications",
	20000: "x-Chinese_CNS - CNS Taiwan; Chinese Traditional (CNS)",
	20001: "x-cp20001 - TCA Taiwan",
	20002: "x_Chinese-Eten - Eten Taiwan; Chinese Traditional (Eten)",
	20003: "x-cp20003 - IBM5550 Taiwan",
	20004: "x-cp20004 - TeleText Taiwan",
	20005: "x-cp20005 - Wang Taiwan",
	20105: "x-IA5 - IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)",
	20106: "x-IA5-German - IA5 German (7-bit)",
	20107: "x-IA5-Swedish - IA5 Swedish (7-bit)",
	20108: "x-IA5-Norwegian - IA5 Norwegian (7-bit)",
	20127: "us-ascii - US-ASCII (7-bit)",
	20261: "x-cp20261 - T.61",
	20269: "x-cp20269 - ISO 6937 Non-Spacing Accent",
	20273: "IBM273 - IBM EBCDIC Germany",
	20277: "IBM277 - IBM EBCDIC Denmark-Norway",
	20278: "IBM278 - IBM EBCDIC Finland-Sweden",
	20280: "IBM280 - IBM EBCDIC Italy",
	20284: "IBM284 - IBM EBCDIC Latin America-Spain",
	20285: "IBM285 - IBM EBCDIC United Kingdom",
	20290: "IBM290 - IBM EBCDIC Japanese Katakana Extended",
	20297: "IBM297 - IBM EBCDIC France",
	20420: "IBM420 - IBM EBCDIC Arabic",
	20423: "IBM423 - IBM EBCDIC Greek",
	20424: "IBM424 - IBM EBCDIC Hebrew",
	20833: "x-EBCDIC-KoreanExtended - IBM EBCDIC Korean Extended",
	20838: "IBM-Thai - IBM EBCDIC Thai",
	20866: "koi8-r - Russian (KOI8-R); Cyrillic (KOI8-R)",
	20871: "IBM871 - IBM EBCDIC Icelandic",
	20880: "IBM880 - IBM EBCDIC Cyrillic Russian",
	20905: "IBM905 - IBM EBCDIC Turkish",
	20924: "IBM00924 - IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)",
	20932: "EUC-JP - Japanese (JIS 0208-1990 and 0212-1990)",
	20936: "x-cp20936 - Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)",
	20949: "x-cp20949 - Korean Wansung",
	21025: "cp1025 - IBM EBCDIC Cyrillic Serbian-Bulgarian",
	21027: "(deprecated)",
	21866: "koi8-u - Ukrainian (KOI8-U); Cyrillic (KOI8-U)",
	28591: "iso-8859-1 - ISO 8859-1 Latin 1; Western European (ISO)",
	28592: "iso-8859-2 - ISO 8859-2 Central European; Central European (ISO)",
	28593: "iso-8859-3 - ISO 8859-3 Latin 3",
	28594: "iso-8859-4 - ISO 8859-4 Baltic",
	28595: "iso-8859-5 - ISO 8859-5 Cyrillic",
	28596: "iso-8859-6 - ISO 8859-6 Arabic",
	28597: "iso-8859-7 - ISO 8859-7 Greek",
	28598: "iso-8859-8 - ISO 8859-8 Hebrew; Hebrew (ISO-Visual)",
	28599: "iso-8859-9 - ISO 8859-9 Turkish",
	28603: "iso-8859-13 - ISO 8859-13 Estonian",
	28605: "iso-8859-15 - ISO 8859-15 Latin 9",
	29001: "x-Europa - Europa 3",
	38598: "iso-8859-8-i - ISO 8859-8 Hebrew; Hebrew (ISO-Logical)",
	50220: "iso-2022-jp - ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)",
	50221: "csISO2022JP - ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)",
	50222: "iso-2022-jp - ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)",
	50225: "iso-2022-kr - ISO 2022 Korean",
	50227: "x-cp50227 - ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)",
	50229: "ISO 2022 - Traditional Chinese",
	50930: "EBCDIC - Japanese (Katakana) Extended",
	50931: "EBCDIC - US-Canada and Japanese",
	50933: "EBCDIC - Korean Extended and Korean",
	50935: "EBCDIC - Simplified Chinese Extended and Simplified Chinese",
	50936: "EBCDIC - Simplified Chinese",
	50937: "EBCDIC - US-Canada and Traditional Chinese",
	50939: "EBCDIC - Japanese (Latin) Extended and Japanese",
	51932: "euc-jp - EUC Japanese",
	51936: "EUC-CN - EUC Simplified Chinese; Chinese Simplified (EUC)",
	51949: "euc-kr - EUC Korean",
	51950: "EUC - Traditional Chinese",
	52936: "hz-gb-2312 - HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)",
	54936: "GB18030 - Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)",
	57002: "x-iscii-de - ISCII Devanagari",
	57003: "x-iscii-be - ISCII Bengali",
	57004: "x-iscii-ta - ISCII Tamil",
	57005: "x-iscii-te - ISCII Telugu",
	57006: "x-iscii-as - ISCII Assamese",
	57007: "x-iscii-or - ISCII Oriya",
	57008: "x-iscii-ka - ISCII Kannada",
	57009: "x-iscii-ma - ISCII Malayalam",
	57010: "x-iscii-gu - ISCII Gujarati",
	57011: "x-iscii-pa - ISCII Punjabi",
	65000: "utf-7 - Unicode (UTF-7)",
	65001: "utf-8 - Unicode (UTF-8)",
}