# -*- coding: utf-8 -*-
"""Generate the initial kMandarin_8105.txt."""
- from merge_unihan import parse_pinyins, code_to_hanzi
def parse_china_x():
    """Yield the first field of each data line in the 8105-character table.

    Reads ``tools/china-8105-06062014.txt`` (resolved against the current
    working directory). Blank lines and lines starting with ``#`` are
    skipped. The file is decoded as UTF-8 explicitly so the result does not
    depend on the platform's default locale encoding
    (NOTE(review): assumes the data file is stored as UTF-8 -- confirm).

    Yields:
        str: the first whitespace-separated token of every remaining line.
    """
    with open('tools/china-8105-06062014.txt', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            # After strip() a whitespace-only line is empty, so split()[0]
            # below can never hit an empty list.
            if not line or line.startswith('#'):
                continue
            yield line.split()[0]
def parse_zdic():
    """Parse ``zdic.txt`` with ``parse_pinyins`` and return its result.

    The file is opened relative to the current working directory and decoded
    as UTF-8 explicitly, so parsing does not depend on the platform's default
    locale encoding (NOTE(review): assumes the file is stored as UTF-8).

    Returns:
        Whatever ``parse_pinyins`` returns for the open file object.
    """
    with open('zdic.txt', encoding='utf-8') as fp:
        # NOTE(review): the file is closed as soon as this returns, so
        # parse_pinyins presumably reads it eagerly -- confirm.
        return parse_pinyins(fp)
def parse_kmandain():
    """Parse ``pinyin.txt`` with ``parse_pinyins`` and return its result.

    The function name keeps its original spelling because the script entry
    point calls it under this name.

    The file is opened relative to the current working directory and decoded
    as UTF-8 explicitly, so parsing does not depend on the platform's default
    locale encoding (NOTE(review): assumes the file is stored as UTF-8).

    Returns:
        Whatever ``parse_pinyins`` returns for the open file object.
    """
    with open('pinyin.txt', encoding='utf-8') as fp:
        # NOTE(review): the file is closed as soon as this returns, so
        # parse_pinyins presumably reads it eagerly -- confirm.
        return parse_pinyins(fp)
def diff(kmandarin, zdic, commons):
    """Yield one output line per code in ``commons``.

    For each code the first entry of its reading list is used:

    * code in ``kmandarin`` -- emit the kmandarin reading; when ``zdic``
      also has the code and its first reading differs, append
      ``-> <zdic reading>`` to the trailing comment.
    * code only in ``zdic`` -- emit the zdic reading.
    * code in neither -- emit a commented-out placeholder line using ``<-``
      in place of the reading.

    Args:
        kmandarin: mapping from code to a sequence of readings.
        zdic: mapping from code to a sequence of readings.
        commons: iterable of codes to report on, in output order.

    Yields:
        str: a formatted line ``<code>: <reading> # <hanzi>[ -> <other>]``.
    """
    plain = '{0}: {1} # {2}'
    for code in commons:
        char = code_to_hanzi(code)

        if code not in kmandarin:
            # No kmandarin reading: fall back to zdic, else a placeholder.
            if code in zdic:
                yield plain.format(code, zdic[code][0], char)
            else:
                yield '# {0}: {1} # {2}'.format(code, '<-', char)
            continue

        reading = kmandarin[code][0]
        if code in zdic and reading != zdic[code][0]:
            # The two sources disagree: keep kmandarin, note zdic's reading.
            yield '{0}: {1} # {2} -> {3}'.format(
                code, reading, char, zdic[code][0]
            )
        else:
            yield plain.format(code, reading, char)
if __name__ == '__main__':
    # Load both reading tables first (same order as before), then stream the
    # report for the 8105 common characters to stdout, one line each.
    zdic_readings = parse_zdic()
    kmandarin_readings = parse_kmandain()
    report = diff(kmandarin_readings, zdic_readings, parse_china_x())
    for entry in report:
        print(entry)
|