# -*- coding: utf-8 -*-
"""Generate the initial kMandarin_8105.txt."""

from merge_unihan import parse_pinyins, code_to_hanzi


  4. def parse_china_x():
  5. with open('tools/china-8105-06062014.txt') as fp:
  6. for line in fp:
  7. line = line.strip()
  8. if line.startswith('#') or not line:
  9. continue
  10. yield line.split()[0]
  11. def parse_zdic():
  12. with open('zdic.txt') as fp:
  13. return parse_pinyins(fp)
  14. def parse_kmandain():
  15. with open('pinyin.txt') as fp:
  16. return parse_pinyins(fp)
  17. def diff(kmandarin, zdic, commons):
  18. for key in commons:
  19. hanzi = code_to_hanzi(key)
  20. if key in kmandarin:
  21. value = kmandarin[key][0]
  22. if key in zdic and value != zdic[key][0]:
  23. yield '{0}: {1} # {2} -> {3}'.format(
  24. key, value, hanzi, zdic[key][0]
  25. )
  26. else:
  27. yield '{0}: {1} # {2}'.format(key, value, hanzi)
  28. elif key in zdic:
  29. value = zdic[key][0]
  30. yield '{0}: {1} # {2}'.format(key, value, hanzi)
  31. else:
  32. yield '# {0}: {1} # {2}'.format(key, '<-', hanzi)
  33. if __name__ == '__main__':
  34. zdic = parse_zdic()
  35. kmandarin = parse_kmandain()
  36. commons = parse_china_x()
  37. lst = diff(kmandarin, zdic, commons)
  38. for x in lst:
  39. print(x)