parse_pinyin.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. # -*- coding: utf-8 -*-
import functools
import operator
import re
from typing import Iterator, Tuple
  5. def re_match_pinyin_line(kind):
  6. return re.compile(
  7. r'^U\+(?P<code>[0-9A-Z]+)\t{}\t(?P<pinyin>.+)$'.format(kind)
  8. )
# Sub-pattern for a single reading: a run of characters that are not digits,
# dots or commas. Whitespace is therefore accepted too, which is why parse()
# strips every piece it extracts.
PINYIN = r'[^\d\.,]+'

# The patterns below are compiled with re.X, so the line breaks inside the
# literals are ignored. %-formatting is used where the pattern contains
# ``{n}`` quantifiers, which str.format() would otherwise treat as fields.

# kHanyuPinyin value: one or more comma-separated ``ddddd.dd0`` positions, a
# colon, then comma-separated readings. Group 1 captures every reading but
# the last (each with its trailing comma); group 2 captures the last reading.
re_khanyupinyin = re.compile(r'''
(?:\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:
((?:%(pinyin)s,)*)
(%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)

# kMandarin value: a bare reading. The two empty groups make findall()
# return tuples instead of plain strings; parse() iterates over each match
# element-wise and relies on that.
re_kmandarin = re.compile(r'''
()()
({pinyin})
'''.format(pinyin=PINYIN), re.X)

# kXHC1983 value: one or more comma-separated ``dddd.ddd`` positions (each
# optionally followed by ``*``), a colon, then one reading in group 3.
# The leading empty groups serve the same purpose as in re_kmandarin.
re_kxhc1983 = re.compile(r'''
()()[0-9]{4}\.[0-9]{3}\*?
(?:,[0-9]{4}\.[0-9]{3}\*?)*:
(%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)

# kHanyuPinlu value: a reading (group 3) immediately followed by a
# parenthesised integer. Leading empty groups as above.
re_khanyupinlu = re.compile(r'''
()()({pinyin})\([0-9]+\)
'''.format(pinyin=PINYIN), re.X)

# Maps the field name accepted by parse() to the regex that extracts the
# readings from that field's value.
re_kinds_map = {
    'kHanyuPinyin': re_khanyupinyin,
    'kMandarin': re_kmandarin,
    'kXHC1983': re_kxhc1983,
    'kHanyuPinlu': re_khanyupinlu,
}
  33. def remove_dup_items(lst):
  34. new_list = []
  35. for item in lst:
  36. if item not in new_list:
  37. new_list.append(item)
  38. return new_list
  39. def parse(lines, kind='kHanyuPinyin', ignore_prefix='#') -> str:
  40. re_line = re_match_pinyin_line(kind)
  41. re_pinyin = re_kinds_map[kind]
  42. for line in lines:
  43. line = line.strip()
  44. if line.startswith(ignore_prefix):
  45. continue
  46. match = re_line.match(line)
  47. if match is None:
  48. continue
  49. code = match.group('code')
  50. raw_pinyin = match.group('pinyin')
  51. raw_pinyins = re_pinyin.findall(raw_pinyin)
  52. # 处理有三个或三个以上拼音的情况,此时 raw_pinyins 类似
  53. # [(' xī,', 'lǔ '), (' lǔ,', 'xī')] or [('shú,dú,', 'tù')]
  54. for n, values in enumerate(raw_pinyins):
  55. value = []
  56. for v in values:
  57. value.extend(v.split(','))
  58. raw_pinyins[n] = value
  59. pinyins = functools.reduce(
  60. operator.add, raw_pinyins
  61. )
  62. pinyins = [x.strip() for x in pinyins if x.strip()]
  63. pinyins = remove_dup_items(pinyins)
  64. pinyin = ','.join(pinyins)
  65. yield code, pinyin
  66. def save_data(pinyins, writer):
  67. for code, pinyin in pinyins:
  68. gl = {}
  69. exec('hanzi=chr(0x{})'.format(code), gl)
  70. hanzi = gl['hanzi']
  71. line = 'U+{code}: {pinyin} # {hanzi}\n'.format(
  72. code=code, pinyin=pinyin, hanzi=hanzi
  73. )
  74. writer.write(line)
  75. if __name__ == '__main__':
  76. with open('Unihan_Readings.txt') as fp:
  77. for kind in ('kHanyuPinyin', 'kMandarin',
  78. 'kHanyuPinlu', 'kXHC1983'):
  79. fp.seek(0)
  80. with open('{}.txt'.format(kind), 'w') as writer:
  81. pinyins = parse(fp.readlines(), kind=kind)
  82. save_data(pinyins, writer)