gen_gb_pua.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. # -*- coding: utf-8 -*-
  2. import re
  3. import sys
  4. sys.path.append('.')
  5. from merge_unihan import parse_pinyins
  6. def get_pinyins(file_path):
  7. with open(file_path) as fp:
  8. return parse_pinyins(fp)
  9. def get_pua_map():
  10. text = '''
  11. # A6D9 E78D () FE10 (︐)
  12. # A6DA E78E () FE12 (︒)
  13. # A6DB E78F () FE11 (︑)
  14. # A6DC E790 () FE13 (︓)
  15. # A6DD E791 () FE14 (︔)
  16. # A6DE E792 () FE15 (︕)
  17. # A6DF E793 () FE16 (︖)
  18. # A6EC E794 () FE17 (︗)
  19. # A6ED E795 () FE18 (︘)
  20. # A8BC E7C7 () 1E3F (ḿ) 1E3F (ḿ)
  21. # A8BF E7C8 () 01F9 (ǹ) 01F9 (ǹ)
  22. # A989 E7E7 () 303E (〾) 303E (〾)
  23. # A98A E7E8 () 2FF0 (⿰) 2FF0 (⿰)
  24. # A98B E7E9 () 2FF1 (⿱) 2FF1 (⿱)
  25. # A98C E7EA () 2FF2 (⿲) 2FF2 (⿲)
  26. # A98D E7EB () 2FF3 (⿳) 2FF3 (⿳)
  27. # A98E E7EC () 2FF4 (⿴) 2FF4 (⿴)
  28. # A98F E7ED () 2FF5 (⿵) 2FF5 (⿵)
  29. # A990 E7EE () 2FF6 (⿶) 2FF6 (⿶)
  30. # A991 E7EF () 2FF7 (⿷) 2FF7 (⿷)
  31. # A992 E7F0 () 2FF8 (⿸) 2FF8 (⿸)
  32. # A993 E7F1 () 2FF9 (⿹) 2FF9 (⿹)
  33. # A994 E7F2 () 2FFA (⿺) 2FFA (⿺)
  34. # A995 E7F3 () 2FFB (⿻) 2FFB (⿻)
  35. FE50 E815 () 2E81 (⺁) 2E81 (⺁)
  36. FE51 E816 () E816 () 20087 (𠂇)
  37. FE52 E817 () E817 () 20089 (𠂉)
  38. FE53 E818 () E818 () 200CC (𠃌)
  39. FE54 E819 () 2E84 (⺄) 2E84 (⺄)
  40. FE55 E81A () 3473 (㑳) 3473 (㑳)
  41. FE56 E81B () 3447 (㑇) 3447 (㑇)
  42. FE57 E81C () 2E88 (⺈) 2E88 (⺈)
  43. FE58 E81D () 2E8B (⺋) 2E8B (⺋)
  44. FE59 E81E () E81E () 9FB4 (龴)
  45. FE5A E81F () 359E (㖞) 359E (㖞)
  46. FE5B E820 () 361A (㘚) 361A (㘚)
  47. FE5C E821 () 360E (㘎) 360E (㘎)
  48. FE5D E822 () 2E8C (⺌) 2E8C (⺌)
  49. FE5E E823 () 2E97 (⺗) 2E97 (⺗)
  50. FE5F E824 () 396E (㥮) 396E (㥮)
  51. FE60 E825 () 3918 (㤘) 3918 (㤘)
  52. FE61 E826 () E826 () 9FB5 (龵)
  53. FE62 E827 () 39CF (㧏) 39CF (㧏)
  54. FE63 E828 () 39DF (㧟) 39DF (㧟)
  55. FE64 E829 () 3A73 (㩳) 3A73 (㩳)
  56. FE65 E82A () 39D0 (㧐) 39D0 (㧐)
  57. FE66 E82B () E82B () 9FB6 (龶)
  58. FE67 E82C () E82C () 9FB7 (龷)
  59. FE68 E82D () 3B4E (㭎) 3B4E (㭎)
  60. FE69 E82E () 3C6E (㱮) 3C6E (㱮)
  61. FE6A E82F () 3CE0 (㳠) 3CE0 (㳠)
  62. FE6B E830 () 2EA7 (⺧) 2EA7 (⺧)
  63. FE6C E831 () E831 () 215D7 (𡗗)
  64. FE6D E832 () E832 () 9FB8 (龸)
  65. FE6E E833 () 2EAA (⺪) 2EAA (⺪)
  66. FE6F E834 () 4056 (䁖) 4056 (䁖)
  67. FE70 E835 () 415F (䅟) 415F (䅟)
  68. FE71 E836 () 2EAE (⺮) 2EAE (⺮)
  69. FE72 E837 () 4337 (䌷) 4337 (䌷)
  70. FE73 E838 () 2EB3 (⺳) 2EB3 (⺳)
  71. FE74 E839 () 2EB6 (⺶) 2EB6 (⺶)
  72. FE75 E83A () 2EB7 (⺷) 2EB7 (⺷)
  73. FE76 E83B () E83B () 2298F (𢦏)
  74. FE77 E83C () 43B1 (䎱) 43B1 (䎱)
  75. FE78 E83D () 43AC (䎬) 43AC (䎬)
  76. FE79 E83E () 2EBB (⺻) 2EBB (⺻)
  77. FE7A E83F () 43DD (䏝) 43DD (䏝)
  78. FE7B E840 () 44D6 (䓖) 44D6 (䓖)
  79. FE7C E841 () 4661 (䙡) 4661 (䙡)
  80. FE7D E842 () 464C (䙌) 464C (䙌)
  81. FE7E E843 () E843 () 9FB9 (龹)
  82. FE80 E844 () 4723 (䜣) 4723 (䜣)
  83. FE81 E845 () 4729 (䜩) 4729 (䜩)
  84. FE82 E846 () 477C (䝼) 477C (䝼)
  85. FE83 E847 () 478D (䞍) 478D (䞍)
  86. FE84 E848 () 2ECA (⻊) 2ECA (⻊)
  87. FE85 E849 () 4947 (䥇) 4947 (䥇)
  88. FE86 E84A () 497A (䥺) 497A (䥺)
  89. FE87 E84B () 497D (䥽) 497D (䥽)
  90. FE88 E84C () 4982 (䦂) 4982 (䦂)
  91. FE89 E84D () 4983 (䦃) 4983 (䦃)
  92. FE8A E84E () 4985 (䦅) 4985 (䦅)
  93. FE8B E84F () 4986 (䦆) 4986 (䦆)
  94. FE8C E850 () 499F (䦟) 499F (䦟)
  95. FE8D E851 () 499B (䦛) 499B (䦛)
  96. FE8E E852 () 49B7 (䦷) 49B7 (䦷)
  97. FE8F E853 () 49B6 (䦶) 49B6 (䦶)
  98. FE90 E854 () E854 () 9FBA (龺)
  99. FE91 E855 () E855 () 241FE (𤇾)
  100. FE92 E856 () 4CA3 (䲣) 4CA3 (䲣)
  101. FE93 E857 () 4C9F (䲟) 4C9F (䲟)
  102. FE94 E858 () 4CA0 (䲠) 4CA0 (䲠)
  103. FE95 E859 () 4CA1 (䲡) 4CA1 (䲡)
  104. FE96 E85A () 4C77 (䱷) 4C77 (䱷)
  105. FE97 E85B () 4CA2 (䲢) 4CA2 (䲢)
  106. FE98 E85C () 4D13 (䴓) 4D13 (䴓)
  107. FE99 E85D () 4D14 (䴔) 4D14 (䴔)
  108. FE9A E85E () 4D15 (䴕) 4D15 (䴕)
  109. FE9B E85F () 4D16 (䴖) 4D16 (䴖)
  110. FE9C E860 () 4D17 (䴗) 4D17 (䴗)
  111. FE9D E861 () 4D18 (䴘) 4D18 (䴘)
  112. FE9E E862 () 4D19 (䴙) 4D19 (䴙)
  113. FE9F E863 () 4DAE (䶮) 4DAE (䶮)
  114. FEA0 E864 () E864 () 9FBB (龻)
  115. '''.strip()
  116. for line in text.split('\n'):
  117. if line.startswith('#'):
  118. continue
  119. gb, gbk, gb_18030, unicode_4_1 = line.split('\t')
  120. # print(gb, gbk, gb_18030, unicode_4_1)
  121. # print(get_han_point(gbk), get_han_point(unicode_4_1))
  122. yield get_han_point(gbk), get_han_point(unicode_4_1)
  123. def get_han_point(text):
  124. if not text:
  125. return '', ''
  126. regex = re.compile(r'(?P<point>[A-Z0-9]+) \((?P<han>[^\)]+)\)')
  127. result = regex.findall(text)
  128. return result[0]
  129. def point_to_u_point(point):
  130. point = point.upper()
  131. if not point.startswith('U+'):
  132. point = 'U+' + point
  133. return point
  134. def gen_pua_data(gbk, unicode_4_1, pinyin_map):
  135. gbk_point, gbk_han = gbk
  136. gbk_point = point_to_u_point(gbk_point)
  137. unicode_4_1_point, unicode_4_1_han = unicode_4_1
  138. unicode_4_1_point = point_to_u_point(unicode_4_1_point)
  139. pinyins = ','.join(pinyin_map.get(unicode_4_1_point, []))
  140. prefix = ''
  141. if not pinyins:
  142. prefix = '# '
  143. return (
  144. '{prefix}{gbk_point}: {pinyins} # {gbk_han} '
  145. 'Unihan: {unicode_4_1_point} {unicode_4_1_han}'
  146. ).format(**locals())
  147. if __name__ == '__main__':
  148. pinyin_map = get_pinyins('pinyin.txt')
  149. print('# GBK/GB 18030 PUA 映射\n'
  150. '# 详见:https://zh.wikipedia.org/wiki/GB_18030#PUA')
  151. for gbk, unicode_4_1 in get_pua_map():
  152. print(gen_pua_data(gbk, unicode_4_1, pinyin_map))