# parser.py (12 KB)
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np

import address_correction.fix_address as fa
from core.line_parser import OcrResult
  9. @dataclass
  10. class RecItem:
  11. text: str = ''
  12. confidence: float = 0.
  13. def to_dict(self):
  14. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  15. # 父类
  16. class Parser(object):
  17. def __init__(self, ocr_results: List[List[OcrResult]], ocr_line: List[List[OcrResult]]):
  18. self.result = ocr_results
  19. self.result_line = ocr_line
  20. self.bool_ranks = bool(ocr_results)
  21. self.id_index = None
  22. self.res = defaultdict(RecItem)
  23. self.keys = ['type', "address", 'address_province', 'address_city', 'address_region', 'address_detail',
  24. 'name', 'id', 'gender',
  25. # 出生地
  26. 'birthplace', 'birthplace_province', 'birthplace_city', 'birthplace_region',
  27. # 籍贯
  28. 'native_place', 'native_place_province', 'native_place_city', 'native_place_region',
  29. 'blood_type', 'religion']
  30. for key in self.keys:
  31. self.res[key] = RecItem()
  32. ch = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039\u0041-\u005a\u0061-\u007a]')
  33. if self.bool_ranks:
  34. for item in self.result:
  35. tail = ['', 1.]
  36. for k in range(len(item)):
  37. item[k].txt = ''.join(re.findall(ch, item[k].txt))
  38. tail[0] = tail[0] + item[k].txt
  39. tail[1] = tail[1] + item[k].conf
  40. tail[1] = (tail[1] - 1.) / len(item)
  41. item.append(tail)
  42. for item in self.result_line:
  43. tail = ['', 1.]
  44. for k in range(len(item)):
  45. item[k].txt = ''.join(re.findall(ch, item[k].txt))
  46. tail[0] = tail[0] + item[k].txt
  47. tail[1] = tail[1] + item[k].conf
  48. tail[1] = (tail[1] - 1.) / len(item)
  49. item.append(tail)
  50. def parse(self):
  51. return self.res
  52. def split_addr(self, place: str):
  53. if place == "birth":
  54. place = "birthplace"
  55. elif place == "native":
  56. place = "native_place"
  57. elif place == "address":
  58. place = "address"
  59. print(self.res[place].text, '=======')
  60. conf = self.res[place].confidence
  61. df = cpca.transform([self.res[place].text])
  62. if df.adcode[0] is None:
  63. self.res[f"{place}_detail"] = RecItem(self.res[place].text, self.res[place].confidence)
  64. return
  65. df = df.replace([None], [''])
  66. province = df.iloc[0, 0] or ''
  67. city = df.iloc[0, 1] or ''
  68. region = df.iloc[0, 2] or ''
  69. detail = df.iloc[0, 3] or ''
  70. # 修复地名更新
  71. if len(detail) == 1: detail = ''
  72. "===========地址纠错============"
  73. province = fa.fix_first_level(province)
  74. # 1. 当province有值、city为空、region为空,detail为空不走地址纠错 即 '上海市'
  75. # 2. 当province有值、city有值、region有值,不走地址纠错 即 '三级行政单位都识别出来了'
  76. unfix_add_1 = province != '' and city == '' and region == '' and detail == ''
  77. unfix_add_2 = province != '' and city != '' and region == '' and detail == ''
  78. unfix_add_3 = province != '' and city != '' and region != ''
  79. if (unfix_add_1 or unfix_add_2 or unfix_add_3) is False:
  80. if place == "address":
  81. # 1. 第二级单位未识别出来 ---- 切三个字符去纠错
  82. if city == '':
  83. if province == "内蒙古自治区":
  84. may_region = detail
  85. elif '县' in detail:
  86. may_region = detail.split('县')[0] + '县'
  87. elif '区' in detail:
  88. may_region = detail.split('区')[0] + '区'
  89. elif '族' in detail:
  90. may_region = detail.split('族')[0] + '族'
  91. else:
  92. may_region = detail[:3]
  93. province, city = fa.fix_second_level(province, may_region)
  94. if city in detail:
  95. detail = detail[len(city):]
  96. province, city, region = fa.fix_third_level(province, city, may_region)
  97. detail = detail[len(region):]
  98. if region == '':
  99. province, city, region = fa.fix_third_level(province, city, detail if province == "内蒙古自治区" or '族' in detail else detail[:3])
  100. detail = detail[len(region):]
  101. elif city == '':
  102. if province == "内蒙古自治区":
  103. may_region = detail
  104. elif '市' in detail:
  105. may_region = detail.split('市')[0] + '市'
  106. elif '族' in detail:
  107. may_region = detail.split('族')[0] + '族'
  108. elif '县' in detail:
  109. may_region = detail.split('县')[0] + '县'
  110. elif '区' in detail:
  111. may_region = detail.split('区')[0] + '区'
  112. else:
  113. may_region = detail[:3]
  114. province, city = fa.fix_second_level(province, may_region)
  115. if city in detail:
  116. detail = detail[len(city):]
  117. province, city, region = fa.fix_third_level(province, city, may_region)
  118. detail = detail[len(region):]
  119. elif region == '':
  120. province, city, region = fa.fix_third_level(province, city, detail if province == "内蒙古自治区" or '族' in detail else detail[:3])
  121. detail = ''
  122. else:
  123. detail = ''
  124. province, city, region = fa.fix_third_level(province, city, region)
  125. "===========地址纠错============"
  126. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  127. self.res[f"{place}_province"] = RecItem(province, conf)
  128. self.res[f"{place}_city"] = RecItem(city, conf)
  129. if detail and "旗" in detail and "红旗" not in detail:
  130. temp_region = []
  131. temp_region.insert(0, detail.split("旗")[0] + "旗")
  132. self.res[f"{place}_region"] = RecItem(temp_region[0], conf)
  133. self.res[f"{place}_detail"] = RecItem(detail.split("旗")[-1], conf)
  134. else:
  135. self.res[f"{place}_region"] = RecItem(region, conf)
  136. self.res[f"{place}_detail"] = RecItem(detail, conf)
  137. self.res[place].text = province + city + region + detail
  138. # 1 户口本首页
  139. class FrontRegBookParser(Parser):
  140. def type_(self):
  141. """
  142. 户别
  143. """
  144. def get_txt(bool_ranks):
  145. if bool_ranks:
  146. gtxt = fa.Correction(self.result[1][0].txt)
  147. gconf = self.result[1][0].conf
  148. return gtxt, gconf
  149. else:
  150. for i in self.result_line[:-1]:
  151. if '户别' in i[-1][0] or i[-1][0][0] == '别':
  152. gtxt = i[-1][0].split('别')[-1].split('户主')[0]
  153. gconf = i[-1][1]
  154. gtxt = re.sub(r'[0-9]+', '', gtxt)
  155. return gtxt, gconf
  156. txt, conf = get_txt(self.bool_ranks)
  157. if txt == '':
  158. txt, conf = get_txt(False)
  159. self.res["type"] = RecItem(fa.Correction(txt), conf)
  160. return
  161. def address(self):
  162. """
  163. 首页住址
  164. """
  165. address_txt = ''
  166. address_conf = 0.
  167. if self.bool_ranks:
  168. address = self.result[0][0]
  169. address_txt = address.txt
  170. address_conf = address.conf
  171. else:
  172. for i in self.result_line[:-1]:
  173. if '住址' in i[-1][0]:
  174. address_txt = i[-1][0].split('住址')[-1]
  175. address_conf = i[-1][1]
  176. self.res["address"] = RecItem(fa.Correction(address_txt), address_conf)
  177. self.split_addr('address')
  178. # 存入
  179. def parse(self):
  180. self.type_()
  181. self.address()
  182. return {key: self.res[key].to_dict() for key in self.keys}
  183. # 0 常驻人口页
  184. class PeopleRegBookParser(Parser):
  185. def full_name(self):
  186. """
  187. 姓名
  188. 属 result[1]
  189. 位 0
  190. """
  191. name = self.result[1][0]
  192. name_val = fa.Correction(name.txt)
  193. conf = name.conf
  194. if len(name_val) < 5:
  195. self.res["name"] = RecItem(name_val, conf)
  196. else:
  197. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  198. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  199. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  200. for n in range(len(point_unicode)):
  201. point = re.findall(point_unicode[n], name_val)
  202. if len(point) != 0:
  203. name_list = name_val.split(point[0])
  204. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  205. return
  206. else:
  207. self.res["name"] = RecItem(name_val, conf)
  208. return
  209. def card_no(self):
  210. """
  211. 身份证号码
  212. 属 0/1都可
  213. """
  214. for i in range(len(self.result[1][:-1])):
  215. res = self.result[1][i]
  216. txt = res.txt
  217. conf = res.conf
  218. if "X" in txt or "x" in txt:
  219. code_val = re.findall("\d*[X|x]", txt)
  220. else:
  221. code_val = re.findall("\d{10,18}", txt)
  222. if len(code_val) > 0:
  223. id_num = fa.Correction(str(code_val[0]))
  224. self.id_index = i
  225. if len(id_num) == 18 or len(id_num) > 10:
  226. self.res["id"].text = id_num
  227. self.res["id"].confidence = conf
  228. if len(id_num) == 18:
  229. self.res["gender"].text = ("男" if int(id_num[16]) % 2 else "女")
  230. self.res["gender"].confidence = conf
  231. return
  232. raise Exception('身份证号识别出错')
  233. def blood_type(self):
  234. """
  235. 血型
  236. 属 result[1]
  237. """
  238. blood_val = ''
  239. conf = 0.
  240. for res in self.result[1][:-1]:
  241. txt = res.txt
  242. conf = res.conf
  243. if "血型" in txt:
  244. blood_val = txt.split("血型")[-1]
  245. self.res["blood_type"] = RecItem(blood_val, conf)
  246. def religion(self):
  247. """
  248. 宗教信仰
  249. """
  250. religion_val = ''
  251. for res in self.result[1][:-1]:
  252. txt = res.txt
  253. conf = res.conf
  254. if "宗教信仰是" in txt:
  255. religion_val = txt.split("宗教信仰是")[-1]
  256. self.res["religion"] = RecItem(fa.Correction(religion_val), conf)
  257. def birthplace(self):
  258. """
  259. 出生地
  260. """
  261. birth_place = self.result[1][1]
  262. birth_place_txt = birth_place.txt
  263. birth_place_conf = birth_place.conf
  264. if birth_place_txt:
  265. self.res["birthplace"] = RecItem(fa.Correction(birth_place_txt), birth_place_conf)
  266. self.split_addr("birth")
  267. def native_place(self):
  268. """
  269. 籍贯
  270. """
  271. native_place = self.result[1][2]
  272. native_place_txt = native_place.txt
  273. native_place_conf = native_place.conf
  274. self.res["native_place"] = RecItem(fa.Correction(native_place_txt), native_place_conf)
  275. self.split_addr("native")
  276. def parse(self):
  277. self.full_name()
  278. self.card_no()
  279. self.blood_type()
  280. self.religion()
  281. self.birthplace()
  282. self.native_place()
  283. return {k: self.res[k].to_dict() for k in self.keys}