parser.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. import re
  2. import string
  3. import math
  4. from dataclasses import dataclass
  5. from collections import defaultdict
  6. import numpy as np
  7. import cpca
  8. from typing import List
  9. from zhon.hanzi import punctuation
  10. from core.line_parser import OcrResult
  @dataclass
  class RecItem:
      """One recognised field: the extracted text and its confidence score."""

      text: str = ''
      confidence: float = 0.0

      def to_dict(self):
          """Return the item as a plain ``{"text", "confidence"}`` mapping.

          The text is stripped of surrounding whitespace and the confidence
          is passed through ``np.nan_to_num``.
          """
          cleaned_text = self.text.strip()
          safe_confidence = np.nan_to_num(self.confidence)
          return {"text": cleaned_text, "confidence": safe_confidence}
  class Parser:
      """Base class holding the OCR rows and one ``RecItem`` per output field."""

      def __init__(self, ocr_results: List[List[OcrResult]]):
          """Store the OCR rows and create an empty ``RecItem`` for every key.

          Args:
              ocr_results: OCR output as a list of rows, each row being a list
                  of ``OcrResult`` boxes.
          """
          self.keys = ["name", "id", "ethnicity", "gender", "birthday",
                       "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
          self.result = ocr_results
          self.res = defaultdict(RecItem)
          # Pre-populate so every known key is present even before parsing.
          self.res.update((key, RecItem()) for key in self.keys)

      def parse(self):
          """Return the field map unchanged; subclasses do the actual parsing."""
          return self.res
  class FrontParser(Parser):
      """Extract the personal fields from the OCR rows of an ID card's front side.

      ``parse`` runs the extractors in a fixed order: ``card_no`` first (it
      also narrows ``self.result`` to the rows the later steps scan), then
      ``name``, ``national``, ``birth`` and ``address``.
      """

      # Dot-like characters accepted as the separator inside a name; every one
      # of them is normalised to U+00B7 (MIDDLE DOT) in the stored name.
      _NAME_SEPARATORS = (
          "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
          "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
          "\u2E30", "\uFF0E", "\u30FB", "\uFF65",
          # Must use the 8-digit escape: "\u10101" is U+1010 followed by "1".
          "\U00010101",
      )
      _SEPARATOR_TABLE = dict.fromkeys(map(ord, _NAME_SEPARATORS), "\u00B7")

      # A box is treated as part of the address when it contains any of these.
      _ADDRESS_MARKERS = (
          "址", "省", "市", "县", "街", "乡", "村", "镇",
          "区", "城", "组", "旗", "号", "户", "室",
      )

      def __init__(self, ocr_results: List[List[OcrResult]]):
          Parser.__init__(self, ocr_results)

      @staticmethod
      def _nearest_index(row, anchor):
          """Return the index of the box in ``row`` closest to ``row[anchor]``.

          Distance is the Euclidean distance between the boxes' ``center``
          points. Returns ``None`` when the row contains no other box.
          """
          origin = np.array(row[anchor].center)
          best_idx = None
          best_dist = None
          for idx, box in enumerate(row):
              if idx == anchor:
                  continue
              delta = np.array(box.center) - origin
              dist = math.hypot(delta[0], delta[1])
              if best_dist is None or dist < best_dist:
                  best_idx, best_dist = idx, dist
          return best_idx

      @staticmethod
      def _as_text(value):
          """Return ``value`` if it is a string, otherwise an empty string."""
          return value if isinstance(value, str) else ''

      def birth(self):
          """Derive the birthday from characters 6-13 of an 18-character ID.

          Does nothing when the stored ID number is not exactly 18 characters.
          """
          id_item = self.res["id"]
          number = id_item.text
          if len(number) != 18:
              return
          # 342423 2001 02 15 6552  ->  digits 6..9 year, 10..11 month, 12..13 day
          date = f"{number[6:10]}年{number[10:12]}月{number[12:14]}日"
          self.res["birthday"] = RecItem(date, id_item.confidence)

      def card_no(self):
          """Find the ID number, derive the gender and narrow ``self.result``.

          The first box containing an 18-character candidate wins. The gender
          comes from the parity of the 17th character (odd -> "男").

          Raises:
              Exception: when no box contains an 18-character ID number.
          """
          for idx, row in enumerate(self.result):
              for box in row:
                  txt = box.txt
                  if "X" in txt or "x" in txt:
                      # "[Xx]" rather than "[X|x]": inside a character class
                      # "|" is a literal and would be accepted as check char.
                      candidates = re.findall(r"\d*[Xx]", txt)
                  else:
                      candidates = re.findall(r"\d{16,18}", txt)
                  # Take the first well-sized candidate instead of only the
                  # first match, so a stray earlier match cannot hide it.
                  number = next((c for c in candidates if len(c) == 18), None)
                  if number is None:
                      continue

                  self.res["id"].text = number
                  self.res["id"].confidence = box.conf
                  self.res["gender"].text = "男" if int(number[16]) % 2 else "女"
                  self.res["gender"].confidence = box.conf

                  if idx < 2:
                      # NOTE(review): ID number in the first two rows --
                      # presumably an upside-down scan; keep the rows after it
                      # in reverse order. Confirm against the OCR pipeline.
                      self.result = self.result[idx + 1:]
                      self.result.reverse()
                  else:
                      self.result = self.result[:idx]
                  # NOTE(review): debug output; prints card contents to stdout.
                  print('---------------------')
                  for remaining in self.result:
                      print(remaining)
                  print('---------------------')
                  return
          raise Exception('无法识别')

      def name(self):
          """Extract the holder's name and store it under ``"name"``.

          A box containing "姓" or "名" is treated as the label. If its row has
          other boxes, the nearest one is the name; otherwise, when the box is
          longer than three characters, the text after "姓名" is the name.
          When several boxes qualify, the last one scanned wins.

          The stored confidence is that of the box the name text came from.

          Raises:
              Exception: when no name candidate is found.
          """
          name_val = None
          name_conf = 0.
          for row in self.result:
              for j, box in enumerate(row):
                  txt = box.txt
                  if '姓' not in txt and '名' not in txt:
                      continue
                  if len(row) > 1:
                      # Label and value were recognised as separate boxes.
                      nearest = self._nearest_index(row, j)
                      name_val = row[nearest].txt
                      name_conf = row[nearest].conf
                  elif len(txt) > 3:
                      # Label and value were recognised as one box.
                      name_val = txt.split("姓名")[-1]
                      name_conf = box.conf

          if name_val is None:
              raise Exception('无法识别')

          if len(name_val) < 5:
              self.res["name"] = RecItem(name_val, name_conf)
              return

          # Longer names are only accepted when they contain a separator dot;
          # all separator variants are normalised and every part is kept.
          # NOTE(review): a name of 5+ characters without a separator leaves
          # the "name" field empty without raising -- confirm this is intended.
          if any(sep in name_val for sep in self._NAME_SEPARATORS):
              normalized = name_val.translate(self._SEPARATOR_TABLE)
              self.res['name'] = RecItem(normalized, name_conf)

      def national(self):
          """Extract the ethnicity and store it under ``"ethnicity"``.

          A box containing "族" that is shorter than three characters is
          treated as a bare label and the nearest other box in the row supplies
          the value; a longer box is treated as label and value merged, and
          the text after the last "族" is used. The first usable box wins.
          Nothing is stored (and nothing raised) when no box qualifies.
          """
          for row in self.result:
              for j, box in enumerate(row):
                  txt = box.txt
                  if '族' not in txt:
                      continue
                  if len(txt) < 3:
                      nearest = self._nearest_index(row, j)
                      if nearest is None:
                          # Label alone in its row: there is no value box, so
                          # do not store the label text as the ethnicity.
                          continue
                      value_box = row[nearest]
                      self.res["ethnicity"] = RecItem(value_box.txt, value_box.conf)
                      return
                  self.res["ethnicity"] = RecItem(txt.split("族")[-1], box.conf)
                  return

      def address(self):
          """Assemble the address from the remaining rows and split it.

          Boxes mentioning gender, birth or ethnicity are ignored. Punctuation
          is stripped, and every box containing one of ``_ADDRESS_MARKERS`` is
          concatenated in scan order. The confidence is the mean of the used
          boxes' confidences.

          Raises:
              Exception: when no address fragment is found, or when
                  ``split_addr`` cannot determine region and detail.
          """
          strip_punctuation = str.maketrans('', '', punctuation)
          parts = []
          confs = []
          # NOTE(review): the first two rows are skipped -- presumably the name
          # and gender/ethnicity rows; confirm against the card layout.
          for row in self.result[2:]:
              for box in row:
                  txt = box.txt
                  if '性别' in txt or '出生' in txt or '民族' in txt:
                      continue
                  txt = txt.translate(strip_punctuation)
                  if not any(marker in txt for marker in self._ADDRESS_MARKERS):
                      continue
                  # Drop the "住址" label when it is fused with the address.
                  parts.append(txt.split("址")[-1] if "址" in txt else txt)
                  confs.append(box.conf)

          if not parts:
              raise Exception('无法识别')
          self.res["address"] = RecItem("".join(parts), np.mean(confs))
          self.split_addr()

      def split_addr(self):
          """Split the stored address into province, city, region and detail.

          Uses ``cpca.transform`` and reads the first four columns of its
          result. When no region was resolved and the detail contains "旗",
          the text up to and including the first "旗" becomes the region.

          Raises:
              Exception: when the region or the detail ends up empty.
          """
          print(self.res['address'].text, '=======')
          conf = self.res["address"].confidence
          df = cpca.transform([self.res["address"].text])
          province = df.iloc[0, 0]
          city = df.iloc[0, 1]
          region = df.iloc[0, 2]
          detail = df.iloc[0, 3]
          print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')

          # Missing parts are stored as '' so RecItem.to_dict() can strip them.
          province = self._as_text(province)
          city = self._as_text(city)
          region = self._as_text(region)
          detail = self._as_text(detail)

          if not region and "旗" in detail:
              # Split at the FIRST "旗" only, so a later "旗" (e.g. in a street
              # name) stays in the detail instead of truncating it.
              banner, _, detail = detail.partition("旗")
              region = banner + "旗"

          self.res["address_province"] = RecItem(province, conf)
          self.res["address_city"] = RecItem(city, conf)
          self.res["address_region"] = RecItem(region, conf)
          self.res["address_detail"] = RecItem(detail, conf)
          if not region or not detail:
              raise Exception('无法识别')

      def parse(self):
          """Run all extractors and return ``{key: {"text", "confidence"}}``.

          Raises:
              Exception: propagated from any extractor that fails.
          """
          self.card_no()
          self.name()
          self.national()
          self.birth()
          self.address()
          return {key: self.res[key].to_dict() for key in self.keys}
  class BackParser(Parser):
      """Extract the validity period from the OCR rows of an ID card's back side."""

      def __init__(self, ocr_results: List[List[OcrResult]]):
          Parser.__init__(self, ocr_results)

      def expire_date(self):
          """Find the validity period and store it under ``"expire_date"``.

          Dots are removed from each box before matching. Two shapes are
          recognised, tried in this order per box:

          * eight digits, a hyphen and four digits: the match is stored with
            characters 4-7 of the match (the start date's month and day)
            appended after the four digits;
          * eight digits, a hyphen and "长期": stored as matched.

          Raises:
              Exception: when no box matches either shape.
          """
          for row in self.result:
              for box in row:
                  txt = box.txt.replace('.', '')
                  # Raw strings: "\d" and "\-" are invalid escape sequences in
                  # ordinary string literals.
                  dated = re.findall(r'\d{8}\-\d{4}', txt)
                  if dated:
                      period = dated[0]
                      self.res["expire_date"] = RecItem(period + period[4:8], box.conf)
                      return
                  long_term = re.findall(r'\d{8}\-长期', txt)
                  if long_term:
                      self.res["expire_date"] = RecItem(long_term[0], box.conf)
                      return
          raise Exception('无法识别')

      def parse(self):
          """Run ``expire_date`` and return ``{key: {"text", "confidence"}}``.

          Raises:
              Exception: when the validity period cannot be recognised.
          """
          self.expire_date()
          if not self.res["expire_date"].text:
              raise Exception("无法识别")
          return {key: self.res[key].to_dict() for key in self.keys}