parser.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. import re
  2. import string
  3. import math
  4. from dataclasses import dataclass
  5. from collections import defaultdict
  6. import numpy as np
  7. import cpca
  8. from typing import List
  9. from core.line_parser import OcrResult
  10. @dataclass
  11. class RecItem:
  12. text: str = ''
  13. confidence: float = 0.
  14. def to_dict(self):
  15. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  16. class Parser(object):
  17. def __init__(self, ocr_results: List[List[OcrResult]]):
  18. self.result = ocr_results
  19. self.res = defaultdict(RecItem)
  20. self.keys = ["name", "id", "ethnicity", "gender", "birthday",
  21. "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
  22. for key in self.keys:
  23. self.res[key] = RecItem()
  24. def parse(self):
  25. return self.res
  26. class FrontParser(Parser):
  27. """
  28. 出生年月日
  29. """
  30. def __init__(self, ocr_results: List[List[OcrResult]]):
  31. Parser.__init__(self, ocr_results)
  32. def birth(self):
  33. if len(self.res["id"].text) == 18:
  34. # 342423 2001 0 2 1 5 6552
  35. # 012345 6789 10 11 12 13 14
  36. str_num = self.res["id"].text
  37. date = str_num[6:10] + "年" + str_num[10:12] + "月" + str_num[12:14] + "日"
  38. self.res["birthday"] = RecItem(date, self.res['id'].confidence)
  39. def card_no(self):
  40. """
  41. 身份证号码
  42. """
  43. for idx, row in enumerate(self.result):
  44. for r in row:
  45. txt = r.txt
  46. # 身份证号码
  47. if "X" in txt or "x" in txt:
  48. res = re.findall("\d*[X|x]", txt)
  49. else:
  50. res = re.findall("\d{16,18}", txt)
  51. if len(res) > 0:
  52. if len(res[0]) == 18:
  53. self.res["id"].text = res[0]
  54. self.res["id"].confidence = r.conf
  55. self.res["gender"].text = "男" if int(res[0][16]) % 2 else "女"
  56. self.res["gender"].confidence = r.conf
  57. if idx < 2:
  58. self.result = self.result[idx + 1:]
  59. self.result.reverse()
  60. else:
  61. self.result = self.result[:idx]
  62. return
  63. raise Exception('无法识别')
  64. def name(self):
  65. """
  66. 姓名
  67. """
  68. name_val = None
  69. conf = 0.
  70. for i in range(len(self.result)):
  71. res = self.result[i]
  72. for j in range(len(self.result[i])):
  73. txt = self.result[i][j].txt
  74. conf = self.result[i][j].conf
  75. mini_dis = [99999., 0]
  76. is_name = '姓' in txt or '名' in txt
  77. if is_name and len(res) > 1:
  78. for k in range(len(self.result[i])):
  79. if k == j: continue
  80. p = np.array(res[j].ct) - np.array(res[k].ct)
  81. min = math.hypot(p[0], p[1])
  82. if min < mini_dis[0]:
  83. mini_dis = [min, k]
  84. conf = self.result[i][k].conf
  85. name_val = self.result[i][mini_dis[1]].txt
  86. elif is_name and len(txt) > 3:
  87. conf = self.result[i][mini_dis[1]].conf
  88. name_val = txt.split("姓名")[-1]
  89. if name_val is None:
  90. raise Exception('无法识别')
  91. if len(name_val) < 5:
  92. self.res["name"] = RecItem(name_val, conf)
  93. else:
  94. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  95. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  96. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  97. for n in range(len(point_unicode)):
  98. point = re.findall(point_unicode[n], name_val)
  99. if len(point) != 0:
  100. name_list = name_val.split(point[0])
  101. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  102. return
  103. def national(self):
  104. """
  105. 性别 <-- id
  106. 民族汉
  107. """
  108. for i in range(len(self.result)):
  109. res = self.result[i]
  110. for j in range(len(self.result[i])):
  111. txt = self.result[i][j].txt
  112. conf = self.result[i][j].conf
  113. mini_dis = [99999., 0]
  114. # 分框
  115. if '族' in txt and len(txt) < 3:
  116. for k in range(len(self.result[i])):
  117. if k == j: continue
  118. p = np.array(res[j].ct) - np.array(res[k].ct)
  119. min = math.hypot(p[0], p[1])
  120. if min < mini_dis[0]:
  121. mini_dis = [min, k]
  122. self.res["ethnicity"] = RecItem(self.result[i][mini_dis[1]].txt, conf)
  123. return
  124. # 合框
  125. elif '族' in txt:
  126. self.res["ethnicity"] = RecItem(txt.split("族")[-1], conf)
  127. return
  128. # for nation in self.result[1]:
  129. # txt = nation.txt
  130. # conf = nation.conf
  131. # res = re.findall(".*族[\u4e00-\u9fa5]+", txt)
  132. #
  133. # if len(res) > 0:
  134. # self.res["ethnicity"] = RecItem(res[0].split("族")[-1], conf)
  135. # return
  136. def address(self):
  137. """
  138. 身份证地址
  139. """
  140. res = []
  141. confs = []
  142. for row in self.result[3:]:
  143. for r in row:
  144. txt = r.txt
  145. if (
  146. "住址" in txt
  147. or "址" in txt
  148. or "省" in txt
  149. or "市" in txt
  150. or "县" in txt
  151. or "街" in txt
  152. or "乡" in txt
  153. or "村" in txt
  154. or "镇" in txt
  155. or "区" in txt
  156. or "城" in txt
  157. or "组" in txt
  158. or "旗" in txt
  159. or "号" in txt
  160. or "户" in txt
  161. ):
  162. # if "住址" in txt or "省" in txt or "址" in txt:
  163. if "住址" in txt or "址" in txt:
  164. res.append(txt.split("址")[-1])
  165. else:
  166. res.append(txt)
  167. confs.append(r.conf)
  168. if len(res) > 0:
  169. self.res["address"] = RecItem("".join(res), np.mean(confs))
  170. self.split_addr()
  171. return
  172. raise Exception('无法识别')
  173. def split_addr(self):
  174. print(self.res['address'].text, '=======')
  175. conf = self.res["address"].confidence
  176. df = cpca.transform([self.res["address"].text])
  177. # print(df)
  178. province = df.iloc[0, 0]
  179. city = df.iloc[0, 1]
  180. region = df.iloc[0, 2]
  181. detail = df.iloc[0, 3]
  182. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  183. self.res["address_province"] = RecItem(province, conf)
  184. self.res["address_city"] = RecItem(city, conf)
  185. if detail and "旗" in detail:
  186. temp_region = []
  187. temp_region.insert(0, detail.split("旗")[0] + "旗")
  188. self.res["address_region"] = RecItem(temp_region[0], conf)
  189. self.res["address_detail"] = RecItem(detail.split("旗")[-1], conf)
  190. # elif detail and "旗" in detail:
  191. else:
  192. self.res["address_region"] = RecItem(region, conf)
  193. self.res["address_detail"] = RecItem(detail, conf)
  194. if not self.res['address_region'].text or not self.res['address_detail'].text:
  195. raise Exception('无法识别')
  196. def parse(self):
  197. self.card_no()
  198. self.name()
  199. self.national()
  200. self.birth()
  201. self.address()
  202. return {key: self.res[key].to_dict() for key in self.keys}
  203. class BackParser(Parser):
  204. def __init__(self, ocr_results: List[OcrResult]):
  205. Parser.__init__(self, ocr_results)
  206. def expire_date(self):
  207. for row in self.result:
  208. for r in row:
  209. txt = r.txt
  210. txt = txt.replace('.', '')
  211. res = re.findall('\d{8}\-\d{8}', txt)
  212. if res:
  213. self.res["expire_date"] = RecItem(res[0], r.conf)
  214. return
  215. res = re.findall('\d{8}\-长期', txt)
  216. if res:
  217. self.res["expire_date"] = RecItem(res[0], r.conf)
  218. return
  219. raise Exception('无法识别')
  220. def parse(self):
  221. self.expire_date()
  222. if not self.res["expire_date"].text:
  223. raise Exception("无法识别")
  224. return {key: self.res[key].to_dict() for key in self.keys}