parser.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
import logging
import math
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np

from core.line_parser import OcrResult
  10. @dataclass
  11. class RecItem:
  12. text: str = ''
  13. confidence: float = 0.
  14. def to_dict(self):
  15. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  16. class Parser(object):
  17. def __init__(self, ocr_results: List[List[OcrResult]]):
  18. self.result = ocr_results
  19. self.res = defaultdict(RecItem)
  20. self.keys = ["name", "id", "ethnicity", "gender", "birthday",
  21. "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
  22. for key in self.keys:
  23. self.res[key] = RecItem()
  24. def parse(self):
  25. return self.res
  26. class FrontParser(Parser):
  27. """
  28. 出生年月日
  29. """
  30. def __init__(self, ocr_results: List[List[OcrResult]]):
  31. Parser.__init__(self, ocr_results)
  32. def birth(self):
  33. if len(self.res["id"].text) == 18:
  34. # 342423 2001 0 2 1 5 6552
  35. # 012345 6789 10 11 12 13 14
  36. str_num = self.res["id"].text
  37. date = str_num[6:10] + "年" + str_num[10:12] + "月" + str_num[12:14] + "日"
  38. self.res["birthday"] = RecItem(date, self.res['id'].confidence)
  39. def card_no(self):
  40. """
  41. 身份证号码
  42. """
  43. for idx, row in enumerate(self.result):
  44. for r in row:
  45. txt = r.txt
  46. # 身份证号码
  47. if "X" in txt or "x" in txt:
  48. res = re.findall("\d*[X|x]", txt)
  49. else:
  50. res = re.findall("\d{16,18}", txt)
  51. if len(res) > 0:
  52. if len(res[0]) == 18:
  53. self.res["id"].text = res[0]
  54. self.res["id"].confidence = r.conf
  55. self.res["gender"].text = "男" if int(res[0][16]) % 2 else "女"
  56. self.res["gender"].confidence = r.conf
  57. if idx < 2:
  58. self.result = self.result[idx + 1:]
  59. self.result.reverse()
  60. else:
  61. self.result = self.result[:idx]
  62. print('---------------------')
  63. for row in self.result:
  64. print(row)
  65. print('---------------------')
  66. return
  67. raise Exception('无法识别')
  68. def name(self):
  69. """
  70. 姓名
  71. """
  72. name_val = None
  73. conf = 0.
  74. for i in range(len(self.result)):
  75. res = self.result[i]
  76. for j in range(len(self.result[i])):
  77. txt = self.result[i][j].txt
  78. conf = self.result[i][j].conf
  79. mini_dis = [99999., 0]
  80. is_name = '姓' in txt or '名' in txt
  81. if is_name and len(res) > 1:
  82. for k in range(len(self.result[i])):
  83. if k == j: continue
  84. p = np.array(res[j].center) - np.array(res[k].center)
  85. min = math.hypot(p[0], p[1])
  86. if min < mini_dis[0]:
  87. mini_dis = [min, k]
  88. conf = self.result[i][k].conf
  89. name_val = self.result[i][mini_dis[1]].txt
  90. elif is_name and len(txt) > 3:
  91. conf = self.result[i][mini_dis[1]].conf
  92. name_val = txt.split("姓名")[-1]
  93. if name_val is None:
  94. raise Exception('无法识别')
  95. if len(name_val) < 5:
  96. self.res["name"] = RecItem(name_val, conf)
  97. else:
  98. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  99. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  100. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  101. for n in range(len(point_unicode)):
  102. point = re.findall(point_unicode[n], name_val)
  103. if len(point) != 0:
  104. name_list = name_val.split(point[0])
  105. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  106. return
  107. def national(self):
  108. """
  109. 性别 <-- id
  110. 民族汉
  111. """
  112. for i in range(len(self.result)):
  113. res = self.result[i]
  114. for j in range(len(self.result[i])):
  115. txt = self.result[i][j].txt
  116. conf = self.result[i][j].conf
  117. mini_dis = [99999., 0]
  118. # 分框
  119. if '族' in txt and len(txt) < 3:
  120. for k in range(len(self.result[i])):
  121. if k == j: continue
  122. p = np.array(res[j].center) - np.array(res[k].center)
  123. min = math.hypot(p[0], p[1])
  124. if min < mini_dis[0]:
  125. mini_dis = [min, k]
  126. self.res["ethnicity"] = RecItem(self.result[i][mini_dis[1]].txt, conf)
  127. return
  128. # 合框
  129. elif '族' in txt:
  130. self.res["ethnicity"] = RecItem(txt.split("族")[-1], conf)
  131. return
  132. # for nation in self.result[1]:
  133. # txt = nation.txt
  134. # conf = nation.conf
  135. # res = re.findall(".*族[\u4e00-\u9fa5]+", txt)
  136. #
  137. # if len(res) > 0:
  138. # self.res["ethnicity"] = RecItem(res[0].split("族")[-1], conf)
  139. # return
  140. def address(self):
  141. """
  142. 身份证地址
  143. """
  144. res = []
  145. confs = []
  146. for row in self.result[2:]:
  147. for r in row:
  148. txt = r.txt
  149. if '性别' in txt or '出生' in txt or '民族' in txt: continue
  150. if (
  151. "住址" in txt
  152. or "址" in txt
  153. or "省" in txt
  154. or "市" in txt
  155. or "县" in txt
  156. or "街" in txt
  157. or "乡" in txt
  158. or "村" in txt
  159. or "镇" in txt
  160. or "区" in txt
  161. or "城" in txt
  162. or "组" in txt
  163. or "旗" in txt
  164. or "号" in txt
  165. or "户" in txt
  166. ):
  167. # if "住址" in txt or "省" in txt or "址" in txt:
  168. if "住址" in txt or "址" in txt:
  169. res.append(txt.split("址")[-1])
  170. else:
  171. res.append(txt)
  172. confs.append(r.conf)
  173. if len(res) > 0:
  174. self.res["address"] = RecItem("".join(res), np.mean(confs))
  175. self.split_addr()
  176. return
  177. raise Exception('无法识别')
  178. def split_addr(self):
  179. print(self.res['address'].text, '=======')
  180. conf = self.res["address"].confidence
  181. df = cpca.transform([self.res["address"].text])
  182. # print(df)
  183. province = df.iloc[0, 0]
  184. city = df.iloc[0, 1]
  185. region = df.iloc[0, 2]
  186. detail = df.iloc[0, 3]
  187. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  188. self.res["address_province"] = RecItem(province, conf)
  189. self.res["address_city"] = RecItem(city, conf)
  190. if detail and "旗" in detail:
  191. temp_region = []
  192. temp_region.insert(0, detail.split("旗")[0] + "旗")
  193. self.res["address_region"] = RecItem(temp_region[0], conf)
  194. self.res["address_detail"] = RecItem(detail.split("旗")[-1], conf)
  195. # elif detail and "旗" in detail:
  196. else:
  197. self.res["address_region"] = RecItem(region, conf)
  198. self.res["address_detail"] = RecItem(detail, conf)
  199. if not self.res['address_region'].text or not self.res['address_detail'].text:
  200. raise Exception('无法识别')
  201. def parse(self):
  202. self.card_no()
  203. self.name()
  204. self.national()
  205. self.birth()
  206. self.address()
  207. return {key: self.res[key].to_dict() for key in self.keys}
  208. class BackParser(Parser):
  209. def __init__(self, ocr_results: List[OcrResult]):
  210. Parser.__init__(self, ocr_results)
  211. def expire_date(self):
  212. for row in self.result:
  213. for r in row:
  214. txt = r.txt
  215. txt = txt.replace('.', '')
  216. res = re.findall('\d{8}\-\d{8}', txt)
  217. if res:
  218. self.res["expire_date"] = RecItem(res[0], r.conf)
  219. return
  220. res = re.findall('\d{8}\-长期', txt)
  221. if res:
  222. self.res["expire_date"] = RecItem(res[0], r.conf)
  223. return
  224. raise Exception('无法识别')
  225. def parse(self):
  226. self.expire_date()
  227. if not self.res["expire_date"].text:
  228. raise Exception("无法识别")
  229. return {key: self.res[key].to_dict() for key in self.keys}