parser.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult
  9. @dataclass
  10. class RecItem:
  11. text: str = ''
  12. confidence: float = 0.
  13. def to_dict(self):
  14. return {"text": self.text.strip(), "confidence": np.nan_to_num(self.confidence)}
  15. class Parser(object):
  16. def __init__(self, ocr_results: List[List[OcrResult]]):
  17. self.result = ocr_results
  18. self.res = defaultdict(RecItem)
  19. self.keys = ["name", "id", "ethnicity", "gender", "birthday",
  20. "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
  21. for key in self.keys:
  22. self.res[key] = RecItem()
  23. def parse(self):
  24. return self.res
  25. class FrontParser(Parser):
  26. """
  27. 出生年月日
  28. """
  29. def __init__(self, ocr_results: List[List[OcrResult]]):
  30. Parser.__init__(self, ocr_results)
  31. self.id_ok = True
  32. def birth(self):
  33. if len(self.res["id"].text) == 18:
  34. # 342423 2001 0 2 1 5 6552
  35. # 012345 6789 10 11 12 13 14
  36. str_num = self.res["id"].text
  37. date = str_num[6:10] + "年" + str_num[10:12] + "月" + str_num[12:14] + "日"
  38. self.res["birthday"] = RecItem(date, self.res['id'].confidence)
  39. else:
  40. # 出生年月
  41. idx = 2
  42. txt = ''.join([r.txt for r in self.result[idx]])
  43. conf = np.mean([r.conf for r in self.result[idx]])
  44. res = re.match('.*(\d{4})[\u4E00-\u9FA5]+(\d{1,2})[\u4E00-\u9FA5]+(\d{1,2})', txt)
  45. if res and len(res.groups()) == 3:
  46. year, month, day = res.groups()
  47. self.res['birthday'] = RecItem(f'{year}年{month}月{day}日', conf)
  48. # 性别
  49. idx = 1
  50. txt = ''.join([r.txt for r in self.result[idx]])
  51. conf = np.mean([r.conf for r in self.result[idx]])
  52. if '男' in txt:
  53. self.res['gender'] = RecItem('男', conf)
  54. else:
  55. self.res['gender'] = RecItem('女', conf)
  56. def card_no(self):
  57. """
  58. 身份证号码
  59. """
  60. for idx, row in enumerate(self.result):
  61. for r in row:
  62. txt = r.txt
  63. # 身份证号码
  64. res = re.findall("\d{10,18}[X|x|×]*", txt)
  65. print(res, '~~~~~')
  66. if res:
  67. if idx < 2:
  68. self.result = self.result[idx + 1:]
  69. self.result.reverse()
  70. else:
  71. self.result = self.result[:idx]
  72. print('--------after id no -------------')
  73. for row in self.result:
  74. print('---')
  75. print(''.join([r.txt for r in row]))
  76. print('--------after id no -------------')
  77. if len(res[0]) == 18:
  78. for x in ['×', 'x']:
  79. res[0] = res[0].replace(x, 'X')
  80. self.res["id"].text = res[0]
  81. self.res["id"].confidence = r.conf
  82. self.res["gender"].text = "男" if int(res[0][16]) % 2 else "女"
  83. self.res["gender"].confidence = r.conf
  84. return
  85. else:
  86. return
  87. # raise Exception('无法识别')
  88. @staticmethod
  89. def extract_zhon(txt):
  90. # 提取中文字
  91. res = re.findall('[\u4E00-\u9FA5]+', txt)
  92. if res:
  93. return res[0]
  94. def name(self):
  95. def parser_name(name_val):
  96. if len(name_val) < 5:
  97. self.res["name"] = RecItem(name_val, conf)
  98. else:
  99. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  100. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  101. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  102. for item in point_unicode:
  103. point = re.findall(item, name_val)
  104. if len(point) != 0:
  105. name_list = name_val.split(point[0])
  106. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  107. return
  108. if len(self.result[0]) > 1:
  109. for r in self.result[0]:
  110. if '姓' in r.txt or '名' in r.txt:
  111. r.txt = '姓名'
  112. txt = ''.join([r.txt for r in self.result[0]])
  113. conf = np.array([r.conf for r in self.result[0]]).mean()
  114. res = re.split('.*姓.', txt)
  115. if len(res) == 2:
  116. parser_name(res[-1])
  117. res = re.split('.*名', txt)
  118. if len(res) == 2:
  119. parser_name(res[-1])
  120. def national(self):
  121. """
  122. 民族汉
  123. """
  124. txt = ''.join([r.txt for r in self.result[1]])
  125. conf = np.array([r.conf for r in self.result[1]]).mean()
  126. res = re.split('.*民.', txt)
  127. if len(res) == 2:
  128. self.res['ethnicity'] = RecItem(res[-1], conf)
  129. return
  130. res = re.split('.*族', txt)
  131. if len(res) == 2:
  132. self.res['ethnicity'] = RecItem(res[-1], conf)
  133. return
  134. def address(self):
  135. """
  136. 身份证地址
  137. """
  138. res = []
  139. confs = []
  140. for row in self.result[2:]:
  141. for r in row:
  142. txt = r.txt
  143. if '性别' in txt or '出生' in txt or '民族' in txt or '年' in txt: continue
  144. for i in punctuation:
  145. txt = txt.replace(i, '')
  146. # if (
  147. # "住址" in txt
  148. # or "址" in txt
  149. # or "省" in txt
  150. # or "市" in txt
  151. # or "县" in txt
  152. # or "街" in txt
  153. # or "乡" in txt
  154. # or "村" in txt
  155. # or "镇" in txt
  156. # or "区" in txt
  157. # or "城" in txt
  158. # or "组" in txt
  159. # or "旗" in txt
  160. # or "号" in txt
  161. # or "户" in txt
  162. # or "室" in txt
  163. # or "嘎查" in txt
  164. # or "楼" in txt
  165. # or "路" in txt
  166. # ):
  167. # if "住址" in txt or "省" in txt or "址" in txt:
  168. if ("住址" in txt or "址" in txt) and len(res) == 0:
  169. res.append(txt.split("址")[-1])
  170. else:
  171. res.append(txt)
  172. confs.append(r.conf)
  173. if len(res) > 0:
  174. error_dict = [('呼呼', '呼'), ('霸桥', '灞桥'),
  175. ('漳尔市', '淖尔市'), ('屹旦', '圪旦'), ('营家村', '菅家村'),
  176. ('四四川', '四川'), ('止口', ''), ('装柏村', '裴柏村'),
  177. ('安安徽', '安徽'), ('吃梁村', '圪梁村'), ('中熬本台', '中敖本台')]
  178. txt = "".join(res)
  179. txt = txt.split("址")[-1]
  180. for k, v in error_dict:
  181. txt = txt.replace(k, v)
  182. self.res["address"] = RecItem(txt, np.mean(confs))
  183. self.split_addr()
  184. return
  185. raise Exception('无法识别')
  186. def split_addr(self):
  187. print(self.res['address'].text, '=======')
  188. conf = self.res["address"].confidence
  189. df = cpca.transform([self.res["address"].text])
  190. province = df.iloc[0, 0]
  191. df = df.replace([None], [''], regex=True)
  192. city = df.iloc[0, 1]
  193. region = df.iloc[0, 2]
  194. detail = df.iloc[0, 3]
  195. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  196. self.res["address_province"] = RecItem(province, conf)
  197. self.res["address_city"] = RecItem(city, conf)
  198. if detail and "旗" in detail:
  199. temp_region = []
  200. temp_region.insert(0, detail.split("旗")[0] + "旗")
  201. self.res["address_region"] = RecItem(temp_region[0], conf)
  202. self.res["address_detail"] = RecItem(detail.split("旗")[-1], conf)
  203. # elif detail and "旗" in detail:
  204. else:
  205. self.res["address_region"] = RecItem(region, conf)
  206. self.res["address_detail"] = RecItem(detail, conf)
  207. # if not self.res['address_region'].text or not self.res['address_detail'].text:
  208. # raise Exception('无法识别区域或者地址详情')
  209. def parse(self):
  210. self.card_no()
  211. self.name()
  212. self.national()
  213. self.birth()
  214. self.address()
  215. return {key: self.res[key].to_dict() for key in self.keys}
  216. class BackParser(Parser):
  217. def __init__(self, ocr_results: List[List[OcrResult]]):
  218. Parser.__init__(self, ocr_results)
  219. def expire_date(self):
  220. for row in self.result:
  221. for r in row:
  222. txt = r.txt
  223. txt = txt.replace('.', '')
  224. res = re.findall('\d{8}\-\d{4}', txt)
  225. if res:
  226. self.res["expire_date"] = RecItem(res[0] + res[0][4:8], r.conf)
  227. return
  228. res = re.findall('\d{8}\-长期', txt)
  229. if res:
  230. self.res["expire_date"] = RecItem(res[0], r.conf)
  231. return
  232. raise Exception('无法识别')
  233. def parse(self):
  234. self.expire_date()
  235. if not self.res["expire_date"].text:
  236. raise Exception("无法识别")
  237. return {key: self.res[key].to_dict() for key in self.keys}