parser.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  import logging
  import re
  from collections import defaultdict
  from dataclasses import dataclass
  from typing import List

  import cpca
  import numpy as np
  from zhon.hanzi import punctuation

  from core.line_parser import OcrResult
  @dataclass
  class RecItem:
      """One recognised field: its text and the OCR confidence attached to it."""

      # Recognised text; empty until a parser fills the field in.
      text: str = ''
      # Confidence score copied or averaged from the OCR boxes the text came from.
      confidence: float = 0.

      def to_dict(self):
          """Return the field as a plain ``{"text", "confidence"}`` dict.

          The text is stripped of surrounding whitespace; a missing (``None``)
          text is reported as an empty string instead of raising.  The
          confidence is passed through ``np.nan_to_num`` (so NaN becomes 0.0)
          and converted to a built-in ``float`` so that NumPy scalar types
          such as ``np.float32`` do not leak into the result.
          """
          text = (self.text or '').strip()
          confidence = float(np.nan_to_num(self.confidence))
          return {"text": text, "confidence": confidence}
  class Parser(object):
      """Base class holding the OCR rows and one ``RecItem`` per output field."""

      def __init__(self, ocr_results: List[List[OcrResult]]):
          # Names of every field a parser may report, in output order.
          self.keys = ["name", "id", "ethnicity", "gender", "birthday",
                       "address", "address_province", "address_city",
                       "address_region", "address_detail", "expire_date"]
          # OCR output: a list of rows, each row a list of recognised boxes.
          self.result = ocr_results
          # Start every known field as an empty RecItem; because this is a
          # defaultdict, looking up any other key also yields an empty RecItem.
          self.res = defaultdict(RecItem)
          self.res.update((field_name, RecItem()) for field_name in self.keys)

      def parse(self):
          """Return the field mapping as it stands; the base class extracts nothing."""
          return self.res
  class FrontParser(Parser):
      """Extract the printed fields from the front side of an ID card.

      ``parse()`` runs the extractors in a fixed order.  ``card_no`` runs first
      because it trims ``self.result``; the other extractors then address rows
      by position in the trimmed list: row 0 for the name, row 1 for
      gender/ethnicity, and rows 2 onwards for the birth date and the address.
      """

      _log = logging.getLogger(__name__)

      # A run of 10-18 digits optionally followed by check-letter look-alikes.
      _ID_CANDIDATE = re.compile(r"\d{10,18}[Xx×]*")
      # A well-formed number: 17 digits plus a digit or 'X' check character.
      _VALID_ID = re.compile(r"\d{17}[\dX]")
      # Year, month and day digits, each followed by at least one Chinese character.
      _BIRTH_DATE = re.compile(
          r".*(\d{4})[\u4E00-\u9FA5]+(\d{1,2})[\u4E00-\u9FA5]+(\d{1,2})")
      _CHINESE_RUN = re.compile(r"[\u4E00-\u9FA5]+")

      # Dot-like glyphs that may stand in for the middle dot of a long name.
      # The last entry is U+10101, which needs the eight-digit \U escape
      # ("\u10101" would be U+1010 followed by the digit "1").
      _NAME_DOTS = ("\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
                    "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
                    "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101")

      # Known OCR misreadings in addresses, applied in order as (wrong, right).
      _ADDRESS_FIXES = (('呼呼', '呼'), ('霸桥', '灞桥'),
                        ('漳尔市', '淖尔市'), ('屹旦', '圪旦'), ('营家村', '菅家村'),
                        ('四四川', '四川'), ('止口', ''), ('装柏村', '裴柏村'),
                        ('安安徽', '安徽'), ('吃梁村', '圪梁村'), ('中熬本台', '中敖本台'))

      # Deletes every character of zhon's ``punctuation`` string in one pass.
      _PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

      def __init__(self, ocr_results: List[List[OcrResult]]):
          super().__init__(ocr_results)
          self.id_ok = True

      def _row_text(self, idx):
          """Return ``(joined text, mean confidence)`` for row ``idx``.

          Returns ``None`` when the row does not exist or is empty, so callers
          can skip a field instead of failing with ``IndexError``.
          """
          if idx >= len(self.result) or not self.result[idx]:
              return None
          row = self.result[idx]
          return ''.join(r.txt for r in row), np.mean([r.conf for r in row])

      def birth(self):
          """Fill in the birthday, and the gender when no ID number was read.

          With an 18-character ID number the birthday is taken from characters
          6-13 of the number.  Otherwise the date is parsed from row 2 and the
          gender is read from row 1 ('男' if that character occurs, else '女').
          """
          id_item = self.res["id"]
          if len(id_item.text) == 18:
              # Layout: 6 leading characters, YYYY, MM, DD, then 4 trailing ones.
              number = id_item.text
              date = number[6:10] + "年" + number[10:12] + "月" + number[12:14] + "日"
              self.res["birthday"] = RecItem(date, id_item.confidence)
              return

          # Fallback: read the printed birth date.
          date_row = self._row_text(2)
          if date_row is not None:
              txt, conf = date_row
              match = self._BIRTH_DATE.match(txt)
              if match:
                  year, month, day = match.groups()
                  self.res['birthday'] = RecItem(f'{year}年{month}月{day}日', conf)

          # Fallback: read the printed gender.
          gender_row = self._row_text(1)
          if gender_row is not None:
              txt, conf = gender_row
              self.res['gender'] = RecItem('男' if '男' in txt else '女', conf)

      def card_no(self):
          """Locate the ID number and trim ``self.result`` around its row.

          The first box containing a run of 10+ digits decides the trimming:
          if it sits in row 0 or 1, only the rows after it are kept, in
          reversed order (presumably an upside-down scan -- TODO confirm);
          otherwise only the rows before it are kept.  The ID and the gender
          derived from its 17th character are stored only when the match is a
          well-formed 18-character number.  Scanning stops at that first match
          either way.
          """
          for idx, row in enumerate(self.result):
              for r in row:
                  found = self._ID_CANDIDATE.findall(r.txt)
                  self._log.debug("id candidates in row %d: %s", idx, found)
                  if not found:
                      continue

                  if idx < 2:
                      # Slicing copies, so the caller's list is not reversed.
                      self.result = self.result[idx + 1:]
                      self.result.reverse()
                  else:
                      self.result = self.result[:idx]
                  if self._log.isEnabledFor(logging.DEBUG):
                      for kept in self.result:
                          self._log.debug("row kept after id: %s",
                                          ''.join(box.txt for box in kept))

                  # OCR may read the check letter as 'x' or '×'.
                  number = found[0].replace('×', 'X').replace('x', 'X')
                  # Requiring 17 digits guarantees the gender digit below is
                  # numeric; malformed matches are treated as "no ID found".
                  if self._VALID_ID.fullmatch(number):
                      self.res["id"].text = number
                      self.res["id"].confidence = r.conf
                      self.res["gender"].text = "男" if int(number[16]) % 2 else "女"
                      self.res["gender"].confidence = r.conf
                  return

      @staticmethod
      def extract_zhon(txt):
          """Return the first run of Chinese characters in ``txt``, or ``None``."""
          match = FrontParser._CHINESE_RUN.search(txt)
          if match:
              return match.group(0)

      def _store_name(self, name_val, conf):
          """Store ``name_val`` as the name field.

          Values shorter than 5 characters are stored unchanged.  Longer values
          are stored only if they contain one of the dot-like separators, which
          is then normalised to U+00B7; every part of the name is kept.  Longer
          values without a separator are left unset.
          """
          if len(name_val) < 5:
              self.res["name"] = RecItem(name_val, conf)
              return
          for dot in self._NAME_DOTS:
              if dot in name_val:
                  self.res['name'] = RecItem('\u00B7'.join(name_val.split(dot)), conf)
                  return

      def name(self):
          """Read the holder's name from row 0.

          When the row has several boxes, any box containing '姓' or '名' is
          treated as the label and read as '姓名'.  The OCR objects themselves
          are not modified.  The name is whatever follows the label.
          """
          if not self.result or not self.result[0]:
              return
          row = self.result[0]
          texts = [r.txt for r in row]
          if len(row) > 1:
              texts = ['姓名' if ('姓' in t or '名' in t) else t for t in texts]
          txt = ''.join(texts)
          conf = np.array([r.conf for r in row]).mean()

          # Both patterns are tried in order and a later hit replaces an
          # earlier one.  NOTE(review): a name that itself contains '名' is
          # cut at that character by the second pattern.
          for pattern in (r'.*姓.', r'.*名'):
              parts = re.split(pattern, txt)
              if len(parts) == 2:
                  self._store_name(parts[-1], conf)

      def national(self):
          """Read the ethnicity from row 1: the text after '民' plus one
          character, or failing that the text after '族'."""
          row = self._row_text(1)
          if row is None:
              return
          txt, conf = row
          for pattern in (r'.*民.', r'.*族'):
              parts = re.split(pattern, txt)
              if len(parts) == 2:
                  self.res['ethnicity'] = RecItem(parts[-1], conf)
                  return

      def address(self):
          """Assemble the address from rows 2 onwards and split it into parts.

          Raises:
              Exception: with message '无法识别' when no address text is found.
          """
          pieces = []
          confs = []
          for row in self.result[2:]:
              for r in row:
                  txt = r.txt
                  # Skip boxes belonging to the gender / birth / ethnicity lines.
                  # NOTE(review): the '年' test also drops address fragments
                  # that contain that character.
                  if any(marker in txt for marker in ('性别', '出生', '民族', '年')):
                      continue
                  txt = txt.translate(self._PUNCTUATION_TABLE)
                  if '址' in txt and not pieces:
                      # First fragment: drop the address label itself.
                      txt = txt.split('址')[-1]
                  pieces.append(txt)
                  confs.append(r.conf)

          if not pieces:
              raise Exception('无法识别')

          txt = ''.join(pieces).split('址')[-1]
          for wrong, right in self._ADDRESS_FIXES:
              txt = txt.replace(wrong, right)
          self.res["address"] = RecItem(txt, np.mean(confs))
          self.split_addr()

      @staticmethod
      def _text_or_empty(value):
          """Return ``value`` if it is a string, else ``''`` (covers None/NaN cells)."""
          return value if isinstance(value, str) else ''

      def split_addr(self):
          """Split the stored address into province, city, region and detail.

          The first four columns of the first row returned by
          ``cpca.transform`` are read as province, city, region and detail;
          cells that are not strings become empty strings.  If the detail
          contains '旗', the text up to and including the first '旗' becomes the
          region and everything after it stays in the detail.
          """
          address = self.res["address"]
          conf = address.confidence
          self._log.debug("address to split: %s", address.text)

          df = cpca.transform([address.text])
          province, city, region, detail = (
              self._text_or_empty(df.iloc[0, col]) for col in range(4))
          self._log.debug("province: %s, city: %s, region: %s, detail: %s",
                          province, city, region, detail)

          # NOTE(review): this replaces the region even when cpca already
          # returned one -- confirm that is intended.
          if "旗" in detail:
              head, _, detail = detail.partition("旗")
              region = head + "旗"

          self.res["address_province"] = RecItem(province, conf)
          self.res["address_city"] = RecItem(city, conf)
          self.res["address_region"] = RecItem(region, conf)
          self.res["address_detail"] = RecItem(detail, conf)

      def parse(self):
          """Run every extractor and return ``{field: {"text", "confidence"}}``."""
          self.card_no()
          self.name()
          self.national()
          self.birth()
          self.address()
          return {key: self.res[key].to_dict() for key in self.keys}
  class BackParser(Parser):
      """Extract the validity period from the back side of an ID card."""

      # Eight start-date digits, a hyphen, then the four-digit expiry year.
      _DATE_RANGE = re.compile(r'\d{8}-\d{4}')
      # Eight start-date digits, a hyphen, then '长期'.
      _LONG_TERM = re.compile(r'\d{8}-长期')

      def __init__(self, ocr_results: List[List[OcrResult]]):
          super().__init__(ocr_results)

      def expire_date(self):
          """Find the validity period and store it under ``expire_date``.

          Dots are removed from each box before matching.  For a dated period
          only the expiry year is taken from the OCR text; its month and day
          are copied from the start date (characters 4-7 of the match).

          Raises:
              Exception: with message '无法识别' when no box matches.
          """
          for row in self.result:
              for r in row:
                  txt = r.txt.replace('.', '')

                  found = self._DATE_RANGE.findall(txt)
                  if found:
                      start_and_year = found[0]
                      self.res["expire_date"] = RecItem(
                          start_and_year + start_and_year[4:8], r.conf)
                      return

                  found = self._LONG_TERM.findall(txt)
                  if found:
                      self.res["expire_date"] = RecItem(found[0], r.conf)
                      return
          raise Exception('无法识别')

      def parse(self):
          """Run the extractor and return ``{field: {"text", "confidence"}}``.

          Raises:
              Exception: when the validity period cannot be recognised.
          """
          self.expire_date()
          # Kept as a safeguard in case the stored text is empty.
          if not self.res["expire_date"].text:
              raise Exception("无法识别")
          return {key: self.res[key].to_dict() for key in self.keys}
  213. return {key: self.res[key].to_dict() for key in self.keys}