# parser.py
import logging
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np

from core.line_parser import OcrResult
  9. @dataclass
  10. class RecItem:
  11. text: str = ''
  12. confidence: float = 0.
  13. def to_dict(self):
  14. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  15. class Parser(object):
  16. def __init__(self, ocr_results: List[OcrResult]):
  17. self.result = ocr_results
  18. self.res = defaultdict(RecItem)
  19. self.keys = ["name", "id", "ethnicity", "gender", "birthday",
  20. "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
  21. for key in self.keys:
  22. self.res[key] = RecItem()
  23. def parse(self):
  24. return self.res
  25. class FrontParser(Parser):
  26. """
  27. """
  28. def __init__(self, ocr_results: List[OcrResult]):
  29. Parser.__init__(self, ocr_results)
  30. def birth(self):
  31. if len(self.res["id"].text) == 18:
  32. # 342423 2001 0 2 1 5 6552
  33. # 012345 6789 10 11 12 13 14
  34. str_num = self.res["id"].text
  35. date = str_num[6:10] + "年" + str_num[10:12] + "月" + str_num[12:14] + "日"
  36. self.res["birthday"] = RecItem(date, self.res['id'].confidence)
  37. def card_no(self):
  38. """
  39. 身份证号码
  40. """
  41. for idx, row in enumerate(self.result):
  42. for r in row:
  43. txt = r.txt
  44. # 身份证号码
  45. if "X" in txt or "x" in txt:
  46. res = re.findall("\d*[X|x]", txt)
  47. else:
  48. res = re.findall("\d{16,18}", txt)
  49. if len(res) > 0:
  50. if len(res[0]) == 18:
  51. self.res["id"].text = res[0]
  52. self.res["id"].confidence = r.conf
  53. self.res["gender"].text = "男" if int(res[0][16]) % 2 else "女"
  54. self.res["gender"].confidence = r.conf
  55. if idx < 2:
  56. self.result = self.result[idx + 1:]
  57. self.result.reverse()
  58. else:
  59. self.result = self.result[:idx]
  60. return
  61. raise Exception('无法识别')
  62. def name(self):
  63. """
  64. 姓名
  65. """
  66. if len(self.result[0]) == 2:
  67. for r in self.result[0]:
  68. if '姓' in r.txt or ('名' in r.txt and len(r.txt) < 3):
  69. continue
  70. else:
  71. self.res['name'] = RecItem(r.txt, r.conf)
  72. return
  73. if len(self.result[0]) == 1:
  74. txt = self.result[0][0].txt
  75. conf = self.result[0][0].conf
  76. if "姓名" in txt:
  77. res = txt[2:]
  78. name_list = []
  79. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  80. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  81. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  82. for n in range(len(point_unicode)):
  83. point = re.findall(point_unicode[n], res)
  84. if len(point) != 0:
  85. name_list = res.split(point[0])
  86. self.res['name'] = RecItem(name_list[0].replace('姓名') + '\u00B7' + name_list[1], conf)
  87. return
  88. res = re.findall("姓名[\u4e00-\u9fa5]{1,7}", txt)
  89. if len(res) > 0:
  90. self.res["name"] = RecItem(res[0].split("姓名")[-1], conf)
  91. return
  92. else:
  93. self.res["name"] = RecItem(txt, conf)
  94. return
  95. raise Exception('无法识别')
  96. def national(self):
  97. """
  98. 性别 <-- id
  99. 民族汉
  100. """
  101. for nation in self.result[1]:
  102. txt = nation.txt
  103. conf = nation.conf
  104. res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  105. if len(res) > 0:
  106. self.res["ethnicity"] = RecItem(res[0].split("族")[-1], conf)
  107. return
  108. # if len(self.result[1]) == 1:
  109. # txt = self.result[1][0].txt
  110. # conf = self.result[1][0].conf
  111. # res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  112. #
  113. # if len(res) > 0:
  114. # self.res["ethnicity"] = RecItem(res[0].split("族")[-1], conf)
  115. # return
  116. def address(self):
  117. """
  118. 身份证地址
  119. """
  120. res = []
  121. confs = []
  122. for row in self.result[3:]:
  123. for r in row:
  124. txt = r.txt
  125. if (
  126. "住址" in txt
  127. or "址" in txt
  128. or "省" in txt
  129. or "市" in txt
  130. or "县" in txt
  131. or "街" in txt
  132. or "乡" in txt
  133. or "村" in txt
  134. or "镇" in txt
  135. or "区" in txt
  136. or "城" in txt
  137. or "组" in txt
  138. or "旗" in txt
  139. or "号" in txt
  140. ):
  141. # if "住址" in txt or "省" in txt or "址" in txt:
  142. if "住址" in txt or "址" in txt:
  143. res.append(txt.split("址")[-1])
  144. else:
  145. res.append(txt)
  146. confs.append(r.conf)
  147. if len(res) > 0:
  148. self.res["address"] = RecItem("".join(res), np.mean(confs))
  149. self.split_addr()
  150. return
  151. raise Exception('无法识别')
  152. def split_addr(self):
  153. print(self.res['address'].text, '=======')
  154. conf = self.res["address"].confidence
  155. df = cpca.transform([self.res["address"].text])
  156. # print(df)
  157. province = df.iloc[0, 0]
  158. city = df.iloc[0, 1]
  159. region = df.iloc[0, 2]
  160. detail = df.iloc[0, 3]
  161. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  162. self.res["address_province"] = RecItem(province, conf)
  163. self.res["address_city"] = RecItem(city, conf)
  164. if detail and "旗" in detail:
  165. temp_region = []
  166. temp_region.insert(0, detail.split("旗")[0] + "旗")
  167. self.res["address_region"] = RecItem(temp_region[0], conf)
  168. self.res["address_detail"] = RecItem(detail.split("旗")[-1], conf)
  169. else:
  170. self.res["address_region"] = RecItem(region, conf)
  171. self.res["address_detail"] = RecItem(detail, conf)
  172. if not self.res['address_region'].text or not self.res['address_detail'].text:
  173. raise Exception('无法识别')
  174. def parse(self):
  175. self.card_no()
  176. self.name()
  177. self.national()
  178. self.birth()
  179. self.address()
  180. return {key: self.res[key].to_dict() for key in self.keys}
  181. class BackParser(Parser):
  182. def __init__(self, ocr_results: List[OcrResult]):
  183. Parser.__init__(self, ocr_results)
  184. def expire_date(self):
  185. for row in self.result:
  186. for r in row:
  187. txt = r.txt
  188. txt = txt.replace('.', '')
  189. res = re.findall('\d{8}\-\d{8}', txt)
  190. if res:
  191. self.res["expire_date"] = RecItem(res[0], r.conf)
  192. return
  193. res = re.findall('\d{8}\-长期', txt)
  194. if res:
  195. self.res["expire_date"] = RecItem(res[0], r.conf)
  196. return
  197. raise Exception('无法识别')
  198. def parse(self):
  199. self.expire_date()
  200. if not self.res["expire_date"].text:
  201. raise Exception("无法识别")
  202. return {key: self.res[key].to_dict() for key in self.keys}