parser.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. import re
  2. import string
  3. from dataclasses import dataclass
  4. from collections import defaultdict
  5. import numpy as np
  6. import cpca
  7. from typing import List
  8. from core.line_parser import OcrResult
  9. @dataclass
  10. class RecItem:
  11. text: str = ''
  12. confidence: float = 0.
  13. def to_dict(self):
  14. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  15. class Parser(object):
  16. def __init__(self, ocr_results: List[OcrResult]):
  17. self.result = ocr_results
  18. self.res = defaultdict(RecItem)
  19. self.keys = ["name", "id", "ethnicity", "gender", "birthday",
  20. "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
  21. for key in self.keys:
  22. self.res[key] = RecItem()
  23. def parse(self):
  24. return self.res
  25. class FrontParser(Parser):
  26. """
  27. """
  28. def __init__(self, ocr_results: List[OcrResult]):
  29. Parser.__init__(self, ocr_results)
  30. def birth(self):
  31. if len(self.res["id"].text) == 18:
  32. # 342423 2001 0 2 1 5 6552
  33. # 012345 6789 10 11 12 13 14
  34. str_num = self.res["id"].text
  35. date = str_num[6:10] + "年" + str_num[10:12] + "月" + str_num[12:14] + "日"
  36. self.res["birthday"] = RecItem(date, self.res['id'].confidence)
  37. def card_no(self):
  38. """
  39. 身份证号码
  40. """
  41. for idx, row in enumerate(self.result):
  42. for r in row:
  43. txt = r.txt
  44. # 身份证号码
  45. if "X" in txt or "x" in txt:
  46. res = re.findall("\d*[X|x]", txt)
  47. else:
  48. res = re.findall("\d{16,18}", txt)
  49. if len(res) > 0:
  50. if len(res[0]) == 18:
  51. self.res["id"].text = res[0]
  52. self.res["id"].confidence = r.conf
  53. self.res["gender"].text = "男" if int(res[0][16]) % 2 else "女"
  54. self.res["gender"].confidence = r.conf
  55. if idx < 2:
  56. self.result = self.result[idx + 1:]
  57. self.result.reverse()
  58. else:
  59. self.result = self.result[:idx]
  60. return
  61. raise Exception('无法识别')
  62. def name(self):
  63. """
  64. 姓名
  65. """
  66. if len(self.result[0]) == 2:
  67. for r in self.result[0]:
  68. if '姓' in r.txt or ('名' in r.txt and len(r.txt) < 3):
  69. continue
  70. else:
  71. self.res['name'] = RecItem(r.txt, r.conf)
  72. return
  73. if len(self.result[0]) == 1:
  74. txt = self.result[0][0].txt
  75. conf = self.result[0][0].conf
  76. if "姓名" in txt:
  77. res = txt[2:]
  78. name_list = []
  79. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  80. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  81. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  82. for n in range(len(point_unicode)):
  83. point = re.findall(point_unicode[n], res)
  84. if len(point) != 0:
  85. name_list = res.split(point[0])
  86. self.res['name'] = RecItem(name_list[0].replace('姓名') + '\u00B7' + name_list[1], conf)
  87. return
  88. res = re.findall("姓名[\u4e00-\u9fa5]{1,7}", txt)
  89. if len(res) > 0:
  90. self.res["name"] = RecItem(res[0].split("姓名")[-1], conf)
  91. return
  92. else:
  93. self.res["name"] = RecItem(txt, conf)
  94. return
  95. raise Exception('无法识别')
  96. def national(self):
  97. # 性别女民族汉
  98. if len(self.result[1]) == 1:
  99. txt = self.result[1][0].txt
  100. conf = self.result[1][0].conf
  101. res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  102. if len(res) > 0:
  103. self.res["ethnicity"] = RecItem(res[0].split("族")[-1], conf)
  104. return
  105. def address(self):
  106. """
  107. 身份证地址
  108. """
  109. res = []
  110. confs = []
  111. for row in self.result[3:]:
  112. for r in row:
  113. txt = r.txt
  114. if (
  115. "住址" in txt
  116. or "址" in txt
  117. or "省" in txt
  118. or "市" in txt
  119. or "县" in txt
  120. or "街" in txt
  121. or "乡" in txt
  122. or "村" in txt
  123. or "镇" in txt
  124. or "区" in txt
  125. or "城" in txt
  126. or "组" in txt
  127. or "旗" in txt
  128. or "号" in txt
  129. ):
  130. # if "住址" in txt or "省" in txt or "址" in txt:
  131. if "住址" in txt or "址" in txt:
  132. res.append(txt.split("址")[-1])
  133. else:
  134. res.append(txt)
  135. confs.append(r.conf)
  136. if len(res) > 0:
  137. self.res["address"] = RecItem("".join(res), np.mean(confs))
  138. self.split_addr()
  139. return
  140. raise Exception('无法识别')
  141. def split_addr(self):
  142. conf = self.res["address"].confidence
  143. df = cpca.transform([self.res["address"].text])
  144. # print(df)
  145. province = df.iloc[0, 0]
  146. city = df.iloc[0, 1]
  147. region = df.iloc[0, 2]
  148. detail = df.iloc[0, 3]
  149. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  150. self.res["address_province"] = RecItem(province, conf)
  151. self.res["address_city"] = RecItem(city, conf)
  152. if detail and "旗" in detail:
  153. temp_region = []
  154. temp_region.insert(0, detail.split("旗")[0] + "旗")
  155. self.res["address_region"] = RecItem(temp_region[0], conf)
  156. self.res["address_detail"] = RecItem(detail.split("旗")[-1], conf)
  157. else:
  158. self.res["address_region"] = RecItem(region, conf)
  159. self.res["address_detail"] = RecItem(detail, conf)
  160. if not self.res['address_region'].text or not self.res['address_detail'].text:
  161. raise Exception('无法识别')
  162. def parse(self):
  163. self.card_no()
  164. self.name()
  165. self.national()
  166. self.birth()
  167. self.address()
  168. return {key: self.res[key].to_dict() for key in self.keys}
  169. class BackParser(Parser):
  170. def __init__(self, ocr_results: List[OcrResult]):
  171. Parser.__init__(self, ocr_results)
  172. def expire_date(self):
  173. for row in self.result:
  174. for r in row:
  175. txt = r.txt
  176. txt = txt.replace('.', '')
  177. res = re.findall('\d{8}\-\d{8}', txt)
  178. if res:
  179. self.res["expire_date"] = RecItem(res[0], r.conf)
  180. return
  181. res = re.findall('\d{8}\-长期', txt)
  182. if res:
  183. self.res["expire_date"] = RecItem(res[0], r.conf)
  184. return
  185. raise Exception('无法识别')
  186. def parse(self):
  187. self.expire_date()
  188. if not self.res["expire_date"].text:
  189. raise Exception("无法识别")
  190. return {key: self.res[key].to_dict() for key in self.keys}