parser.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. import re
  2. import string
  3. from dataclasses import dataclass
  4. from collections import defaultdict
  5. import numpy as np
  6. import cpca
  7. @dataclass
  8. class RecItem:
  9. text: str = ''
  10. confidence: float = 0.
  11. def to_dict(self):
  12. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  13. class Parser(object):
  14. def __init__(self, txts, confs):
  15. self.result = txts
  16. self.confs = confs
  17. assert len(self.result) == len(self.confs), 'result and confs do not match'
  18. self.res = defaultdict(RecItem)
  19. self.res["Name"] = RecItem()
  20. self.res["IDNumber"] = RecItem()
  21. self.res["Address"] = RecItem()
  22. self.res["Gender"] = RecItem()
  23. self.res["Nationality"] = RecItem()
  24. self.res["Birth"] = RecItem()
  25. self.res["expire_date"] = RecItem()
  26. def parse(self):
  27. return self.res
  28. @property
  29. def confidence(self):
  30. return 0.
  31. class FrontParser(Parser):
  32. """
  33. """
  34. def __init__(self, txts, confs):
  35. Parser.__init__(self, txts, confs)
  36. self.result = [
  37. i.replace(" ", "").translate(str.maketrans("", "", string.punctuation))
  38. for i in txts
  39. ]
  40. assert len(self.result) == len(self.confs), 'result and confs do not match'
  41. def birth(self):
  42. if len(self.res["IDNumber"].text) == 18:
  43. # 342423 2001 0 2 1 5 6552
  44. # 012345 6789 10 11 12 13 14
  45. str_num = self.res["IDNumber"].text
  46. date = list(str_num[6:10] + "年" + str_num[10:12] + "月" + str_num[12:14] + "日")
  47. if date[date.index("月") - 2] == "0":
  48. del date[date.index("月") - 2]
  49. if date[date.index("日") - 2] == "0":
  50. del date[date.index("日") - 2]
  51. self.res["Birth"].text = "".join(date)
  52. def card_no(self):
  53. """
  54. 身份证号码
  55. """
  56. for i in range(len(self.result)):
  57. txt = self.result[i]
  58. # 身份证号码
  59. if "X" in txt or "x" in txt:
  60. res = re.findall("\d*[X|x]", txt)
  61. else:
  62. res = re.findall("\d{16,18}", txt)
  63. if len(res) > 0:
  64. if len(res[0]) == 18:
  65. self.res["IDNumber"].text = res[0].replace("号码", "")
  66. self.res["IDNumber"].confidence = self.confs[i]
  67. self.res["Gender"].text = "男" if int(res[0][16]) % 2 else "女"
  68. self.res["Gender"].confidence = self.confs[i]
  69. break
  70. def full_name(self):
  71. """
  72. 身份证姓名
  73. """
  74. for i in range(len(self.result)):
  75. txt = self.result[i]
  76. length = len(txt)
  77. print(length)
  78. if "姓名" in txt:
  79. if len(txt) < 7:
  80. res = re.findall("姓名[\u4e00-\u9fa5]{1,4}", txt)
  81. # 三个字名字
  82. if len(res) > 0:
  83. self.res["Name"].text = res[0].split("姓名")[-1]
  84. self.res["Name"].confidence = self.confs[i]
  85. self.result[i] = "temp" # 避免身份证姓名对地址造成干扰
  86. break
  87. else:
  88. res = txt[2:]
  89. name_list = []
  90. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  91. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  92. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  93. for n in range(len(point_unicode)):
  94. point = re.findall(point_unicode[n], res)
  95. if len(point) != 0:
  96. name_list = res.split(point[0])
  97. for m in range(len(name_list)):
  98. name_list[m] = name_list[m].replace(' ', '')
  99. res = name_list[0] + '\u00B7' + name_list[1]
  100. self.res["Name"].text = res
  101. self.res["Name"].confidence = self.confs[i]
  102. self.result[i] = "temp" # 避免身份证姓名对地址造成干扰
  103. def gender(self):
  104. """
  105. 性别女民族汉
  106. """
  107. if len(self.res["Gender"].text) != 0: return
  108. for i in range(len(self.result)):
  109. txt = self.result[i]
  110. if "男" in txt:
  111. self.res["Gender"] = RecItem("男", self.confs[i])
  112. break
  113. if "女" in txt:
  114. self.res["Gender"] = RecItem("女", self.confs[i])
  115. break
  116. def national(self):
  117. # 性别女民族汉
  118. for i in range(len(self.result)):
  119. txt = self.result[i]
  120. res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  121. if len(res) > 0:
  122. self.res["Nationality"] = RecItem(res[0].split("族")[-1], self.confs[i])
  123. break
  124. def address(self):
  125. """
  126. 身份证地址
  127. """
  128. addString = []
  129. conf = []
  130. for i in range(len(self.result)):
  131. txt = self.result[i]
  132. txt = txt.replace("号码", "")
  133. if "公民" in txt:
  134. txt = "temp"
  135. # 身份证地址
  136. if (
  137. "住址" in txt
  138. or "址" in txt
  139. or "省" in txt
  140. or "市" in txt
  141. or "县" in txt
  142. or "街" in txt
  143. or "乡" in txt
  144. or "村" in txt
  145. or "镇" in txt
  146. or "区" in txt
  147. or "城" in txt
  148. or "组" in txt
  149. or "旗" in txt
  150. or "号" in txt
  151. ):
  152. # if "住址" in txt or "省" in txt or "址" in txt:
  153. if "住址" in txt or "省" in txt or "址" in txt or \
  154. ('市' in txt and len(addString) > 0 and '市' not in addString[0]):
  155. addString.insert(0, txt.split("址")[-1])
  156. else:
  157. addString.append(txt)
  158. conf.append(self.confs[i])
  159. self.result[i] = "temp"
  160. if len(addString) > 0:
  161. self.res["Address"].text = "".join(addString)
  162. self.res["Address"].confidence = np.mean(conf)
  163. # print(f'addr: {self.res["Address"]}')
  164. def split_addr(self):
  165. if self.res["Address"].text:
  166. conf = self.res["Address"].confidence
  167. df = cpca.transform([self.res["Address"].text])
  168. # print(df)
  169. province = df.iloc[0, 0]
  170. city = df.iloc[0, 1]
  171. region = df.iloc[0, 2]
  172. detail = df.iloc[0, 3]
  173. # print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  174. self.res["address_province"] = RecItem(province, conf)
  175. self.res["address_city"] = RecItem(city, conf)
  176. if "旗" in detail:
  177. temp_region = []
  178. temp_region.insert(0, detail.split("旗")[0] + "旗")
  179. self.res["address_region"] = RecItem(temp_region[0], conf)
  180. self.res["address_detail"] = RecItem(detail.split("旗")[-1], conf)
  181. else:
  182. self.res["address_region"] = RecItem(region, conf)
  183. self.res["address_detail"] = RecItem(detail, conf)
  184. def expire_date(self):
  185. for txt, conf in zip(self.result, self.confs):
  186. txt = txt.replace('.', '')
  187. res = re.findall('\d{8}\-\d{8}', txt)
  188. if res:
  189. self.res["expire_date"] = RecItem(res[0], conf)
  190. break
  191. res = re.findall('\d{8}\-长期', txt)
  192. if res:
  193. self.res["expire_date"] = RecItem(res[0], conf)
  194. break
  195. def predict_name(self):
  196. """
  197. 如果PaddleOCR返回的不是姓名xx连着的,则需要去猜测这个姓名,此处需要改进
  198. """
  199. if len(self.res['Name'].text) > 1: return
  200. for i in range(len(self.result)):
  201. txt = self.result[i]
  202. if 1 < len(txt) < 5:
  203. if (
  204. "性别" not in txt
  205. and "姓名" not in txt
  206. and "民族" not in txt
  207. and "住址" not in txt
  208. and "出生" not in txt
  209. and "号码" not in txt
  210. and "身份" not in txt
  211. ):
  212. result = re.findall("[\u4e00-\u9fa5]{2,4}", txt)
  213. if len(result) > 0:
  214. self.res["Name"] = RecItem(result[0], self.confs[i])
  215. break
  216. @property
  217. def confidence(self):
  218. return np.mean(self.confs)
  219. def parse(self):
  220. self.full_name()
  221. self.national()
  222. self.card_no()
  223. self.address()
  224. self.split_addr()
  225. self.birth()
  226. self.gender()
  227. self.expire_date()
  228. self.predict_name()
  229. if not self.res["IDNumber"].text:
  230. raise Exception("没有识别到身份证号")
  231. return self.res
  232. class BackParser(Parser):
  233. def __init__(self, txts, confs):
  234. Parser.__init__(self, txts, confs)
  235. def expire_date(self):
  236. for txt, conf in zip(self.result, self.confs):
  237. txt = txt.replace('.', '')
  238. res = re.findall('\d{8}\-\d{8}', txt)
  239. if res:
  240. self.res["expire_date"] = RecItem(res[0], conf)
  241. break
  242. res = re.findall('\d{8}\-长期', txt)
  243. if res:
  244. self.res["expire_date"] = RecItem(res[0], conf)
  245. break
  246. @property
  247. def confidence(self):
  248. return np.mean(self.confs)
  249. def parse(self):
  250. self.expire_date()
  251. if not self.res["expire_date"].text:
  252. raise Exception("无法识别")
  253. return self.res