parser.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. import re
  2. import string
  3. from dataclasses import dataclass
  4. from collections import defaultdict
  5. import numpy as np
  6. import cpca
  7. @dataclass
  8. class RecItem:
  9. text: str = ''
  10. confidence: float = 0.
  11. def to_dict(self):
  12. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  13. class Parser(object):
  14. def __init__(self, txts, confs):
  15. self.result = txts
  16. self.confs = confs
  17. assert len(self.result) == len(self.confs), 'result and confs do not match'
  18. self.res = defaultdict(RecItem)
  19. self.res["Name"] = RecItem()
  20. self.res["IDNumber"] = RecItem()
  21. self.res["Address"] = RecItem()
  22. self.res["Gender"] = RecItem()
  23. self.res["Nationality"] = RecItem()
  24. self.res["Birth"] = RecItem()
  25. self.res["expire_date"] = RecItem()
  26. def parse(self):
  27. return self.res
  28. @property
  29. def confidence(self):
  30. return 0.
  31. class FrontParser(Parser):
  32. """
  33. """
  34. def __init__(self, txts, confs):
  35. Parser.__init__(self, txts, confs)
  36. self.result = [
  37. i.replace(" ", "").translate(str.maketrans("", "", string.punctuation))
  38. for i in txts
  39. ]
  40. assert len(self.result) == len(self.confs), 'result and confs do not match'
  41. def birth(self):
  42. addString = []
  43. for i in range(len(self.result)):
  44. txt = self.result[i]
  45. if "出生" in txt or "生" in txt:
  46. # txt = txt.replace("出生", "")
  47. txt = txt.split('生')[-1]
  48. addString.append(txt.strip())
  49. self.res["Birth"] = RecItem("".join(addString), self.confs[i])
  50. break
  51. def card_no(self):
  52. """
  53. 身份证号码
  54. """
  55. for i in range(len(self.result)):
  56. txt = self.result[i]
  57. # 身份证号码
  58. if "X" in txt or "x" in txt:
  59. res = re.findall("\d*[X|x]", txt)
  60. else:
  61. res = re.findall("\d{16,18}", txt)
  62. if len(res) > 0:
  63. if len(res[0]) == 18:
  64. self.res["IDNumber"].text = res[0].replace("号码", "")
  65. self.res["IDNumber"].confidence = self.confs[i]
  66. self.res["Gender"].text = "男" if int(res[0][16]) % 2 else "女"
  67. self.res["Gender"].confidence = self.confs[i]
  68. break
  69. def full_name(self):
  70. """
  71. 身份证姓名
  72. """
  73. for i in range(len(self.result)):
  74. txt = self.result[i]
  75. if ("姓名" in txt) and len(txt) > 2:
  76. res = re.findall("姓名[\u4e00-\u9fa5]{1,4}", txt)
  77. if len(res) > 0:
  78. self.res["Name"].text = res[0].split("姓名")[-1]
  79. self.res["Name"].confidence = self.confs[i]
  80. self.result[i] = "temp" # 避免身份证姓名对地址造成干扰
  81. break
  82. def gender(self):
  83. """
  84. 性别女民族汉
  85. """
  86. if len(self.res["Gender"].text) != 0: return
  87. for i in range(len(self.result)):
  88. txt = self.result[i]
  89. if "男" in txt:
  90. self.res["Gender"] = RecItem("男", self.confs[i])
  91. break
  92. if "女" in txt:
  93. self.res["Gender"] = RecItem("女", self.confs[i])
  94. break
  95. def national(self):
  96. # 性别女民族汉
  97. for i in range(len(self.result)):
  98. txt = self.result[i]
  99. res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  100. if len(res) > 0:
  101. self.res["Nationality"] = RecItem(res[0].split("族")[-1], self.confs[i])
  102. break
  103. def address(self):
  104. """
  105. 身份证地址
  106. """
  107. addString = []
  108. conf = []
  109. for i in range(len(self.result)):
  110. txt = self.result[i]
  111. txt = txt.replace("号码", "")
  112. if "公民" in txt:
  113. txt = "temp"
  114. # 身份证地址
  115. if (
  116. "住址" in txt
  117. or "址" in txt
  118. or "省" in txt
  119. or "市" in txt
  120. or "县" in txt
  121. or "街" in txt
  122. or "乡" in txt
  123. or "村" in txt
  124. or "镇" in txt
  125. or "区" in txt
  126. or "城" in txt
  127. or "组" in txt
  128. or "号" in txt
  129. ):
  130. if "住址" in txt or "省" in txt or "址" in txt:
  131. addString.insert(0, txt.split("址")[-1])
  132. else:
  133. addString.append(txt)
  134. conf.append(self.confs[i])
  135. self.result[i] = "temp"
  136. if len(addString) > 0:
  137. self.res["Address"].text = "".join(addString)
  138. self.res["Address"].confidence = np.mean(conf)
  139. # print(f'addr: {self.res["Address"]}')
  140. def split_addr(self):
  141. if self.res["Address"].text:
  142. conf = self.res["Address"].confidence
  143. df = cpca.transform([self.res["Address"].text])
  144. # print(df)
  145. province = df.iloc[0, 0]
  146. city = df.iloc[0, 1]
  147. region = df.iloc[0, 2]
  148. detail = df.iloc[0, 3]
  149. # print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  150. self.res["address_province"] = RecItem(province, conf)
  151. self.res["address_city"] = RecItem(city, conf)
  152. self.res["address_region"] = RecItem(region, conf)
  153. self.res["address_detail"] = RecItem(detail, conf)
  154. def expire_date(self):
  155. for txt, conf in zip(self.result, self.confs):
  156. txt = txt.replace('.', '')
  157. res = re.findall('\d{8}\-\d{8}', txt)
  158. if res:
  159. self.res["expire_date"] = RecItem(res[0], conf)
  160. break
  161. res = re.findall('\d{8}\-长期', txt)
  162. if res:
  163. self.res["expire_date"] = RecItem(res[0], conf)
  164. break
  165. def predict_name(self):
  166. """
  167. 如果PaddleOCR返回的不是姓名xx连着的,则需要去猜测这个姓名,此处需要改进
  168. """
  169. if len(self.res['Name'].text) > 1: return
  170. for i in range(len(self.result)):
  171. txt = self.result[i]
  172. if 1 < len(txt) < 5:
  173. if (
  174. "性别" not in txt
  175. and "姓名" not in txt
  176. and "民族" not in txt
  177. and "住址" not in txt
  178. and "出生" not in txt
  179. and "号码" not in txt
  180. and "身份" not in txt
  181. ):
  182. result = re.findall("[\u4e00-\u9fa5]{2,4}", txt)
  183. if len(result) > 0:
  184. self.res["Name"] = RecItem(result[0], self.confs[i])
  185. break
  186. @property
  187. def confidence(self):
  188. return np.mean(self.confs)
  189. def parse(self):
  190. self.full_name()
  191. self.national()
  192. self.card_no()
  193. self.address()
  194. self.split_addr()
  195. self.birth()
  196. self.gender()
  197. self.expire_date()
  198. self.predict_name()
  199. if not self.res["IDNumber"].text:
  200. raise Exception("没有识别到身份证号")
  201. return self.res
  202. class BackParser(Parser):
  203. def __init__(self, txts, confs):
  204. Parser.__init__(self, txts, confs)
  205. def expire_date(self):
  206. for txt, conf in zip(self.result, self.confs):
  207. txt = txt.replace('.', '')
  208. res = re.findall('\d{8}\-\d{8}', txt)
  209. if res:
  210. self.res["expire_date"] = RecItem(res[0], conf)
  211. break
  212. res = re.findall('\d{8}\-长期', txt)
  213. if res:
  214. self.res["expire_date"] = RecItem(res[0], conf)
  215. break
  216. @property
  217. def confidence(self):
  218. return np.mean(self.confs)
  219. def parse(self):
  220. self.expire_date()
  221. if not self.res["expire_date"].text:
  222. raise Exception("无法识别")
  223. return self.res