idcrad.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
import logging
import re
import string
from collections import defaultdict
from dataclasses import dataclass

import cpca
import numpy as np
  7. @dataclass
  8. class RecItem:
  9. text: str = ''
  10. confidence: float = 0.
  11. def to_dict(self):
  12. return {"text": self.text, "confidence": self.confidence}
  13. class Parser(object):
  14. def __init__(self, txts, confs):
  15. self.result = txts
  16. self.confs = confs
  17. assert len(self.result) == len(self.confs), 'result and confs do not match'
  18. self.res = defaultdict(RecItem)
  19. self.res["Name"] = RecItem()
  20. self.res["IDNumber"] = RecItem()
  21. self.res["Address"] = RecItem()
  22. self.res["Gender"] = RecItem()
  23. self.res["Nationality"] = RecItem()
  24. self.res["Birth"] = RecItem()
  25. self.res["expire_date"] = RecItem()
  26. def parse(self):
  27. return self.res
  28. @property
  29. def confidence(self):
  30. return 0.
  31. class FrontParser(Parser):
  32. """
  33. """
  34. def __init__(self, txts, confs):
  35. Parser.__init__(self, txts, confs)
  36. self.result = [
  37. i.replace(" ", "").translate(str.maketrans("", "", string.punctuation))
  38. for i in txts
  39. ]
  40. assert len(self.result) == len(self.confs), 'result and confs do not match'
  41. def birth(self):
  42. addString = []
  43. for i in range(len(self.result)):
  44. txt = self.result[i]
  45. if "出生" in txt or "生" in txt:
  46. # txt = txt.replace("出生", "")
  47. txt = txt.split('生')[-1]
  48. addString.append(txt.strip())
  49. self.res["Birth"] = RecItem("".join(addString), self.confs[i])
  50. break
  51. def card_no(self):
  52. """
  53. 身份证号码
  54. """
  55. for i in range(len(self.result)):
  56. txt = self.result[i]
  57. # 身份证号码
  58. if "X" in txt or "x" in txt:
  59. res = re.findall("\d*[X|x]", txt)
  60. else:
  61. res = re.findall("\d{16,18}", txt)
  62. if len(res) > 0:
  63. if len(res[0]) == 18:
  64. self.res["IDNumber"].text = res[0].replace("号码", "")
  65. self.res["IDNumber"].confidence = self.confs[i]
  66. self.res["Gender"].text = "男" if int(res[0][16]) % 2 else "女"
  67. self.res["Gender"].confidence = self.confs[i]
  68. break
  69. def full_name(self):
  70. """
  71. 身份证姓名
  72. """
  73. for i in range(len(self.result)):
  74. txt = self.result[i]
  75. if ("姓名" or "名" in txt) and len(txt) > 2:
  76. res = re.findall("名[\u4e00-\u9fa5]{1,4}", txt)
  77. if len(res) > 0:
  78. self.res["Name"].text = res[0].split("名")[-1]
  79. self.res["Name"].confidence = self.confs[i]
  80. self.result[i] = "temp" # 避免身份证姓名对地址造成干扰
  81. break
  82. def gender(self):
  83. """
  84. 性别女民族汉
  85. """
  86. if len(self.res["Gender"].text) != 0: return
  87. for i in range(len(self.result)):
  88. txt = self.result[i]
  89. if "男" in txt:
  90. self.res["Gender"] = RecItem("男", self.confs[i])
  91. break
  92. if "女" in txt:
  93. self.res["Gender"] = RecItem("女", self.confs[i])
  94. break
  95. def national(self):
  96. # 性别女民族汉
  97. for i in range(len(self.result)):
  98. txt = self.result[i]
  99. res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  100. if len(res) > 0:
  101. self.res["Nationality"] = RecItem(res[0].split("族")[-1], self.confs[i])
  102. break
  103. def address(self):
  104. """
  105. 身份证地址
  106. """
  107. addString = []
  108. conf = []
  109. for i in range(len(self.result)):
  110. txt = self.result[i]
  111. txt = txt.replace("号码", "")
  112. if "公民" in txt:
  113. txt = "temp"
  114. # 身份证地址
  115. if (
  116. "住址" in txt
  117. or "址" in txt
  118. or "省" in txt
  119. or "市" in txt
  120. or "县" in txt
  121. or "街" in txt
  122. or "乡" in txt
  123. or "村" in txt
  124. or "镇" in txt
  125. or "区" in txt
  126. or "城" in txt
  127. or "组" in txt
  128. or "号" in txt
  129. ):
  130. if "住址" in txt or "省" in txt or "址" in txt:
  131. addString.insert(0, txt.split("址")[-1])
  132. else:
  133. addString.append(txt)
  134. conf.append(self.confs[i])
  135. self.result[i] = "temp"
  136. # print(addString)
  137. if len(addString) > 0:
  138. self.res["Address"].text = "".join(addString)
  139. self.res["Address"].confidence = np.mean(conf)
  140. print(f'addr: {self.res["Address"]}')
  141. def split_addr(self):
  142. if self.res["Address"].text:
  143. conf = self.res["Address"].confidence
  144. print('split_addr', self.res["Address"].text)
  145. df = cpca.transform([self.res["Address"].text])
  146. print(df)
  147. province = df.iloc[0, 0]
  148. city = df.iloc[0, 1]
  149. region = df.iloc[0, 2]
  150. detail = df.iloc[0, 3]
  151. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  152. self.res["address_province"] = RecItem(province, conf)
  153. self.res["address_city"] = RecItem(city, conf)
  154. self.res["address_region"] = RecItem(region, conf)
  155. self.res["address_detail"] = RecItem(detail, conf)
  156. def expire_date(self):
  157. for txt, conf in zip(self.result, self.confs):
  158. print(txt)
  159. res = re.findall('\d{4}\.\d{2}\.\d{2}\-\d{4}\.\d{2}\.\d{2}', txt)
  160. print(res)
  161. if res:
  162. self.res["expire_date"] = RecItem(res[0], conf)
  163. def predict_name(self):
  164. """
  165. 如果PaddleOCR返回的不是姓名xx连着的,则需要去猜测这个姓名,此处需要改进
  166. """
  167. if self.res['Name']: return
  168. for i in range(len(self.result)):
  169. txt = self.result[i]
  170. if self.res["Name"] == "":
  171. if len(txt) > 1 and len(txt) < 5:
  172. if (
  173. "性别" not in txt
  174. and "姓名" not in txt
  175. and "民族" not in txt
  176. and "住址" not in txt
  177. and "出生" not in txt
  178. and "号码" not in txt
  179. and "身份" not in txt
  180. ):
  181. result = re.findall("[\u4e00-\u9fa5]{2,4}", txt)
  182. if len(result) > 0:
  183. self.res["Name"] = result[0]
  184. break
  185. @property
  186. def confidence(self):
  187. return np.mean(self.confs)
  188. def parse(self):
  189. self.full_name()
  190. self.national()
  191. self.card_no()
  192. self.address()
  193. self.split_addr()
  194. # self.predict_name()
  195. self.birth()
  196. self.gender()
  197. self.expire_date()
  198. return self.res
  199. class BackParser(Parser):
  200. def __init__(self, txts, confs):
  201. Parser.__init__(self, txts, confs)
  202. def expire_date(self):
  203. for txt, conf in zip(self.result, self.confs):
  204. print(txt)
  205. res = re.findall('\d{4}\.\d{2}\.\d{2}\-\d{4}\.\d{2}\.\d{2}', txt)
  206. print(res)
  207. if res:
  208. self.res["expire_date"] = RecItem(res[0], conf)
  209. @property
  210. def confidence(self):
  211. return np.mean(self.confs)
  212. def parse(self):
  213. self.expire_date()
  214. return self.res