# idcrad.py
import json
import re
import string
  4. class IdCardStraight:
  5. """
  6. """
  7. def __init__(self, result):
  8. self.result = [
  9. i.replace(" ", "").translate(str.maketrans("", "", string.punctuation))
  10. for i in result
  11. ]
  12. self.out = {"Data": {"FrontResult": {}}}
  13. self.res = self.out["Data"]["FrontResult"]
  14. self.res["Name"] = ""
  15. self.res["IDNumber"] = ""
  16. self.res["Address"] = ""
  17. self.res["Gender"] = ""
  18. self.res["Nationality"] = ""
  19. self.res["year"] = ""
  20. # self.res["Isauthority"]=""
  21. # self.res["Effdata"]=""
  22. # def IS_author(self):
  23. # """
  24. # 签发机关
  25. # """
  26. def year(self):
  27. addString = []
  28. for i in range(len(self.result)):
  29. txt = self.result[i]
  30. if "出生" in txt:
  31. txt = txt.replace("出生", "")
  32. addString.append(txt)
  33. self.res["year"] = "".join(addString)
  34. # break
  35. # print(',,,,')
  36. # print(self.result)
  37. # txt = txt.replace("出生", "")
  38. # addString.append(txt)
  39. # print(txt)
  40. # self.res["year"] = "".join(addString)
  41. def birth_no(self):
  42. """
  43. 身份证号码
  44. """
  45. for i in range(len(self.result)):
  46. txt = self.result[i]
  47. # 身份证号码
  48. if "X" in txt or "x" in txt:
  49. res = re.findall("\d*[X|x]", txt)
  50. else:
  51. res = re.findall("\d{16,18}", txt)
  52. if len(res) > 0:
  53. if len(res[0]) == 18:
  54. self.res["IDNumber"] = res[0].replace("号码", "")
  55. self.res["Gender"] = "男" if int(res[0][16]) % 2 else "女"
  56. break
  57. def full_name(self):
  58. """
  59. 身份证姓名
  60. """
  61. # print(self)
  62. for i in range(len(self.result)):
  63. txt = self.result[i]
  64. if ("姓名" or "名" in txt) and len(txt) > 2:
  65. res = re.findall("名[\u4e00-\u9fa5]{1,4}", txt)
  66. if len(res) > 0:
  67. self.res["Name"] = res[0].split("名")[-1]
  68. self.result[i] = "temp" # 避免身份证姓名对地址造成干扰
  69. break
  70. def sex(self):
  71. """
  72. 性别女民族汉
  73. """
  74. for i in range(len(self.result)):
  75. txt = self.result[i]
  76. if "男" in txt:
  77. self.res["Gender"] = "男"
  78. elif "女" in txt:
  79. self.res["Gender"] = "女"
  80. def national(self):
  81. # 性别女民族汉
  82. for i in range(len(self.result)):
  83. txt = self.result[i]
  84. res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
  85. if len(res) > 0:
  86. self.res["Nationality"] = res[0].split("族")[-1]
  87. break
  88. def address(self):
  89. """
  90. 身份证地址
  91. """
  92. addString = []
  93. for i in range(len(self.result)):
  94. txt = self.result[i]
  95. txt = txt.replace("号码", "")
  96. if "公民" in txt:
  97. txt = "temp"
  98. # 身份证地址
  99. if (
  100. "住址" in txt
  101. or "址" in txt
  102. or "省" in txt
  103. or "市" in txt
  104. or "县" in txt
  105. or "街" in txt
  106. or "乡" in txt
  107. or "村" in txt
  108. or "镇" in txt
  109. or "区" in txt
  110. or "城" in txt
  111. or "组" in txt
  112. or "号" in txt
  113. ):
  114. if "住址" in txt or "省" in txt or "址" in txt:
  115. addString.insert(0, txt.split("址")[-1])
  116. else:
  117. addString.append(txt)
  118. self.result[i] = "temp"
  119. # print(addString)
  120. if len(addString) > 0:
  121. self.res["Address"] = "".join(addString)
  122. else:
  123. self.res["Address"] = ""
  124. def predict_name(self):
  125. """
  126. 如果PaddleOCR返回的不是姓名xx连着的,则需要去猜测这个姓名,此处需要改进
  127. """
  128. for i in range(len(self.result)):
  129. txt = self.result[i]
  130. if self.res["Name"] == "":
  131. if len(txt) > 1 and len(txt) < 5:
  132. if (
  133. "性别" not in txt
  134. and "姓名" not in txt
  135. and "民族" not in txt
  136. and "住址" not in txt
  137. and "出生" not in txt
  138. and "号码" not in txt
  139. and "身份" not in txt
  140. ):
  141. result = re.findall("[\u4e00-\u9fa5]{2,4}", txt)
  142. if len(result) > 0:
  143. self.res["Name"] = result[0]
  144. break
  145. def run(self):
  146. print(self)
  147. self.full_name()
  148. self.national()
  149. self.birth_no()
  150. self.address()
  151. self.predict_name()
  152. self.year()
  153. print(self.out)
  154. return self.out