parser.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. import math
  2. import re
  3. import string
  4. from dataclasses import dataclass
  5. from collections import defaultdict
  6. import numpy as np
  7. from typing import List
  8. from core.line_parser import OcrResult
  9. @dataclass
  10. class RecItem:
  11. text: str = ''
  12. confidence: float = 0.
  13. def to_dict(self):
  14. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  15. # 父类
  16. class Parser(object):
  17. def __init__(self, ocr_results: List[List[OcrResult]]):
  18. self.result = ocr_results
  19. # assert len(self.result) == len(self.confs), 'result and confs do not match'
  20. self.res = defaultdict(RecItem)
  21. self.keys = ['name', 'gender', 'admission_time', 'education_time', 'education_level', 'education_type',
  22. 'learning_type', 'school', 'major', 'number']
  23. for key in self.keys:
  24. self.res[key] = RecItem()
  25. for i in range(len(self.result)):
  26. tail = ['', 0.]
  27. for j in range(len(self.result[i])):
  28. self.result[i][j].txt = self.result[i][j].txt.replace("|", ""). \
  29. replace(":", "").replace(":", "").replace(",", ""). \
  30. replace(",", "").replace("【", "").replace("】", ""). \
  31. replace("「", "").replace("[", "").replace("]", "").replace(" ", "")
  32. for k in range(len(self.result[i])):
  33. tail[0] = tail[0] + self.result[i][k].txt
  34. tail[1] = np.mean([tail[1], self.result[i][k].conf])
  35. self.result[i].append(tail)
  36. def parse(self):
  37. return self.res
  38. # All
  39. class AllParser(Parser):
  40. def __init__(self, ocr_results: List[List[OcrResult]]):
  41. Parser.__init__(self, ocr_results)
  42. # all
  43. class PostParser(Parser):
  44. """
  45. 教育部学籍在线验证报告
  46. 表格
  47. """
  48. def __init__(self, ocr_results: List[List[OcrResult]]):
  49. Parser.__init__(self, ocr_results)
  50. def full_name(self):
  51. """
  52. 姓名
  53. """
  54. for i in range(len(self.result)):
  55. res = self.result[i]
  56. txt = res[-1][0]
  57. conf = res[-1][1]
  58. if "姓名" in txt:
  59. name_val = txt.split("姓名")[-1].split("性别")[0].split("证件")[0]
  60. if len(name_val) < 5:
  61. self.res["name"] = RecItem(name_val, conf)
  62. return
  63. else:
  64. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  65. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  66. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101", "\u002d",
  67. "\u4e00"]
  68. for n in range(len(point_unicode)):
  69. point = re.findall(point_unicode[n], name_val)
  70. if len(point) != 0:
  71. name_list = name_val.split(point[0])
  72. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  73. return
  74. def gender(self):
  75. """
  76. 性别女
  77. """
  78. for i in range(len(self.result)):
  79. res = self.result[i]
  80. txt = res[-1][0]
  81. conf = res[-1][1]
  82. if '男' in txt:
  83. self.res["gender"] = RecItem("男", conf)
  84. return
  85. elif '女' in txt:
  86. self.res["gender"] = RecItem("女", conf)
  87. return
  88. def admission_time(self):
  89. """
  90. 入学 ⚠️日期⚠️ ⚠️时间⚠️
  91. """
  92. for i in range(len(self.result)):
  93. res = self.result[i]
  94. txt = res[-1][0]
  95. conf = res[-1][1]
  96. if "学日期" in txt:
  97. txt = txt.split("学日期")[-1]
  98. self.res["admission_time"] = RecItem(self.to_data(txt), conf)
  99. return
  100. elif "学时间" in txt:
  101. txt = txt.split("学时间")[-1]
  102. self.res["admission_time"] = RecItem(self.to_data(txt), conf)
  103. return
  104. elif "入学" in txt:
  105. txt = txt.split("期")[-1]
  106. self.res["admission_time"] = RecItem(self.to_data(txt), conf)
  107. return
  108. def education_time(self):
  109. """
  110. 毕业日期 ⚠️离校日期⚠️
  111. """
  112. for i in range(len(self.result)):
  113. res = self.result[i]
  114. txt = res[-1][0]
  115. conf = res[-1][1]
  116. if "业日期" in txt:
  117. txt = txt.split("业日期")[-1]
  118. self.res["education_time"] = RecItem(self.to_data(txt), conf)
  119. return
  120. elif "校日期" in txt:
  121. txt = txt.split("校日期")[-1]
  122. self.res["education_time"] = RecItem(self.to_data(txt), conf)
  123. return
  124. def education_level(self):
  125. """
  126. 学历层次 本科
  127. """
  128. for i in range(len(self.result)):
  129. res = self.result[i]
  130. for j in range(len(self.result[i]) - 1):
  131. txt = self.result[i][j].txt
  132. conf = self.result[i][j].conf
  133. # 0 res 2 mini_dis 0 传入字段 4 字段长度 5 传入字段
  134. if '层次' in txt and len(txt) < 4:
  135. self.res["education_level"] = RecItem(self.result[i][j + 1].txt, conf)
  136. return
  137. if "层次" in txt:
  138. txt = txt.split("层次")[-1]
  139. self.res["education_level"] = RecItem(txt, conf)
  140. return
  141. def education_type(self):
  142. """
  143. 学历类别 ⚠️类型⚠️ 普通高等教育
  144. """
  145. for i in range(len(self.result)):
  146. res = self.result[i]
  147. for j in range(len(self.result[i]) - 1):
  148. txt = self.result[i][j].txt
  149. conf = self.result[i][j].conf
  150. education_type = "类别" in txt or "类型" in txt
  151. if education_type and len(txt) < 6:
  152. self.res["education_type"] = RecItem(self.result[i][j + 1].txt, conf)
  153. return
  154. if "历类别" in txt:
  155. txt = txt.split("历类别")[-1]
  156. self.res["education_type"] = RecItem(txt, conf)
  157. return
  158. elif "类型" in txt:
  159. txt = txt.split("类型")[-1]
  160. self.res["education_type"] = RecItem(txt, conf)
  161. return
  162. def learning_type(self):
  163. """
  164. 学习形式 ⚠️形式⚠️ 普通全日制
  165. """
  166. for i in range(len(self.result)):
  167. res = self.result[i]
  168. for j in range(len(self.result[i]) - 1):
  169. txt = self.result[i][j].txt
  170. conf = self.result[i][j].conf
  171. if '形式' in txt and len(txt) < 6:
  172. self.res["learning_type"] = RecItem(self.result[i][j + 1].txt, conf)
  173. return
  174. if "习形式" in txt:
  175. txt = txt.split("习形式")[-1]
  176. self.res["learning_type"] = RecItem(txt, conf)
  177. return
  178. elif "形式" in txt:
  179. txt = txt.split("形式")[-1]
  180. self.res["learning_type"] = RecItem(txt, conf)
  181. return
  182. def school(self):
  183. """
  184. 学校名称 ⚠️院校⚠️
  185. """
  186. for i in range(len(self.result)):
  187. res = self.result[i]
  188. for j in range(len(self.result[i]) - 1):
  189. txt = self.result[i][j].txt
  190. conf = self.result[i][j].conf
  191. bool_school = '校名称' in txt or '院校' in txt
  192. if bool_school and len(txt) < 6:
  193. self.res["school"] = RecItem(self.result[i][j + 1].txt, conf)
  194. return
  195. # 学校名都带 `学`
  196. if '校名称' in txt and len(txt) < 6:
  197. for k in range(len(self.result[i]) - 1):
  198. if k == j: continue
  199. txt = self.result[i][k].txt
  200. conf = self.result[i][k].conf
  201. if "学" in txt:
  202. self.res["school"] = RecItem(txt, conf)
  203. return
  204. if "名称" in txt and j + 1 <= len(self.result[i]) - 1 and len(txt) < 6:
  205. if "学" in self.result[i][j + 1].txt:
  206. txt = self.result[i][j + 1].txt
  207. conf = self.result[i][j + 1].conf
  208. self.res["school"] = RecItem(txt, conf)
  209. return
  210. elif "学校名" in txt:
  211. txt = txt.split("名称")[-1]
  212. self.res["school"] = RecItem(txt, conf)
  213. return
  214. elif "院校" in txt:
  215. txt = txt.split("院校")[-1]
  216. self.res["school"] = RecItem(txt, conf)
  217. return
  218. def major(self):
  219. """
  220. 专业
  221. """
  222. for i in range(len(self.result)):
  223. res = self.result[i]
  224. for j in range(len(self.result[i]) - 1):
  225. txt = self.result[i][j].txt
  226. conf = self.result[i][j].conf
  227. mini_dis = [99999., 0]
  228. is_major = "专业" in txt
  229. if is_major and len(txt) < 4:
  230. for k in range(len(self.result[i]) - 1):
  231. if k == j: continue
  232. p = np.array(res[j].center) - np.array(res[k].center)
  233. min = math.hypot(p[0], p[1])
  234. if min < mini_dis[0]:
  235. mini_dis = [min, k]
  236. major_txt = self.broken(self.result[i][j + 1].txt, i, j)
  237. self.res["major"] = RecItem(major_txt, conf)
  238. return
  239. if is_major:
  240. txt = txt.split("专业")[-1]
  241. major_txt = self.broken(txt, i, j)
  242. self.res["major"] = RecItem(major_txt, conf)
  243. return
  244. def broken(self, txt, row, r):
  245. is_broken = '(' in txt and ')' not in txt or '(' in txt and ')' not in txt
  246. if not is_broken:
  247. return txt
  248. else:
  249. for i in range(row, len(self.result)):
  250. res = self.result[i]
  251. for j in range(r, len(res)-1):
  252. other_txt = res[j].txt
  253. if ')' in other_txt:
  254. return txt + other_txt.replace(')', ')').replace('(', "(")
  255. if ')' in other_txt:
  256. return txt + other_txt
  257. def to_data(self, txt):
  258. date_in = re.findall(r"\d+", txt)
  259. return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2][:2]}日'
  260. def number(self):
  261. """
  262. 证书编号
  263. """
  264. num_txt = ''
  265. num_conf = 0.
  266. for i in range(len(self.result)):
  267. for j in range(len(self.result[i]) - 1):
  268. txt = self.result[i][j].txt
  269. txt = txt.replace(' ', '')
  270. if '预计' in txt or '(预计' in txt or '(预计' in txt or '(毕业' in txt or '(毕业' in txt:
  271. self.res["number"] = RecItem('', 0.)
  272. return
  273. txt = re.findall("\d{16,18}", txt)
  274. conf = self.result[i][j].conf
  275. if len(txt) > 0:
  276. if len(txt[0]) == 18:
  277. num_txt = txt[0].replace("号码", "")
  278. num_conf = conf
  279. self.res["number"] = RecItem(num_txt, num_conf)
  280. return
  281. # 存入
  282. def parse(self):
  283. self.full_name()
  284. self.gender()
  285. self.admission_time()
  286. self.education_time()
  287. self.education_level()
  288. self.education_type()
  289. self.learning_type()
  290. self.school()
  291. self.number()
  292. self.major()
  293. return {key: self.res[key].to_dict() for key in self.keys}