parser.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. import re
  2. from collections import defaultdict
  3. from dataclasses import dataclass
  4. from typing import List
  5. import cpca
  6. import numpy as np
  7. from zhon.hanzi import punctuation
  8. from core.line_parser import OcrResult
  9. @dataclass
  10. class RecItem:
  11. text: str = ''
  12. confidence: float = 0.
  13. def to_dict(self):
  14. return {"text": self.text.strip(), "confidence": np.nan_to_num(self.confidence)}
  15. class Parser(object):
  16. def __init__(self, ocr_results: List[List[OcrResult]]):
  17. self.result = ocr_results
  18. self.res = defaultdict(RecItem)
  19. self.keys = ["name", "id", "language", "level", "exam_time", "score"]
  20. for key in self.keys:
  21. self.res[key] = RecItem()
  22. ch = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')
  23. for item in self.result:
  24. tail = ['', 1.]
  25. for k in range(len(item)):
  26. item[k].txt = ''.join(re.findall(ch, item[k].txt))
  27. tail[0] = tail[0] + item[k].txt
  28. tail[1] = tail[1] + item[k].conf
  29. tail[1] = (tail[1] - 1.) / len(item)
  30. item.append(tail)
  31. for i in range(len(self.result)):
  32. res = self.result[i]
  33. txt = res[-1][0]
  34. if "口试" in txt:
  35. self.result = self.result[:i + 1]
  36. break
  37. def parse(self):
  38. return self.res
  39. class CETParser(Parser):
  40. def __init__(self, ocr_results: List[List[OcrResult]]):
  41. Parser.__init__(self, ocr_results)
  42. def name(self):
  43. """
  44. 姓名
  45. """
  46. name_val = ''
  47. conf = 0.
  48. is_name = False
  49. for i in range(len(self.result)):
  50. res = self.result[i]
  51. txt = res[-1][0]
  52. conf = res[-1][1]
  53. for s in range(len(txt)):
  54. if txt[s] == "名" and s < 3 and "名" in txt:
  55. is_name = True
  56. if is_name:
  57. name_val = txt.split("名")[-1]
  58. break
  59. if len(name_val) < 5:
  60. self.res["name"] = RecItem(name_val, conf)
  61. else:
  62. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  63. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  64. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  65. for item in point_unicode:
  66. point = re.findall(item, name_val)
  67. if len(point) != 0:
  68. name_list = name_val.split(point[0])
  69. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  70. return
  71. def id(self):
  72. """
  73. 身份证号码
  74. """
  75. for i in range(len(self.result)):
  76. res = self.result[i]
  77. txt = res[-1][0]
  78. conf = res[-1][1]
  79. id_num = re.findall("\d{17,19}[X|x|×]*", txt)
  80. if id_num and len(id_num[0]) == 19 and id_num[0][0] == id_num[0][1]:
  81. self.res['id'] = RecItem(id_num[0][1:], conf)
  82. break
  83. if id_num and len(id_num[0]) == 18:
  84. self.res['id'] = RecItem(id_num[0].replace('x', "X").replace('×', "X"), conf)
  85. break
  86. if id_num and len(id_num[0]) == 17:
  87. self.res['id'] = RecItem(f'{id_num[0]}X', conf)
  88. break
  89. def language(self):
  90. """
  91. 语言
  92. """
  93. self.res['language'] = RecItem("英语", 1.)
  94. def level(self):
  95. """
  96. 等级
  97. """
  98. for i in range(len(self.result)):
  99. res = self.result[i]
  100. txt = res[-1][0]
  101. conf = res[-1][1]
  102. if "四级" in txt:
  103. self.res['level'] = RecItem("CET4", conf)
  104. return
  105. elif "六级" in txt:
  106. self.res['level'] = RecItem("CET6", conf)
  107. return
  108. raise Exception("四六级无法识别")
  109. def exam_time(self):
  110. """
  111. 考试时间
  112. """
  113. for i in range(len(self.result)):
  114. res = self.result[i]
  115. txt = res[-1][0]
  116. conf = res[-1][1]
  117. if "时间" in txt:
  118. txt = txt.split("时间")[-1]
  119. self.res["exam_time"] = RecItem(self.to_data(txt), conf)
  120. return
  121. def score(self):
  122. """
  123. 总分
  124. """
  125. for i in range(len(self.result)):
  126. res = self.result[i]
  127. txt = res[-1][0]
  128. conf = res[-1][1]
  129. if '总分' in txt and (len(txt) == 5 or '具备' in txt or '资格' in txt):
  130. score = re.findall(r'\d+', txt)
  131. if len(score[0]) == 4 and score[0][0] == score[0][1]:
  132. self.res["score"] = RecItem(score[0][1:], conf)
  133. return
  134. self.res["score"] = RecItem(score[0], conf)
  135. return
  136. for i in range(len(self.result)):
  137. res = self.result[i]
  138. txt = res[-1][0]
  139. conf = res[-1][1]
  140. if "时间" in txt:
  141. if '月' in txt:
  142. txt = txt.split("月")[-1][:3]
  143. self.res["score"] = RecItem(txt, conf)
  144. else:
  145. self.res["score"] = RecItem(res[1].txt, conf)
  146. return
  147. def to_data(self, txt):
  148. date_in = re.findall(r"\d+", txt)
  149. if len(date_in) == 1: date_in.append('6')
  150. return f'{date_in[0][-4:]}年{date_in[1]}月'
  151. def parse(self):
  152. self.name()
  153. self.id()
  154. self.language()
  155. self.level()
  156. self.exam_time()
  157. self.score()
  158. return {key: self.res[key].to_dict() for key in self.keys}
  159. class TEMParser(Parser):
  160. def __init__(self, ocr_results: List[List[OcrResult]]):
  161. Parser.__init__(self, ocr_results)
  162. def name(self):
  163. """
  164. 姓名
  165. """
  166. name_val = ''
  167. conf = 0.
  168. is_name = False
  169. for row_idx, row in enumerate(self.result):
  170. for idx, r in enumerate(row[:-1]):
  171. if '同学' in r.txt:
  172. is_name = True
  173. name_val = self.result[row_idx - 1][-2].txt if idx == 0 else row[idx - 1].txt
  174. break
  175. if is_name is False:
  176. for i in range(len(self.result)):
  177. res = self.result[i]
  178. txt = res[-1][0]
  179. conf = res[-1][1]
  180. if '于' in txt:
  181. txt = ''.join(txt.split('于')[:-1])
  182. name_val = txt.split('学生')[-1]
  183. break
  184. if len(name_val) < 5:
  185. self.res["name"] = RecItem(name_val, conf)
  186. else:
  187. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  188. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  189. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  190. for item in point_unicode:
  191. point = re.findall(item, name_val)
  192. if len(point) != 0:
  193. name_list = name_val.split(point[0])
  194. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  195. return
  196. def id(self):
  197. self.res['id'] = RecItem("", 1.)
  198. def language(self):
  199. self.res['language'] = RecItem("英语", 1.)
  200. def level(self):
  201. """
  202. 等级
  203. """
  204. for i in range(len(self.result)):
  205. res = self.result[i]
  206. txt = res[-1][0]
  207. conf = res[-1][1]
  208. if "TEM4" in txt or "基础" in txt or '四级' in txt:
  209. self.res['level'] = RecItem("TEM4", conf)
  210. return
  211. elif "TEM8" in txt or "高年级" in txt or '八级' in txt:
  212. self.res['level'] = RecItem("TEM8", conf)
  213. return
  214. raise Exception("专四专八无法识别")
  215. def exam_time(self):
  216. """
  217. 考试时间
  218. """
  219. for i in range(len(self.result)):
  220. NewVersion = True
  221. res = self.result[i]
  222. txt = res[-1][0]
  223. conf = res[-1][1]
  224. if '级学生' in txt: NewVersion = False
  225. if not NewVersion:
  226. txt = txt.split('于')[-1].split('参')[0]
  227. self.res["exam_time"] = RecItem(self.han_to_date(txt), conf)
  228. return
  229. if "于" in txt:
  230. txt = txt.split("于")[-1]
  231. self.res["exam_time"] = RecItem(self.to_data(txt), conf)
  232. return
  233. if '教育部全国' in txt:
  234. txt = txt.split("教育部全国")[0]
  235. self.res["exam_time"] = RecItem(self.to_data(txt), conf)
  236. return
  237. def to_data(self, txt):
  238. date_in = re.findall(r"\d+", txt)
  239. if len(date_in) == 1: date_in.append('6')
  240. return f'{date_in[0][-4:]}年{date_in[1]}月'
  241. def han_to_date(self, date):
  242. numbers = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十一': '11',
  243. '十二': '12', '0': '0', 'O': '0', 'o': '0'}
  244. date = date.split('于')[-1].split('月')[0]
  245. data_y = date.split('年')[0]
  246. if len(data_y) == 3:
  247. y = list(data_y)
  248. y.insert(1, '0')
  249. data_y = ''.join(y)
  250. date_m = date.split('年')[-1] or '6'
  251. for wy in data_y:
  252. data_y = data_y.replace(wy, numbers.get(wy))
  253. date_m =numbers.get(date_m)
  254. return f"{data_y}年{date_m}月"
  255. def score(self):
  256. """
  257. 总分
  258. """
  259. for i in range(len(self.result)):
  260. res = self.result[i]
  261. txt = res[-1][0]
  262. conf = res[-1][1]
  263. if "成绩" in txt:
  264. txt = txt.split("成绩")[-1][:2]
  265. if '合' in txt or '格' in txt:
  266. self.res["score"] = RecItem('合格', conf)
  267. return
  268. if '良' in txt or '好' in txt:
  269. self.res["score"] = RecItem('良好', conf)
  270. return
  271. if '优' in txt or '秀' in txt:
  272. self.res["score"] = RecItem('优秀', conf)
  273. return
  274. def parse(self):
  275. self.name()
  276. self.id()
  277. self.language()
  278. self.level()
  279. self.exam_time()
  280. self.score()
  281. return {key: self.res[key].to_dict() for key in self.keys}