parser.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. import re
  2. from collections import defaultdict
  3. from dataclasses import dataclass
  4. from typing import List
  5. import cpca
  6. import numpy as np
  7. from zhon.hanzi import punctuation
  8. from core.line_parser import OcrResult
  @dataclass
  class RecItem:
      """One recognised field: its text plus the confidence attached to it.

      Attributes:
          text: Recognised field value; empty until a parser fills it in.
          confidence: Confidence stored alongside ``text``.
      """

      text: str = ''
      confidence: float = 0.

      def to_dict(self):
          """Return the field as a plain dictionary.

          Returns:
              ``{"text": ..., "confidence": ...}`` where the text has its
              surrounding whitespace stripped and the confidence is a builtin
              ``float`` with NaN replaced by ``0.0``.
          """
          # np.nan_to_num returns a NumPy scalar (an np.float32 input stays
          # np.float32), which json.dumps cannot serialise; convert it to a
          # builtin float so the dictionary is safe to serialise.
          confidence = float(np.nan_to_num(self.confidence))
          return {"text": self.text.strip(), "confidence": confidence}
  class Parser(object):
      """Base class that normalises OCR rows and holds the result fields.

      After construction:

      * ``self.keys`` lists the output field names.
      * ``self.res`` maps every key to a default ``RecItem``.
      * ``self.result`` holds one list per input row: the row's cells followed
        by a summary entry ``[joined_text, mean_confidence]``. Subclasses read
        that summary through ``row[-1][0]`` and ``row[-1][1]``.
      """

      # Keeps CJK ideographs (U+4E00-U+9FA5), ASCII digits and "+"; every other
      # character is removed from the cell text.
      # NOTE(review): this also removes Latin letters and dot-like characters,
      # so later checks for "TEM4"/"TEM8", a trailing "X" in an id, or a name
      # separator cannot match the filtered text - confirm this is intended.
      _KEEP_PATTERN = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')

      def __init__(self, ocr_results: List[List[OcrResult]]):
          """Filter the cell texts and build the per-row summaries.

          Args:
              ocr_results: Rows of OCR cells. Each cell must expose ``txt`` and
                  ``conf``. The ``txt`` of every cell is rewritten in place
                  with the filtered text; the row lists themselves are copied
                  and left unchanged.
          """
          self.keys = ["name", "id", "language", "level", "exam_time", "score"]
          self.res = defaultdict(RecItem)
          for key in self.keys:
              self.res[key] = RecItem()

          rows = []
          for cells in ocr_results:
              # Work on a copy of the row: appending the summary to the
              # caller's list would break a second parser built from the same
              # ocr_results (it would treat the summary as a cell).
              cells = list(cells)
              texts = []
              conf_sum = 0.
              for cell in cells:
                  cell.txt = ''.join(self._KEEP_PATTERN.findall(cell.txt))
                  texts.append(cell.txt)
                  conf_sum += cell.conf
              # An empty row has no confidence to average; use 0 rather than
              # dividing by zero.
              mean_conf = conf_sum / len(cells) if cells else 0.
              cells.append([''.join(texts), mean_conf])
              rows.append(cells)

          # Drop the first row mentioning "口试" and everything after it.
          for index, row in enumerate(rows):
              if "口试" in row[-1][0]:
                  rows = rows[:index]
                  break
          self.result = rows

      def parse(self):
          """Return the field mapping ``self.res`` unchanged."""
          return self.res
  class CETParser(Parser):
      """Field extractor for CET certificates (``level`` yields CET4 or CET6).

      Every method scans ``self.result``; the last element of each row is the
      ``[joined_text, mean_confidence]`` summary prepared by ``Parser``.
      """

      # Dot-like characters accepted as the separator inside a long name.
      # The last entry needs the 8-digit \U escape: "\u10101" would be U+1010
      # followed by the digit "1" instead of the single character U+10101.
      _NAME_SEPARATORS = (
          "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
          "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
          "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
      )

      # A run of 10-18 digits optionally followed by check characters.
      _ID_PATTERN = re.compile(r"\d{10,18}[Xx×]*")

      def __init__(self, ocr_results: List[List[OcrResult]]):
          """Delegate all preprocessing to ``Parser.__init__``."""
          Parser.__init__(self, ocr_results)

      def name(self):
          """Fill ``res["name"]`` with the candidate's name.

          The name is the text after the last "名" in the first row whose
          first three characters contain "名". Names shorter than five
          characters are stored as they are. Longer names are stored only when
          they contain one of the separator characters, as
          ``first_part·second_part``.
          """
          name_val = ''
          conf = 0.
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "名" in txt[:3]:
                  name_val = txt.split("名")[-1]
                  break

          if len(name_val) < 5:
              self.res["name"] = RecItem(name_val, conf)
              return

          for separator in self._NAME_SEPARATORS:
              if separator in name_val:
                  parts = name_val.split(separator)
                  self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)
                  return
          # NOTE(review): a name of five or more characters without any
          # separator leaves res["name"] at its default - confirm intended.

      def id(self):
          """Fill ``res["id"]`` with the ID card number.

          Takes the first 18-character match of the id pattern found in any
          row and normalises a lowercase "x" or "×" to "X".
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              for candidate in self._ID_PATTERN.findall(txt):
                  if len(candidate) == 18:
                      value = candidate.replace('x', "X").replace('×', "X")
                      self.res['id'] = RecItem(value, conf)
                      return

      def language(self):
          """Fill ``res["language"]`` with the fixed value "英语"."""
          self.res['language'] = RecItem("英语", 1.)

      def level(self):
          """Fill ``res["level"]`` with "CET4" or "CET6".

          Raises:
              Exception: If no row contains "四级" or "六级".
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "四级" in txt:
                  self.res['level'] = RecItem("CET4", conf)
                  return
              if "六级" in txt:
                  self.res['level'] = RecItem("CET6", conf)
                  return
          raise Exception("四六级无法识别")

      def exam_time(self):
          """Fill ``res["exam_time"]`` with the exam date.

          Uses the text after the last "时间" in the first row that contains
          "时间" and yields a date; rows without two digit runs are skipped.
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "时间" not in txt:
                  continue
              try:
                  value = self.to_data(txt.split("时间")[-1])
              except IndexError:
                  # The row has the label but no usable date; keep looking
                  # rather than aborting the whole parse.
                  continue
              self.res["exam_time"] = RecItem(value, conf)
              return

      def score(self):
          """Fill ``res["score"]`` with the total score.

          Takes the first three characters after the last "月" in the first
          row that contains "时间".
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "时间" in txt:
                  self.res["score"] = RecItem(txt.split("月")[-1][:3], conf)
                  return

      def to_data(self, txt):
          """Format the first two digit runs of ``txt`` as ``<year>年<month>月``.

          Only the last four digits of the first run are used for the year.

          Raises:
              IndexError: If ``txt`` contains fewer than two digit runs.
          """
          date_in = re.findall(r"\d+", txt)
          return f'{date_in[0][-4:]}年{date_in[1]}月'

      def parse(self):
          """Run every field extractor and return the serialised fields.

          Returns:
              A dict mapping each key in ``self.keys`` to ``RecItem.to_dict()``.

          Raises:
              Exception: Propagated from ``level`` when the level is not found.
          """
          self.name()
          self.id()
          self.language()
          self.level()
          self.exam_time()
          self.score()
          return {key: self.res[key].to_dict() for key in self.keys}
  class TEMParser(Parser):
      """Field extractor for TEM certificates (``level`` yields TEM4 or TEM8).

      Every method scans ``self.result``; the last element of each row is the
      ``[joined_text, mean_confidence]`` summary prepared by ``Parser``.
      """

      # Dot-like characters accepted as the separator inside a long name.
      # The last entry needs the 8-digit \U escape: "\u10101" would be U+1010
      # followed by the digit "1" instead of the single character U+10101.
      _NAME_SEPARATORS = (
          "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
          "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
          "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
      )

      def __init__(self, ocr_results: List[List[OcrResult]]):
          """Delegate all preprocessing to ``Parser.__init__``."""
          Parser.__init__(self, ocr_results)

      def name(self):
          """Fill ``res["name"]`` with the candidate's name.

          The name is the cell preceding the first cell of a row that contains
          "同学"; when that cell is the first one in the row, the text before
          "同学" inside it is used instead. If several rows match, the last
          one wins. Names shorter than five characters are stored as they are.
          Longer names are stored only when they contain one of the separator
          characters, as ``first_part·second_part``.
          """
          name_val = ''
          conf = 0.
          for row in self.result:
              # row[:-1] skips the summary entry appended by Parser.
              for idx, cell in enumerate(row[:-1]):
                  if '同学' not in cell.txt:
                      continue
                  if idx > 0:
                      previous = row[idx - 1]
                      name_val, conf = previous.txt, previous.conf
                  else:
                      # There is no preceding cell; row[idx - 1] would be the
                      # summary list, which has no ``txt`` attribute.
                      name_val, conf = cell.txt.split('同学')[0], cell.conf
                  break

          if len(name_val) < 5:
              self.res["name"] = RecItem(name_val, conf)
              return

          for separator in self._NAME_SEPARATORS:
              if separator in name_val:
                  parts = name_val.split(separator)
                  self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)
                  return
          # NOTE(review): a name of five or more characters without any
          # separator leaves res["name"] at its default - confirm intended.

      def id(self):
          """Fill ``res["id"]`` with an empty value and confidence 1."""
          self.res['id'] = RecItem("", 1.)

      def language(self):
          """Fill ``res["language"]`` with the fixed value "英语"."""
          self.res['language'] = RecItem("英语", 1.)

      def level(self):
          """Fill ``res["level"]`` with "TEM4" or "TEM8".

          Raises:
              Exception: If no row matches either level.
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "TEM4" in txt or "基础" in txt:
                  self.res['level'] = RecItem("TEM4", conf)
                  return
              if "TEM8" in txt or "高年级" in txt:
                  self.res['level'] = RecItem("TEM8", conf)
                  return
          raise Exception("专四专八无法识别")

      def exam_time(self):
          """Fill ``res["exam_time"]`` with the exam date.

          For each row, the date is looked for after the last "于", or, when
          the row has no "于", before the first "教育部全国". The first row
          that yields a date is used; rows without two digit runs are skipped.
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "于" in txt:
                  fragment = txt.split("于")[-1]
              elif '教育部全国' in txt:
                  fragment = txt.split("教育部全国")[0]
              else:
                  continue
              try:
                  value = self.to_data(fragment)
              except IndexError:
                  # The row has a marker but no usable date; keep looking
                  # rather than aborting the whole parse.
                  continue
              self.res["exam_time"] = RecItem(value, conf)
              return

      def to_data(self, txt):
          """Format the first two digit runs of ``txt`` as ``<year>年<month>月``.

          Only the last four digits of the first run are used for the year.

          Raises:
              IndexError: If ``txt`` contains fewer than two digit runs.
          """
          date_in = re.findall(r"\d+", txt)
          return f'{date_in[0][-4:]}年{date_in[1]}月'

      def score(self):
          """Fill ``res["score"]`` with the grade.

          Looks at the two characters after the last "成绩" in each row that
          contains it and maps them to "合格", "良好" or "优秀". A single
          matching character is enough.
          """
          for row in self.result:
              txt, conf = row[-1][0], row[-1][1]
              if "成绩" not in txt:
                  continue
              grade = txt.split("成绩")[-1][:2]
              if '合' in grade or '格' in grade:
                  self.res["score"] = RecItem('合格', conf)
                  return
              if '良' in grade or '好' in grade:
                  self.res["score"] = RecItem('良好', conf)
                  return
              if '优' in grade or '秀' in grade:
                  self.res["score"] = RecItem('优秀', conf)
                  return

      def parse(self):
          """Run every field extractor and return the serialised fields.

          Returns:
              A dict mapping each key in ``self.keys`` to ``RecItem.to_dict()``.

          Raises:
              Exception: Propagated from ``level`` when the level is not found.
          """
          self.name()
          self.id()
          self.language()
          self.level()
          self.exam_time()
          self.score()
          return {key: self.res[key].to_dict() for key in self.keys}