123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326 |
- import re
- from collections import defaultdict
- from dataclasses import dataclass
- from typing import List
- import cpca
- import numpy as np
- from zhon.hanzi import punctuation
- from core.line_parser import OcrResult
@dataclass
class RecItem:
    """A recognised field value paired with a confidence score."""

    # Recognised text of the field; empty until a parser fills it in.
    text: str = ''
    # Confidence stored alongside the text by whichever parser produced it.
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a ``{"text": ..., "confidence": ...}`` dict.

        The text is stripped of surrounding whitespace and the confidence is
        passed through ``np.nan_to_num``, so a NaN confidence comes out as 0.
        """
        stripped_text = self.text.strip()
        finite_confidence = np.nan_to_num(self.confidence)
        return dict(text=stripped_text, confidence=finite_confidence)
class Parser(object):
    """Base class that normalises OCR rows and holds the extracted fields.

    ``ocr_results`` is a list of rows, each row being a list of cells that
    expose ``txt`` and ``conf`` attributes.  Construction modifies the input
    IN PLACE:

    * every cell's ``txt`` is reduced to the characters matched by the filter
      pattern below (CJK ideographs, ASCII digits and ``+``);
    * a summary ``[joined_text, mean_confidence]`` list is appended to every
      row, so ``row[-1][0]`` / ``row[-1][1]`` give the row text / confidence.

    Rows after the first one whose joined text contains "口试" are dropped
    from ``self.result``.

    Attributes:
        result: the (mutated, possibly truncated) OCR rows.
        res: mapping from field name to ``RecItem``; pre-filled with empty
            items for every name in ``keys``.
        keys: the field names reported by ``parse``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        self.result = ocr_results
        self.res = defaultdict(RecItem)
        self.keys = ["name", "id", "language", "level", "exam_time", "score"]
        for key in self.keys:
            self.res[key] = RecItem()

        # Everything outside this character class is discarded from the text.
        keep_chars = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')
        for item in self.result:
            joined_text = ''
            conf_total = 0.
            for cell in item:
                cell.txt = ''.join(keep_chars.findall(cell.txt))
                joined_text += cell.txt
                conf_total += cell.conf
            # An empty row has no cells to average; report 0 instead of
            # dividing by zero.
            mean_conf = conf_total / len(item) if item else 0.
            item.append([joined_text, mean_conf])

        # Keep rows up to and including the first one mentioning "口试".
        for idx, row in enumerate(self.result):
            if "口试" in row[-1][0]:
                self.result = self.result[:idx + 1]
                break

    def parse(self):
        """Return the field mapping; subclasses fill it in before returning."""
        return self.res
class CETParser(Parser):
    """Extracts name, id, level, exam time and score from CET-style OCR rows.

    All methods read ``self.result`` (rows whose last element is the
    ``[joined_text, mean_confidence]`` summary built by ``Parser``) and write
    their findings into ``self.res``.
    """

    # Dot-like characters accepted as the separator inside a long name.
    # The last entry is U+10101; it needs the 8-digit ``\U`` escape because
    # ``"\u10101"`` would be U+1010 followed by the digit "1".
    _NAME_SEPARATORS = (
        "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
        "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
        "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
    )

    def __init__(self, ocr_results: List[List[OcrResult]]):
        Parser.__init__(self, ocr_results)

    def _store_name(self, name_val, conf):
        """Store ``name_val`` as the name field.

        Values shorter than 5 characters are stored as-is.  Longer values are
        stored only when they contain one of ``_NAME_SEPARATORS``; the first
        two parts are then joined with U+00B7.  A long value without any
        separator leaves the name field untouched.
        """
        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return
        # NOTE(review): Parser.__init__ filters cell text down to CJK
        # ideographs, digits and "+", which removes every separator listed
        # above, so this branch cannot match on text that went through that
        # filter.  Confirm whether long names are meant to be dropped.
        for sep in self._NAME_SEPARATORS:
            if sep in name_val:
                parts = name_val.split(sep)
                self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)
                return

    def name(self):
        """
        Name.

        Takes the text after the last "名" from the first row that has "名"
        among its first three characters.  When no row qualifies, the name is
        empty and the confidence is that of the last row scanned.
        """
        name_val = ''
        conf = 0.
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "名" in txt[:3]:
                name_val = txt.split("名")[-1]
                break
        self._store_name(name_val, conf)

    def id(self):
        """
        ID card number.

        Uses the first row containing a run of 17-19 digits (optionally
        followed by X / x / ×):

        * 19 characters whose first two are equal: the first one is dropped;
        * 18 characters: stored with a trailing x / × normalised to "X";
        * 17 characters: "X" is appended.

        A row whose match fits none of these cases is skipped.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            matches = re.findall(r"\d{17,19}[Xx×]*", txt)
            if not matches:
                continue
            candidate = matches[0]
            if len(candidate) == 19 and candidate[0] == candidate[1]:
                self.res['id'] = RecItem(candidate[1:], conf)
                break
            if len(candidate) == 18:
                normalised = candidate.replace('x', "X").replace('×', "X")
                self.res['id'] = RecItem(normalised, conf)
                break
            if len(candidate) == 17:
                self.res['id'] = RecItem(f'{candidate}X', conf)
                break

    def language(self):
        """
        Language.  Always "英语" with confidence 1.
        """
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """
        Level.

        Stores "CET4" / "CET6" for the first row containing "四级" / "六级".

        Raises:
            Exception: when no row mentions either level.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "四级" in txt:
                self.res['level'] = RecItem("CET4", conf)
                return
            if "六级" in txt:
                self.res['level'] = RecItem("CET6", conf)
                return
        raise Exception("四六级无法识别")

    def exam_time(self):
        """
        Exam time.

        Formats the digits following "时间" in the first row where that yields
        a date.  Rows containing "时间" but no digits are skipped.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "时间" not in txt:
                continue
            date = self.to_data(txt.split("时间")[-1])
            if not date:
                continue
            self.res["exam_time"] = RecItem(date, conf)
            return

    def score(self):
        """
        Total score.

        First pass: a row containing "总分" that is either exactly 5 characters
        long or also contains "具备" / "资格".  Its first digit run is the
        score; a 4-digit run whose first two digits are equal loses its first
        digit.

        Second pass (only when the first found nothing): the first row
        containing "时间".  The score is the 3 characters after the last "月",
        or, when there is no "月", the text of the row's second cell.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if '总分' in txt and (len(txt) == 5 or '具备' in txt or '资格' in txt):
                digit_runs = re.findall(r'\d+', txt)
                if not digit_runs:
                    # Label without any digits: nothing to report from this row.
                    continue
                value = digit_runs[0]
                if len(value) == 4 and value[0] == value[1]:
                    value = value[1:]
                self.res["score"] = RecItem(value, conf)
                return

        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "时间" in txt:
                if '月' in txt:
                    self.res["score"] = RecItem(txt.split("月")[-1][:3], conf)
                elif len(row) > 2:
                    # row[-1] is the summary list, so a second real cell
                    # exists only when the row has more than two elements.
                    self.res["score"] = RecItem(row[1].txt, conf)
                return

    def to_data(self, txt):
        """Format the digit runs in ``txt`` as ``"<year>年<month>月"``.

        The year is the last 4 digits of the first run and the month is the
        second run, defaulting to "6" when only one run is present.  Returns
        an empty string when ``txt`` contains no digits.
        """
        date_in = re.findall(r"\d+", txt)
        if not date_in:
            return ''
        if len(date_in) == 1:
            date_in.append('6')
        return f'{date_in[0][-4:]}年{date_in[1]}月'

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}
class TEMParser(Parser):
    """Extracts name, level, exam time and grade from TEM-style OCR rows.

    All methods read ``self.result`` (rows whose last element is the
    ``[joined_text, mean_confidence]`` summary built by ``Parser``) and write
    their findings into ``self.res``.
    """

    # Dot-like characters accepted as the separator inside a long name.
    # The last entry is U+10101; it needs the 8-digit ``\U`` escape because
    # ``"\u10101"`` would be U+1010 followed by the digit "1".
    _NAME_SEPARATORS = (
        "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
        "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
        "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
    )

    # Single-character numerals accepted in a year written with Chinese digits.
    _HAN_DIGITS = {
        '一': '1', '二': '2', '三': '3', '四': '4', '五': '5',
        '六': '6', '七': '7', '八': '8', '九': '9',
        '零': '0', '〇': '0', '0': '0', 'O': '0', 'o': '0',
    }
    # Month spellings: the single digits plus the three two-digit months.
    _HAN_MONTHS = {**_HAN_DIGITS, '十': '10', '十一': '11', '十二': '12'}

    def __init__(self, ocr_results: List[List[OcrResult]]):
        Parser.__init__(self, ocr_results)

    def _store_name(self, name_val, conf):
        """Store ``name_val`` as the name field.

        Values shorter than 5 characters are stored as-is.  Longer values are
        stored only when they contain one of ``_NAME_SEPARATORS``; the first
        two parts are then joined with U+00B7.  A long value without any
        separator leaves the name field untouched.
        """
        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return
        # NOTE(review): Parser.__init__ filters cell text down to CJK
        # ideographs, digits and "+", which removes every separator listed
        # above, so this branch cannot match on text that went through that
        # filter.  Confirm whether long names are meant to be dropped.
        for sep in self._NAME_SEPARATORS:
            if sep in name_val:
                parts = name_val.split(sep)
                self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)
                return

    def name(self):
        """
        Name.

        Primary rule: find a cell containing "同学" and take the text of the
        cell just before it (the last cell of the previous row when the marker
        is the first cell of its row).  Only the first marker cell of each row
        is considered, and a marker in a later row overrides an earlier one.
        The confidence is that of the cell the name was taken from.

        Fallback (no usable "同学" marker): in the first row containing "于",
        take the text before the last "于" and keep what follows the last
        "学生".
        """
        name_val = ''
        conf = 0.
        is_name = False
        for row_idx, row in enumerate(self.result):
            for idx, cell in enumerate(row[:-1]):
                if '同学' not in cell.txt:
                    continue
                if idx > 0:
                    source = row[idx - 1]
                elif row_idx > 0 and len(self.result[row_idx - 1]) > 1:
                    # [-1] is the summary list, so [-2] is the last real cell.
                    source = self.result[row_idx - 1][-2]
                else:
                    # No preceding cell exists; indexing row_idx - 1 here would
                    # silently wrap around to the last row.
                    source = None
                if source is not None:
                    is_name = True
                    name_val = source.txt
                    conf = source.conf
                break

        if not is_name:
            for row in self.result:
                txt = row[-1][0]
                conf = row[-1][1]
                if '于' in txt:
                    before_marker = ''.join(txt.split('于')[:-1])
                    name_val = before_marker.split('学生')[-1]
                    break

        self._store_name(name_val, conf)

    def id(self):
        """ID number.  Always stored as an empty string with confidence 1."""
        self.res['id'] = RecItem("", 1.)

    def language(self):
        """Language.  Always "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """
        Level.

        Stores "TEM4" for the first row containing "TEM4", "基础" or "四级",
        or "TEM8" for "TEM8", "高年级" or "八级"; within a row the TEM4
        keywords are checked first.

        Raises:
            Exception: when no row contains any of the keywords.
        """
        # NOTE(review): Parser.__init__ strips Latin letters from the row
        # text, so the "TEM4" / "TEM8" substrings cannot match on filtered
        # rows; only the Chinese keywords are effective there.
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "TEM4" in txt or "基础" in txt or '四级' in txt:
                self.res['level'] = RecItem("TEM4", conf)
                return
            if "TEM8" in txt or "高年级" in txt or '八级' in txt:
                self.res['level'] = RecItem("TEM8", conf)
                return
        raise Exception("专四专八无法识别")

    def exam_time(self):
        """
        Exam time.

        Rows are scanned in order and the first usable one wins:

        * a row containing "级学生": the text between the last "于" and the
          first "参" is converted with ``han_to_date``;
        * a row containing "于": the digits after the last "于" are formatted
          with ``to_data``;
        * a row containing "教育部全国": the digits before it are formatted
          with ``to_data``.

        Rows of the last two kinds that contain no digits are skipped.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if '级学生' in txt:
                han_text = txt.split('于')[-1].split('参')[0]
                self.res["exam_time"] = RecItem(self.han_to_date(han_text), conf)
                return
            if "于" in txt:
                date = self.to_data(txt.split("于")[-1])
            elif '教育部全国' in txt:
                date = self.to_data(txt.split("教育部全国")[0])
            else:
                continue
            if date:
                self.res["exam_time"] = RecItem(date, conf)
                return

    def to_data(self, txt):
        """Format the digit runs in ``txt`` as ``"<year>年<month>月"``.

        The year is the last 4 digits of the first run and the month is the
        second run, defaulting to "6" when only one run is present.  Returns
        an empty string when ``txt`` contains no digits.
        """
        date_in = re.findall(r"\d+", txt)
        if not date_in:
            return ''
        if len(date_in) == 1:
            date_in.append('6')
        return f'{date_in[0][-4:]}年{date_in[1]}月'

    def han_to_date(self, date):
        """Convert a date written with Chinese numerals to ``"<y>年<m>月"``.

        Only the part after the last "于" and before the first "月" is used.
        A 3-character year gets a "0" inserted after its first character.
        The month defaults to "6" when nothing follows "年" or when "年" is
        missing.  Characters or month spellings without a known mapping are
        kept unchanged rather than raising or producing "None".
        """
        date = date.split('于')[-1].split('月')[0]
        data_y, has_year, _ = date.partition('年')
        if len(data_y) == 3:
            # Re-insert a zero as the second digit of a 3-character year.
            year_chars = list(data_y)
            year_chars.insert(1, '0')
            data_y = ''.join(year_chars)
        date_m = (date.split('年')[-1] if has_year else '') or '6'

        data_y = ''.join(self._HAN_DIGITS.get(ch, ch) for ch in data_y)
        date_m = self._HAN_MONTHS.get(date_m, date_m)
        return f"{data_y}年{date_m}月"

    def score(self):
        """
        Grade.

        Looks at the two characters following the last "成绩" in each row and
        stores "合格", "良好" or "优秀" when either character of that word is
        present (checked in that order).  Rows without a recognisable grade
        are skipped.
        """
        grades = (
            ('合格', ('合', '格')),
            ('良好', ('良', '好')),
            ('优秀', ('优', '秀')),
        )
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "成绩" not in txt:
                continue
            snippet = txt.split("成绩")[-1][:2]
            for grade, markers in grades:
                if any(marker in snippet for marker in markers):
                    self.res["score"] = RecItem(grade, conf)
                    return

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}
|