"""Field extraction for OCR'd English-exam certificates (CET-4/6 and TEM-4/8).

Each parser receives OCR output grouped into rows of cells and fills a
mapping of field name -> ``RecItem`` (text plus confidence).
"""
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult

# Characters kept by Parser.__init__: CJK ideographs U+4E00-U+9FA5, a literal
# "+", and the ASCII digits.  Everything else is stripped from every cell.
_KEEP_CHARS = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')

# Dot-like characters that may separate the parts of a transliterated name.
# The last entry is U+10101; it needs the 8-digit \U escape (a 4-digit \u
# escape followed by "1" would be two characters).
_NAME_SEPARATORS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981", "\u00B7",
    "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027", "\u2E30", "\uFF0E",
    "\u30FB", "\uFF65", "\U00010101",
)

# 17-19 digits optionally followed by a check-character look-alike.
_ID_PATTERN = re.compile(r"\d{17,19}[Xx×]*")

# Single-character table used for the year part of a Chinese-numeral date.
_HAN_YEAR_DIGITS = {
    '一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6',
    '七': '7', '八': '8', '九': '9', '零': '0', '〇': '0',
    '0': '0', 'O': '0', 'o': '0',
}

# Whole-string table used for the month part of a Chinese-numeral date.
_HAN_MONTHS = {
    '一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6',
    '七': '7', '八': '8', '九': '9', '十': '10', '十一': '11', '十二': '12',
}


@dataclass
class RecItem:
    """One recognised field: its text and a confidence value."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the field as a dict with stripped text and NaN-free confidence."""
        return {"text": self.text.strip(),
                "confidence": np.nan_to_num(self.confidence)}


class Parser(object):
    """Common preprocessing and helpers shared by the certificate parsers.

    After construction every row in ``self.result`` has one extra trailing
    element ``[joined_text, mean_confidence]`` summarising the row; the field
    methods of the subclasses read that summary through ``row[-1]``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Clean the OCR cells in place and append a summary to each row.

        Note that ``ocr_results`` is mutated: each cell's ``txt`` is replaced
        by its filtered version and a summary list is appended to every row.
        """
        self.result = ocr_results
        self.res = defaultdict(RecItem)
        self.keys = ["name", "id", "language", "level", "exam_time", "score"]
        for key in self.keys:
            self.res[key] = RecItem()

        for row in self.result:
            pieces = []
            conf_sum = 0.
            for cell in row:
                cell.txt = ''.join(_KEEP_CHARS.findall(cell.txt))
                pieces.append(cell.txt)
                conf_sum += cell.conf
            # An empty row has no confidence to average; report 0 instead of
            # dividing by zero.
            mean_conf = conf_sum / len(row) if row else 0.
            row.append([''.join(pieces), mean_conf])

        # Rows after the first one mentioning "口试" are discarded.
        for idx, row in enumerate(self.result):
            if "口试" in row[-1][0]:
                self.result = self.result[:idx + 1]
                break

    def parse(self):
        """Return the raw field mapping (all defaults unless a subclass ran)."""
        return self.res

    def to_data(self, txt):
        """Format the digits found in ``txt`` as ``"<year>年<month>月"``.

        The year is the last four characters of the first digit run; the month
        is the second digit run, defaulting to ``'6'`` when only one run is
        present.  Returns ``''`` when ``txt`` contains no digits at all.
        """
        date_in = re.findall(r"\d+", txt)
        if not date_in:
            return ''
        if len(date_in) == 1:
            date_in.append('6')
        return f'{date_in[0][-4:]}年{date_in[1]}月'

    def _store_name(self, name_val, conf):
        """Store ``name_val`` as the ``name`` field.

        Names shorter than five characters are stored as-is.  Longer names are
        stored only if they contain one of the known separator characters, in
        which case the first two parts are joined with U+00B7; otherwise the
        field keeps its previous value.
        """
        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return
        # NOTE(review): __init__ strips every character outside _KEEP_CHARS,
        # which includes all of these separators, so for text taken from the
        # cleaned cells this loop cannot match and long names stay empty.
        # Confirm whether that is intended.
        for sep in _NAME_SEPARATORS:
            if sep in name_val:
                parts = name_val.split(sep)
                self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)


class CETParser(Parser):
    """Parser for CET-4 / CET-6 certificates."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def name(self):
        """Name: the text after "名" in the first row having "名" in its first three characters."""
        name_val = ''
        conf = 0.
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if "名" in txt[:3]:
                name_val = txt.split("名")[-1]
                break
        self._store_name(name_val, conf)

    def id(self):
        """ID-card number: the first 17-19 digit run found in any row.

        A 19-character match whose first two characters are equal is treated
        as having a duplicated leading character, which is dropped.  A
        17-character match gets ``X`` appended.
        """
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            found = _ID_PATTERN.findall(txt)
            if not found:
                continue
            candidate = found[0]
            if len(candidate) == 19 and candidate[0] == candidate[1]:
                self.res['id'] = RecItem(candidate[1:], conf)
                break
            if len(candidate) == 18:
                normalised = candidate.replace('x', "X").replace('×', "X")
                self.res['id'] = RecItem(normalised, conf)
                break
            if len(candidate) == 17:
                self.res['id'] = RecItem(f'{candidate}X', conf)
                break

    def language(self):
        """Language: always "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Level: CET4 or CET6 depending on which of "四级"/"六级" appears first.

        Raises:
            Exception: if no row mentions either level.
        """
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if "四级" in txt:
                self.res['level'] = RecItem("CET4", conf)
                return
            if "六级" in txt:
                self.res['level'] = RecItem("CET6", conf)
                return
        raise Exception("四六级无法识别")

    def exam_time(self):
        """Exam time: the date following "时间" in the first row containing it."""
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if "时间" in txt:
                tail = txt.split("时间")[-1]
                self.res["exam_time"] = RecItem(self.to_data(tail), conf)
                return

    def score(self):
        """Total score.

        First looks for a "总分" row (exactly five characters long, or also
        containing "具备" or "资格") and takes its first digit run; a 4-digit
        run whose first two digits are equal loses its first digit.  If no
        such row yields digits, falls back to the first row containing
        "时间".
        """
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if '总分' in txt and (len(txt) == 5 or '具备' in txt or '资格' in txt):
                digits = re.findall(r'\d+', txt)
                if not digits:
                    # Label matched but no number was recognised on this row.
                    continue
                first = digits[0]
                if len(first) == 4 and first[0] == first[1]:
                    first = first[1:]
                self.res["score"] = RecItem(first, conf)
                return

        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if "时间" in txt:
                if '月' in txt:
                    self.res["score"] = RecItem(txt.split("月")[-1][:3], conf)
                elif len(row) > 2:
                    # row[1] is a real OCR cell only when the row has at least
                    # two cells before the appended summary.
                    self.res["score"] = RecItem(row[1].txt, conf)
                return

    def parse(self):
        """Run every field extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}


class TEMParser(Parser):
    """Parser for TEM-4 / TEM-8 certificates."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def _name_before_marker(self):
        """Return ``(text, conf)`` of the cell preceding a "同学" cell, or None.

        The preceding cell is the previous cell in the same row, or the last
        cell of the previous row when "同学" is the first cell.  When several
        rows match, the last one wins.
        """
        found = None
        for row_idx, row in enumerate(self.result):
            for idx, cell in enumerate(row[:-1]):
                if '同学' not in cell.txt:
                    continue
                source = None
                if idx > 0:
                    source = row[idx - 1]
                elif row_idx > 0 and len(self.result[row_idx - 1]) >= 2:
                    # [-1] is the appended summary, so [-2] is the last cell.
                    source = self.result[row_idx - 1][-2]
                if source is not None:
                    found = (source.txt, source.conf)
                break
        return found

    def name(self):
        """Name: the cell before "同学", else the text between "学生" and "于"."""
        name_val = ''
        conf = 0.
        marker_hit = self._name_before_marker()
        if marker_hit is not None:
            name_val, conf = marker_hit
        else:
            for row in self.result:
                txt, conf = row[-1][0], row[-1][1]
                if '于' in txt:
                    head = ''.join(txt.split('于')[:-1])
                    name_val = head.split('学生')[-1]
                    break
        self._store_name(name_val, conf)

    def id(self):
        """ID: not extracted for this certificate type; stored as an empty string."""
        self.res['id'] = RecItem("", 1.)

    def language(self):
        """Language: always "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Level: TEM4 or TEM8 based on keywords in the row text.

        Raises:
            Exception: if no row contains any of the level keywords.
        """
        # NOTE(review): Latin letters are stripped in Parser.__init__, so the
        # "TEM4"/"TEM8" tests cannot match cleaned text; the Chinese keywords
        # do the work.
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if "TEM4" in txt or "基础" in txt or '四级' in txt:
                self.res['level'] = RecItem("TEM4", conf)
                return
            if "TEM8" in txt or "高年级" in txt or '八级' in txt:
                self.res['level'] = RecItem("TEM8", conf)
                return
        raise Exception("专四专八无法识别")

    def exam_time(self):
        """Exam time, taken from the first row matching one of three layouts.

        A row containing "级学生" is parsed as a Chinese-numeral date between
        "于" and "参"; a row containing "于" or "教育部全国" is parsed as a
        digit date.
        """
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if '级学生' in txt:
                span = txt.split('于')[-1].split('参')[0]
                self.res["exam_time"] = RecItem(self.han_to_date(span), conf)
                return
            if "于" in txt:
                span = txt.split("于")[-1]
                self.res["exam_time"] = RecItem(self.to_data(span), conf)
                return
            if '教育部全国' in txt:
                span = txt.split("教育部全国")[0]
                self.res["exam_time"] = RecItem(self.to_data(span), conf)
                return

    def han_to_date(self, date):
        """Convert a Chinese-numeral date such as "二0一九年六月" to "2019年6月".

        Only the part after the last "于" and before the first "月" is used.
        A three-character year gets a "0" inserted after its first character.
        Characters without a mapping (for example ASCII digits) are kept
        unchanged, and a missing month defaults to "6".
        """
        date = date.split('于')[-1].split('月')[0]
        data_y = date.split('年')[0]
        if len(data_y) == 3:
            data_y = data_y[0] + '0' + data_y[1:]
        date_m = date.split('年')[-1] or '6'
        data_y = ''.join(_HAN_YEAR_DIGITS.get(ch, ch) for ch in data_y)
        date_m = _HAN_MONTHS.get(date_m, date_m)
        return f"{data_y}年{date_m}月"

    def score(self):
        """Grade: 合格 / 良好 / 优秀, from the two characters after "成绩"."""
        for row in self.result:
            txt, conf = row[-1][0], row[-1][1]
            if "成绩" in txt:
                txt = txt.split("成绩")[-1][:2]
            if '合' in txt or '格' in txt:
                self.res["score"] = RecItem('合格', conf)
                return
            if '良' in txt or '好' in txt:
                self.res["score"] = RecItem('良好', conf)
                return
            if '优' in txt or '秀' in txt:
                self.res["score"] = RecItem('优秀', conf)
                return

    def parse(self):
        """Run every field extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}