import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult

# Characters kept from every OCR cell: CJK ideographs (U+4E00-U+9FA5), a
# literal '+', and ASCII digits.  Everything else is discarded.
# NOTE(review): this also discards Latin letters and punctuation, so the
# "X" check digit handled in CETParser.id, the "TEM4"/"TEM8" literals tested
# in TEMParser.level and the separator dots handled in Parser._store_name
# cannot appear in the per-row summary text -- confirm whether the filter
# should be widened.
_KEEP_CHARS = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')

# Dot-like characters that OCR may emit for the middle dot used in
# transliterated names.  The last entry is U+10101; it needs the 8-digit
# \U escape because "\u10101" would parse as U+1010 followed by "1".
_NAME_SEPARATORS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
    "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
    "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
)

# 10-18 digits optionally followed by X / x / the multiplication sign.
_ID_PATTERN = re.compile(r"\d{10,18}[Xx×]*")


@dataclass
class RecItem:
    """One recognised field: its text and the confidence attached to it."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the field as a dict with stripped text and a sanitised confidence.

        The confidence is passed through ``np.nan_to_num``.
        """
        return {"text": self.text.strip(), "confidence": np.nan_to_num(self.confidence)}


class Parser(object):
    """Base class that pre-processes OCR rows for the certificate parsers.

    After construction ``self.result`` is a list of rows; each row holds the
    original cells followed by one summary entry ``[joined_text, mean_conf]``,
    which the field extractors read through ``row[-1]``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Filter every cell's text and build the per-row summaries.

        Args:
            ocr_results: rows of OCR cells; each cell exposes ``txt`` and
                ``conf``.  The ``txt`` of every cell is rewritten in place to
                its filtered form.  The row lists themselves are copied, so
                the same ``ocr_results`` can be handed to several parsers.
        """
        self.keys = ["name", "id", "language", "level", "exam_time", "score"]
        self.res = defaultdict(RecItem)
        for key in self.keys:
            self.res[key] = RecItem()

        rows = []
        for item in ocr_results:
            pieces = []
            conf_sum = 0.
            for cell in item:
                cell.txt = ''.join(_KEEP_CHARS.findall(cell.txt))
                pieces.append(cell.txt)
                conf_sum += cell.conf
            # An empty row gets confidence 0 instead of dividing by zero.
            mean_conf = conf_sum / len(item) if item else 0.
            rows.append(list(item) + [[''.join(pieces), mean_conf]])

        # Drop the first row mentioning "口试" and every row after it.
        for i, row in enumerate(rows):
            if "口试" in row[-1][0]:
                rows = rows[:i]
                break
        self.result = rows

    def parse(self):
        """Return the raw result mapping (field name -> RecItem)."""
        return self.res

    def _store_name(self, name_val, conf):
        """Store ``name_val`` as the "name" field, normalising its separator.

        Names shorter than five characters are stored unchanged.  Longer
        names are stored only when they contain one of ``_NAME_SEPARATORS``;
        the first two parts are then joined with U+00B7.  When several
        different separators are present, the one listed last wins.
        """
        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return
        # NOTE(review): a name of five or more characters without any
        # separator leaves the "name" field at its empty default -- confirm
        # this is the intended rejection rule.
        for sep in _NAME_SEPARATORS:
            if sep in name_val:
                parts = name_val.split(sep)
                self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)

    def to_data(self, txt):
        """Format the first two digit runs in ``txt`` as "<year>年<month>月".

        Only the last four digits of the first run are used for the year.

        Raises:
            IndexError: if ``txt`` contains fewer than two digit runs.
        """
        date_in = re.findall(r"\d+", txt)
        return f'{date_in[0][-4:]}年{date_in[1]}月'


class CETParser(Parser):
    """Field extractor for CET-4 / CET-6 score reports."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        Parser.__init__(self, ocr_results)

    def name(self):
        """Extract the candidate name.

        Uses the first row whose summary text has "名" among its first three
        characters and takes everything after the last "名".  When no row
        matches, an empty name with confidence 0 is stored.
        """
        name_val = ''
        conf = 0.
        for row in self.result:
            txt, row_conf = row[-1]
            if "名" in txt[:3]:
                name_val = txt.split("名")[-1]
                conf = row_conf
                break
        self._store_name(name_val, conf)

    def id(self):
        """Extract the ID number.

        Uses the first row whose first ID-like match is exactly 18
        characters long; a trailing x / multiplication sign becomes "X".
        """
        for row in self.result:
            txt, conf = row[-1]
            found = _ID_PATTERN.findall(txt)
            if found and len(found[0]) == 18:
                self.res['id'] = RecItem(found[0].replace('x', "X").replace('×', "X"), conf)
                break

    def language(self):
        """Set the language field to the fixed value "英语"."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Extract the exam level (CET4 or CET6).

        Raises:
            Exception: if no row mentions "四级" or "六级".
        """
        for row in self.result:
            txt, conf = row[-1]
            if "四级" in txt:
                self.res['level'] = RecItem("CET4", conf)
                return
            if "六级" in txt:
                self.res['level'] = RecItem("CET6", conf)
                return
        raise Exception("四六级无法识别")

    def exam_time(self):
        """Extract the exam date from the first row containing "时间"."""
        for row in self.result:
            txt, conf = row[-1]
            if "时间" in txt:
                tail = txt.split("时间")[-1]
                self.res["exam_time"] = RecItem(self.to_data(tail), conf)
                return

    def score(self):
        """Extract the total score.

        Takes up to three characters after the last "月" in the first row
        containing "时间".
        """
        for row in self.result:
            txt, conf = row[-1]
            if "时间" in txt:
                self.res["score"] = RecItem(txt.split("月")[-1][:3], conf)
                return

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}


class TEMParser(Parser):
    """Field extractor for TEM-4 / TEM-8 certificates."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        Parser.__init__(self, ocr_results)

    def name(self):
        """Extract the candidate name from the cell preceding "同学".

        In each row the first cell containing "同学" is located and the name
        is read from the cell before it.  If that cell is the first in its
        row, the text before "同学" in the same cell is used instead.  When
        several rows match, the last one wins.
        """
        name_val = ''
        conf = 0.
        for row in self.result:
            cells = row[:-1]  # skip the [text, conf] summary entry
            for idx, cell in enumerate(cells):
                if '同学' not in cell.txt:
                    continue
                if idx > 0:
                    name_val = cells[idx - 1].txt
                    conf = cells[idx - 1].conf
                else:
                    # No preceding cell: idx - 1 would wrap around to the
                    # end of the row rather than to a real predecessor.
                    name_val = cell.txt.split('同学')[0]
                    conf = cell.conf
                break
        self._store_name(name_val, conf)

    def id(self):
        """Set the ID field to an empty string with confidence 1."""
        self.res['id'] = RecItem("", 1.)

    def language(self):
        """Set the language field to the fixed value "英语"."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Extract the exam level (TEM4 or TEM8).

        Raises:
            Exception: if no row identifies either level.
        """
        for row in self.result:
            txt, conf = row[-1]
            if "TEM4" in txt or "基础" in txt:
                self.res['level'] = RecItem("TEM4", conf)
                return
            if "TEM8" in txt or "高年级" in txt:
                self.res['level'] = RecItem("TEM8", conf)
                return
        raise Exception("专四专八无法识别")

    def exam_time(self):
        """Extract the exam date.

        Uses the first row containing either "于" (date taken from the text
        after it) or "教育部全国" (date taken from the text before it).
        """
        for row in self.result:
            txt, conf = row[-1]
            if "于" in txt:
                self.res["exam_time"] = RecItem(self.to_data(txt.split("于")[-1]), conf)
                return
            if '教育部全国' in txt:
                self.res["exam_time"] = RecItem(self.to_data(txt.split("教育部全国")[0]), conf)
                return

    def score(self):
        """Extract the grade (合格 / 良好 / 优秀).

        Looks at the two characters following the last "成绩" in each row
        and maps either character of a grade to the full grade word.  Rows
        where neither character is recognised are skipped.
        """
        for row in self.result:
            txt, conf = row[-1]
            if "成绩" not in txt:
                continue
            grade = txt.split("成绩")[-1][:2]
            if '合' in grade or '格' in grade:
                self.res["score"] = RecItem('合格', conf)
                return
            if '良' in grade or '好' in grade:
                self.res["score"] = RecItem('良好', conf)
                return
            if '优' in grade or '秀' in grade:
                self.res["score"] = RecItem('优秀', conf)
                return

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}