123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331 |
- import math
- import re
- import string
- from dataclasses import dataclass
- from collections import defaultdict
- import numpy as np
- from typing import List
- from core.line_parser import OcrResult
@dataclass
class RecItem:
    """One recognised field: the extracted text and its OCR confidence."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a plain ``dict`` with keys ``text`` and ``confidence``.

        The confidence is passed through ``np.nan_to_num`` (NaN becomes 0.0,
        infinities become large finite values) and then converted to a
        built-in ``float`` so that NumPy scalars such as ``np.float32`` do
        not leak into the result, where ``json.dumps`` would reject them.
        """
        return {"text": self.text, "confidence": float(np.nan_to_num(self.confidence))}
# Parent class
class Parser(object):
    """Common preprocessing for OCR results grouped into rows of cells.

    After construction:

    * ``self.keys`` lists the field names that are reported.
    * ``self.res`` maps every key to an empty ``RecItem``.
    * ``self.result`` holds one list per input row: the row's OCR cells
      (with ``txt`` cleaned) followed by a ``[row_text, row_conf]`` summary,
      where ``row_text`` is the concatenation of the cell texts and
      ``row_conf`` the mean of the cell confidences.
    """

    # Punctuation removed from every cell before matching.  The full-width
    # colon and comma are written as escapes: the original chain of
    # ``replace`` calls listed ':' and ',' twice, which is a no-op unless the
    # second entries were the full-width forms.
    _STRIP_TABLE = str.maketrans('', '', "|:\uFF1A,\uFF0C【】「[] ")

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Clean the cell texts and attach a per-row summary.

        Args:
            ocr_results: rows of OCR cells; each cell must expose ``txt``
                and ``conf``.  ``txt`` is cleaned in place on the cell
                objects, but the row lists themselves are copied, so the
                same input can be handed to several parsers.
        """
        self.res = defaultdict(RecItem)
        self.keys = ['name', 'gender', 'admission_time', 'education_time', 'education_level', 'education_type',
                     'learning_type', 'school', 'major', 'number']
        for key in self.keys:
            self.res[key] = RecItem()

        self.result = []
        for cells in ocr_results:
            # Copy the row so the summary appended below never ends up in the
            # caller's list (a second parser would otherwise treat it as a cell).
            row = list(cells)
            for cell in row:
                cell.txt = cell.txt.translate(self._STRIP_TABLE)
            row_txt = ''.join(cell.txt for cell in row)
            # True mean over the row; the earlier pairwise running mean was
            # seeded with 0 and so under-reported every row.
            row_conf = float(np.mean([cell.conf for cell in row])) if row else 0.
            row.append([row_txt, row_conf])
            self.result.append(row)

    def parse(self):
        """Return the key -> ``RecItem`` mapping (all empty in the base class)."""
        return self.res
- # All
class AllParser(Parser):
    """Parser that only performs the base-class preprocessing.

    It defines no field extractors of its own, so ``parse`` is the one
    inherited from ``Parser``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Forward ``ocr_results`` unchanged to the ``Parser`` initialiser."""
        super().__init__(ocr_results)
- # all
class PostParser(Parser):
    """Field extractor for the Ministry of Education online student-record
    verification report (table layout).

    Relies on ``Parser.__init__`` having appended a ``[row_text, row_conf]``
    summary as the last element of every row of ``self.result``; every
    element before it is an OCR cell exposing ``txt`` and ``conf``.
    """

    # Characters accepted as the separator inside a long name, tried in this
    # order.  U+10101 needs the eight-digit ``\U`` escape: ``"\u10101"`` is
    # U+1010 followed by the digit '1'.  The last two entries are '-' and the
    # CJK character for "one".
    _NAME_SEPARATORS = (
        "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
        "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
        "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101", "\u002d",
        "\u4e00",
    )

    # (opening, closing) parenthesis pairs: ASCII and full-width.
    _PAREN_PAIRS = (('(', ')'), ('\uFF08', '\uFF09'))

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Run the shared ``Parser`` preprocessing on ``ocr_results``."""
        super().__init__(ocr_results)

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _next_cell(cells, j):
        """Return the OCR cell after index ``j``, or ``None`` if ``j`` is the
        last real cell (the final list element is the row summary, not a cell)."""
        if j + 1 < len(cells) - 1:
            return cells[j + 1]
        return None

    def _set_date(self, key, txt, conf):
        """Store the date found in ``txt`` under ``key``; return whether one was found."""
        try:
            value = self.to_data(txt)
        except IndexError:
            # Fewer than three numbers: not a usable date, let the caller move on.
            return False
        self.res[key] = RecItem(value, conf)
        return True

    def _label_value(self, key, short_markers, short_limit, split_markers):
        """Fill ``key`` from the first cell that matches a label.

        A cell shorter than ``short_limit`` containing one of
        ``short_markers`` is a bare label, so the value is the text of the
        next cell.  Otherwise the first of ``split_markers`` found in the
        cell is treated as a label prefix and the value is the text after its
        last occurrence.

        NOTE(review): the stored confidence is the label cell's even when the
        text comes from the next cell; kept as in the original.
        """
        for cells in self.result:
            for j in range(len(cells) - 1):
                txt = cells[j].txt
                conf = cells[j].conf
                if len(txt) < short_limit and any(m in txt for m in short_markers):
                    neighbour = self._next_cell(cells, j)
                    if neighbour is None:
                        # Label is the last cell of its row: nothing to read.
                        continue
                    self.res[key] = RecItem(neighbour.txt, conf)
                    return
                for marker in split_markers:
                    if marker in txt:
                        self.res[key] = RecItem(txt.split(marker)[-1], conf)
                        return

    # ------------------------------------------------------------------
    # field extractors
    # ------------------------------------------------------------------
    def full_name(self):
        """
        Name: the row text after "姓名", cut at "性别" / "证件".

        Values shorter than five characters are stored as they are.  Longer
        values are stored only if they contain one of ``_NAME_SEPARATORS``,
        in which case all parts are re-joined with U+00B7.
        """
        for cells in self.result:
            txt, conf = cells[-1]
            if "姓名" not in txt:
                continue
            name_val = txt.split("姓名")[-1].split("性别")[0].split("证件")[0]
            if len(name_val) < 5:
                self.res["name"] = RecItem(name_val, conf)
                return
            for sep in self._NAME_SEPARATORS:
                if sep in name_val:
                    # Keep every part, not only the first two.
                    self.res['name'] = RecItem('\u00B7'.join(name_val.split(sep)), conf)
                    return

    def gender(self):
        """
        Gender: the first row whose text contains "男" or "女" ("男" is tested first).
        """
        for cells in self.result:
            txt, conf = cells[-1]
            for mark in ("男", "女"):
                if mark in txt:
                    self.res["gender"] = RecItem(mark, conf)
                    return

    def admission_time(self):
        """
        Admission date; the label may read "...学日期" or "...学时间", or the
        row may merely contain "入学" (then the text after the last "期" is used).
        Rows without three numbers are skipped.
        """
        for cells in self.result:
            txt, conf = cells[-1]
            if "学日期" in txt:
                rest = txt.split("学日期")[-1]
            elif "学时间" in txt:
                rest = txt.split("学时间")[-1]
            elif "入学" in txt:
                rest = txt.split("期")[-1]
            else:
                continue
            if self._set_date("admission_time", rest, conf):
                return

    def education_time(self):
        """
        Graduation date ("...业日期") or leaving date ("...校日期").
        Rows without three numbers are skipped.
        """
        for cells in self.result:
            txt, conf = cells[-1]
            if "业日期" in txt:
                rest = txt.split("业日期")[-1]
            elif "校日期" in txt:
                rest = txt.split("校日期")[-1]
            else:
                continue
            if self._set_date("education_time", rest, conf):
                return

    def education_level(self):
        """
        Education level (label contains "层次"), e.g. 本科.
        """
        self._label_value("education_level", ("层次",), 4, ("层次",))

    def education_type(self):
        """
        Education category (label contains "类别" or "类型"), e.g. 普通高等教育.
        """
        self._label_value("education_type", ("类别", "类型"), 6, ("历类别", "类型"))

    def learning_type(self):
        """
        Mode of study (label contains "形式"), e.g. 普通全日制.
        """
        self._label_value("learning_type", ("形式",), 6, ("习形式", "形式"))

    def school(self):
        """
        School name; the label may read "...校名称", "...名称" or "院校".
        """
        for cells in self.result:
            for j in range(len(cells) - 1):
                txt = cells[j].txt
                conf = cells[j].conf
                is_short = len(txt) < 6
                neighbour = self._next_cell(cells, j)
                if is_short and ('校名称' in txt or '院校' in txt):
                    if neighbour is None:
                        continue
                    self.res["school"] = RecItem(neighbour.txt, conf)
                    return
                if is_short and "名称" in txt:
                    # A bare "名称" label only counts when the next cell looks
                    # like a school name, i.e. contains "学".
                    if neighbour is not None and "学" in neighbour.txt:
                        self.res["school"] = RecItem(neighbour.txt, neighbour.conf)
                        return
                elif "学校名" in txt:
                    self.res["school"] = RecItem(txt.split("名称")[-1], conf)
                    return
                elif "院校" in txt:
                    self.res["school"] = RecItem(txt.split("院校")[-1], conf)
                    return

    def major(self):
        """
        Major (label contains "专业").  The value is passed through
        ``broken`` so that a parenthesised suffix split off into another
        cell is re-attached.
        """
        for i, cells in enumerate(self.result):
            for j in range(len(cells) - 1):
                txt = cells[j].txt
                conf = cells[j].conf
                if "专业" not in txt:
                    continue
                if len(txt) < 4:
                    # Bare label: the value is the next cell.
                    neighbour = self._next_cell(cells, j)
                    if neighbour is None:
                        continue
                    value = neighbour.txt
                else:
                    value = txt.split("专业")[-1]
                self.res["major"] = RecItem(self.broken(value, i, j), conf)
                return

    def broken(self, txt, row, r):
        """Re-attach the closing half of a parenthesis split across cells.

        If ``txt`` contains an opening parenthesis (ASCII or full-width)
        without the matching closing one, the cells from row ``row`` onwards
        are scanned, starting at column ``r`` in every row, and the first
        cell containing the missing closing parenthesis is appended to
        ``txt``.  ``txt`` is returned unchanged when nothing is missing or no
        such cell exists.

        NOTE(review): later rows are also scanned from column ``r``, as in
        the original; confirm a continuation can never sit in an earlier column.
        """
        missing = [closer for opener, closer in self._PAREN_PAIRS
                   if opener in txt and closer not in txt]
        if not missing:
            return txt
        for i in range(row, len(self.result)):
            cells = self.result[i]
            for j in range(r, len(cells) - 1):
                other_txt = cells[j].txt
                if any(closer in other_txt for closer in missing):
                    return txt + other_txt
        # No continuation found: keep what we have rather than returning None.
        return txt

    def to_data(self, txt):
        """Format the first three numbers in ``txt`` as ``YYYY年M月D日``.

        The year keeps the last four digits of the first number and the day
        the first two digits of the third.

        Raises:
            IndexError: if ``txt`` contains fewer than three numbers.
        """
        date_in = re.findall(r"\d+", txt)
        return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2][:2]}日'

    def number(self):
        """
        Certificate number: the first cell whose first 16-18 digit run is
        exactly 18 digits long.  If a cell containing "预计", "(毕业" or its
        full-width form is reached first, the number is stored as empty and
        the search stops.
        """
        for cells in self.result:
            for j in range(len(cells) - 1):
                txt = cells[j].txt.replace(' ', '')
                if '预计' in txt or '(毕业' in txt or '\uFF08毕业' in txt:
                    self.res["number"] = RecItem('', 0.)
                    return
                found = re.findall(r"\d{16,18}", txt)
                if found and len(found[0]) == 18:
                    self.res["number"] = RecItem(found[0], cells[j].conf)
                    return

    def parse(self):
        """Run every field extractor and return ``{key: {"text", "confidence"}}``."""
        self.full_name()
        self.gender()
        self.admission_time()
        self.education_time()
        self.education_level()
        self.education_type()
        self.learning_type()
        self.school()
        self.number()
        self.major()
        return {key: self.res[key].to_dict() for key in self.keys}
|