# Source repository: chenguilong/hr-ocr-cet
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult


@dataclass
class RecItem:
    """One extracted field: the recognised text and its confidence."""

    # Extracted value; surrounding whitespace is removed by ``to_dict``.
    text: str = ''
    # Confidence stored alongside the value by the parser that filled it in.
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a ``{"text": ..., "confidence": ...}`` dict.

        The text is stripped. NaN confidences are mapped to 0.0 by
        ``np.nan_to_num`` and the result is converted to a built-in ``float``
        so that numpy scalars (e.g. ``np.float32``, which ``json`` cannot
        serialize) never leak into the returned dict.
        """
        return {
            "text": self.text.strip(),
            "confidence": float(np.nan_to_num(self.confidence)),
        }


class Parser(object):
    """Base parser: normalises OCR rows and holds the extracted fields.

    Construction rewrites ``ocr_results`` in place:

    * every fragment's ``txt`` is reduced to the characters accepted by the
      filter below (CJK U+4E00-U+9FA5, ASCII digits and ``+``);
    * a summary ``[joined_text, mean_confidence]`` is appended to every row,
      so ``row[-1][0]`` is the row's text and ``row[-1][1]`` its average
      fragment confidence;
    * ``self.result`` is cut after the first row whose text contains "口试".

    ``self.res`` maps each name in ``self.keys`` to a ``RecItem``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Normalise ``ocr_results`` and initialise every field to empty.

        Args:
            ocr_results: rows of OCR fragments; each fragment must expose
                ``txt`` and ``conf`` attributes. The rows are mutated.
        """
        self.result = ocr_results
        self.keys = ["name", "id", "language", "level", "exam_time", "score"]
        self.res = defaultdict(RecItem)
        for key in self.keys:
            self.res[key] = RecItem()

        # NOTE(review): the '+' inside the character class is a literal plus
        # sign, so '+' characters survive the filter -- confirm that is wanted.
        keep = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')
        for row in self.result:
            pieces = []
            conf_sum = 0.
            for fragment in row:
                fragment.txt = ''.join(keep.findall(fragment.txt))
                pieces.append(fragment.txt)
                conf_sum += fragment.conf
            # An empty row gets an empty summary instead of dividing by zero.
            mean_conf = conf_sum / len(row) if row else 0.
            row.append([''.join(pieces), mean_conf])

        # Everything after the first "口试" row is dropped.
        for idx, row in enumerate(self.result):
            if "口试" in row[-1][0]:
                self.result = self.result[:idx + 1]
                break

    def parse(self):
        """Return the field mapping; the base class extracts nothing."""
        return self.res


class CETParser(Parser):
    """Field extractor for CET-4 / CET-6 score reports.

    Every method reads the per-row summary appended by ``Parser.__init__``:
    ``row[-1][0]`` is the row text and ``row[-1][1]`` the row confidence.
    """

    # Dot-like characters accepted as the separator inside a long name. The
    # last entry is U+10101; it needs the 8-digit ``\U`` escape because
    # "\u10101" would be the two characters U+1010 and "1".
    _NAME_DOTS = (
        "\u2E31\u2218\u2219\u22C5\u25E6\u2981"
        "\u00B7\u0387\u05BC\u16EB\u2022\u2027"
        "\u2E30\uFF0E\u30FB\uFF65\U00010101"
    )
    # Maps every accepted separator to the middle dot U+00B7.
    _NAME_DOT_TABLE = str.maketrans(dict.fromkeys(_NAME_DOTS, "\u00B7"))
    # 17-19 digits with an optional check-character suffix (X, x or ×).
    _ID_PATTERN = re.compile(r"\d{17,19}[Xx×]*")

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def name(self):
        """Extract the candidate's name.

        The name row is the first row with "名" among its first three
        characters; the name is everything after that first "名", so a name
        that itself contains "名" is kept whole. Names shorter than five
        characters are stored as they are. Longer values are stored only when
        they contain one of ``_NAME_DOTS``, normalised to U+00B7; otherwise
        the field is left untouched. When no name row exists the field is
        stored empty with confidence 0.
        """
        name_val = ''
        conf = 0.
        for row in self.result:
            txt = row[-1][0]
            if "名" in txt[:3]:
                name_val = txt.split("名", 1)[1]
                conf = row[-1][1]
                break

        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return

        normalised = name_val.translate(self._NAME_DOT_TABLE)
        if "\u00B7" in normalised:
            self.res['name'] = RecItem(normalised, conf)

    def id(self):
        """Extract the ID-card number.

        Uses the first row containing a usable digit run:

        * 19 characters whose first two are equal: the first is dropped
          (presumably a duplicated leading digit -- confirm);
        * 18 characters: used as is;
        * 17 characters: "X" is appended as the check character.

        A lowercase "x" or "×" suffix is normalised to "X". Rows whose match
        fits none of these cases are skipped.
        """
        for row in self.result:
            found = self._ID_PATTERN.search(row[-1][0])
            if found is None:
                continue
            number = found.group().replace('x', "X").replace('×', "X")
            if len(number) == 19 and number[0] == number[1]:
                number = number[1:]
            elif len(number) == 17:
                number = f'{number}X'
            elif len(number) != 18:
                continue
            self.res['id'] = RecItem(number, row[-1][1])
            return

    def language(self):
        """Set the language field to the constant "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Extract the exam level from the first row naming 四级 or 六级.

        Raises:
            Exception: when no row mentions either level.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "四级" in txt:
                self.res['level'] = RecItem("CET4", conf)
                return
            if "六级" in txt:
                self.res['level'] = RecItem("CET6", conf)
                return
        raise Exception("四六级无法识别")

    def exam_time(self):
        """Extract the exam date from the first "时间" row that holds digits.

        Rows containing "时间" but no digits after it are skipped rather than
        aborting the whole parse.
        """
        for row in self.result:
            txt = row[-1][0]
            if "时间" not in txt:
                continue
            date = self.to_data(txt.split("时间")[-1])
            if date:
                self.res["exam_time"] = RecItem(date, row[-1][1])
                return

    def score(self):
        """Extract the total score.

        First pass: a row containing "总分" that is either exactly five
        characters long or also mentions "具备" / "资格". Its first digit run
        is the score; a four-digit run whose first two digits are equal loses
        the first digit. Rows without digits are skipped.

        Fallback: the first row containing "时间". If it also contains "月",
        the score is the three characters after the last "月"; otherwise it
        is the text of the row's second OCR fragment, when there is one.
        """
        for row in self.result:
            txt = row[-1][0]
            if '总分' not in txt:
                continue
            if not (len(txt) == 5 or '具备' in txt or '资格' in txt):
                continue
            digits = re.findall(r'\d+', txt)
            if not digits:
                continue
            value = digits[0]
            if len(value) == 4 and value[0] == value[1]:
                value = value[1:]
            self.res["score"] = RecItem(value, row[-1][1])
            return

        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "时间" not in txt:
                continue
            if '月' in txt:
                self.res["score"] = RecItem(txt.split("月")[-1][:3], conf)
            elif len(row) > 2:
                # row is [fragment, fragment, ..., summary]; with a single
                # fragment row[1] would be the summary list, not a fragment.
                self.res["score"] = RecItem(row[1].txt, conf)
            return

    def to_data(self, txt):
        """Format the digits in ``txt`` as "<year>年<month>月".

        The year is the last four digits of the first digit run and the
        month is the second run, defaulting to "6" when there is only one.
        Returns an empty string when ``txt`` contains no digits.
        """
        runs = re.findall(r"\d+", txt)
        if not runs:
            return ''
        month = runs[1] if len(runs) > 1 else '6'
        return f'{runs[0][-4:]}年{month}月'

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}


class TEMParser(Parser):
    """Field extractor for TEM-4 / TEM-8 certificates.

    Every method reads the per-row summary appended by ``Parser.__init__``:
    ``row[-1][0]`` is the row text and ``row[-1][1]`` the row confidence.
    """

    # Dot-like characters accepted as the separator inside a long name. The
    # last entry is U+10101; it needs the 8-digit ``\U`` escape because
    # "\u10101" would be the two characters U+1010 and "1".
    _NAME_DOTS = (
        "\u2E31\u2218\u2219\u22C5\u25E6\u2981"
        "\u00B7\u0387\u05BC\u16EB\u2022\u2027"
        "\u2E30\uFF0E\u30FB\uFF65\U00010101"
    )
    # Maps every accepted separator to the middle dot U+00B7.
    _NAME_DOT_TABLE = str.maketrans(dict.fromkeys(_NAME_DOTS, "\u00B7"))

    # Single-character numerals accepted in a Chinese-numeral year.
    _HAN_DIGITS = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5',
                   '六': '6', '七': '7', '八': '8', '九': '9',
                   '0': '0', 'O': '0', 'o': '0'}
    # Month names: the single digits plus 十 / 十一 / 十二.
    _HAN_MONTHS = dict(_HAN_DIGITS, **{'十': '10', '十一': '11', '十二': '12'})

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def _name_before_classmate(self):
        """Return ``(text, conf)`` of the fragment preceding the first "同学".

        The preceding fragment is the previous one in the same row, or the
        last fragment of the previous row when "同学" opens its row. Returns
        ``None`` when no "同学" fragment has a predecessor.
        """
        for row_idx, row in enumerate(self.result):
            for idx, fragment in enumerate(row[:-1]):
                if '同学' not in fragment.txt:
                    continue
                if idx > 0:
                    holder = row[idx - 1]
                elif row_idx > 0 and len(self.result[row_idx - 1]) > 1:
                    # [-2] is the last fragment; [-1] is the row summary.
                    holder = self.result[row_idx - 1][-2]
                else:
                    # Nothing precedes it; never wrap around to the last row.
                    continue
                return holder.txt, holder.conf
        return None

    def name(self):
        """Extract the student's name.

        Preferred source is the fragment right before the first "同学"
        fragment, with that fragment's own confidence. Failing that, the first
        row containing "于" is used: the name is what follows the last "学生"
        in the text before the last "于" (earlier "于" characters, e.g. the
        surname 于, are kept).

        Names shorter than five characters are stored as they are. Longer
        values are stored only when they contain one of ``_NAME_DOTS``,
        normalised to U+00B7; otherwise the field is left untouched.
        """
        name_val = ''
        conf = 0.
        found = self._name_before_classmate()
        if found is not None:
            name_val, conf = found
        else:
            for row in self.result:
                txt = row[-1][0]
                if '于' in txt:
                    before = txt.rpartition('于')[0]
                    name_val = before.split('学生')[-1]
                    conf = row[-1][1]
                    break

        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return

        normalised = name_val.translate(self._NAME_DOT_TABLE)
        if "\u00B7" in normalised:
            self.res['name'] = RecItem(normalised, conf)

    def id(self):
        """Set the ID field to an empty value with confidence 1."""
        self.res['id'] = RecItem("", 1.)

    def language(self):
        """Set the language field to the constant "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Extract the exam level from the first row naming TEM-4 or TEM-8.

        Raises:
            Exception: when no row identifies the level.
        """
        # NOTE(review): Parser.__init__ filters row text down to CJK, digits
        # and '+', so the "TEM4"/"TEM8" literals presumably never match --
        # confirm before relying on them.
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if "TEM4" in txt or "基础" in txt or '四级' in txt:
                self.res['level'] = RecItem("TEM4", conf)
                return
            if "TEM8" in txt or "高年级" in txt or '八级' in txt:
                self.res['level'] = RecItem("TEM8", conf)
                return
        raise Exception("专四专八无法识别")

    def exam_time(self):
        """Extract the exam date.

        A row containing "级学生" is treated as the older layout: the text
        between the last "于" and the following "参" is converted with
        ``han_to_date``. Otherwise the digits after the last "于", or before
        "教育部全国", are formatted with ``to_data``. Candidates without any
        digits are skipped rather than aborting the whole parse.
        """
        for row in self.result:
            txt = row[-1][0]
            conf = row[-1][1]
            if '级学生' in txt:
                old_style = txt.split('于')[-1].split('参')[0]
                self.res["exam_time"] = RecItem(self.han_to_date(old_style), conf)
                return

            candidates = []
            if "于" in txt:
                candidates.append(txt.split("于")[-1])
            if '教育部全国' in txt:
                candidates.append(txt.split("教育部全国")[0])
            for candidate in candidates:
                date = self.to_data(candidate)
                if date:
                    self.res["exam_time"] = RecItem(date, conf)
                    return

    def to_data(self, txt):
        """Format the digits in ``txt`` as "<year>年<month>月".

        The year is the last four digits of the first digit run and the
        month is the second run, defaulting to "6" when there is only one.
        Returns an empty string when ``txt`` contains no digits.
        """
        runs = re.findall(r"\d+", txt)
        if not runs:
            return ''
        month = runs[1] if len(runs) > 1 else '6'
        return f'{runs[0][-4:]}年{month}月'

    def han_to_date(self, date):
        """Convert a Chinese-numeral date to "<year>年<month>月".

        Only the text after the last "于" and before the first "月" is used.
        A three-character year gets "0" inserted after its first character
        (presumably restoring a "〇" dropped by the character filter --
        confirm). Characters without a numeral mapping are kept unchanged, a
        missing month defaults to "6", and 十 maps to "10".
        """
        body = date.split('于')[-1].split('月')[0]
        parts = body.split('年')
        year = parts[0]
        month = parts[-1] if len(parts) > 1 else ''

        if len(year) == 3:
            year = f'{year[0]}0{year[1:]}'
        year = ''.join(self._HAN_DIGITS.get(ch, ch) for ch in year)

        month = self._HAN_MONTHS.get(month, month) if month else '6'
        return f"{year}年{month}月"

    def score(self):
        """Extract the grade from the first decisive "成绩" row.

        Only the two characters after the last "成绩" are inspected; either
        character of a grade word is enough to select it. Grades are tried in
        the order 合格, 良好, 优秀.
        """
        for row in self.result:
            txt = row[-1][0]
            if "成绩" not in txt:
                continue
            grade = txt.split("成绩")[-1][:2]
            for label in ('合格', '良好', '优秀'):
                if label[0] in grade or label[1] in grade:
                    self.res["score"] = RecItem(label, row[-1][1])
                    return

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}