chenguilong
/
hr-ocr-cet


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
							import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult


@dataclass
class RecItem:
    """A single recognised field: the extracted text and its confidence."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a plain ``dict`` with ``text`` and ``confidence`` keys.

        The text is stripped of surrounding whitespace. The confidence is
        passed through ``np.nan_to_num`` (so NaN becomes ``0.0``) and then
        converted to a builtin ``float``: numpy scalars such as ``float32``
        are not accepted by ``json.dumps``.
        """
        confidence = float(np.nan_to_num(self.confidence))
        return {"text": self.text.strip(), "confidence": confidence}


class Parser(object):
    """Base class that normalises OCR rows and holds the extracted fields.

    After construction:

    * ``self.keys`` lists the field names reported by ``parse``.
    * ``self.res`` maps every key to a ``RecItem`` (empty by default).
    * ``self.result`` is a list of rows. Each row is the original boxes
      followed by one summary cell ``[text, confidence]`` holding the
      concatenated filtered text of the row and the mean box confidence.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Normalise ``ocr_results`` and prepare empty result slots.

        Args:
            ocr_results: rows of OCR boxes; each box exposes ``txt`` and
                ``conf``. The ``txt`` of every box is filtered in place, but
                the row lists themselves are not modified.
        """
        self.keys = ["name", "id", "language", "level", "exam_time", "score"]
        self.res = defaultdict(RecItem)
        for key in self.keys:
            self.res[key] = RecItem()

        # Keep only CJK ideographs (U+4E00-U+9FA5), '+', and ASCII digits.
        keep = re.compile(u'[\u4e00-\u9fa5+\u0030-\u0039]')
        rows = []
        for item in ocr_results:
            texts = []
            conf_sum = 0.
            for box in item:
                box.txt = ''.join(keep.findall(box.txt))
                texts.append(box.txt)
                conf_sum += box.conf
            # An empty row has no boxes to average; report zero confidence
            # instead of dividing by zero.
            mean_conf = conf_sum / len(item) if item else 0.
            # Build a new list so the caller's rows are left untouched and the
            # same OCR output can be parsed more than once.
            rows.append(list(item) + [[''.join(texts), mean_conf]])

        # Everything from the first row mentioning "口试" onwards is dropped.
        for i, row in enumerate(rows):
            if "口试" in row[-1][0]:
                rows = rows[:i]
                break
        self.result = rows

    def parse(self):
        """Return the field mapping; subclasses fill it in before returning."""
        return self.res


class CETParser(Parser):
    """Field extractor for CET-4 / CET-6 documents.

    Every extractor scans the row summaries built by ``Parser.__init__``
    (``row[-1]`` is ``[text, confidence]``) and stores a ``RecItem`` in
    ``self.res``. A field that cannot be located keeps its empty default,
    except ``level``, which raises.
    """

    # Dot-like characters that are normalised to the middle dot U+00B7 in
    # long names. The last entry is U+10101, which needs the 8-digit \U escape.
    _NAME_SEPARATORS = (
        "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
        "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
        "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
    )
    _SEPARATOR_TABLE = str.maketrans(dict.fromkeys(_NAME_SEPARATORS, "\u00B7"))

    # A run of 10-18 digits optionally followed by X / x / × characters.
    _ID_PATTERN = re.compile(r"\d{10,18}[Xx×]*")

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Normalise ``ocr_results`` via ``Parser.__init__``."""
        Parser.__init__(self, ocr_results)

    def name(self):
        """Extract the candidate's name.

        The first row whose text has "名" among its first three characters is
        used; the name is whatever follows the last "名" in that row. Names of
        five or more characters have their dot-like separators normalised to
        U+00B7. If no such row exists the name is empty with confidence 0.
        """
        name_val = ''
        conf = 0.
        for res in self.result:
            txt = res[-1][0]
            if "名" in txt[:3]:
                name_val = txt.split("名")[-1]
                conf = res[-1][1]
                break

        if len(name_val) >= 5:
            # NOTE(review): Parser.__init__ filters out everything except Han
            # characters, digits and '+', so separators only survive to this
            # point if that filter is relaxed.
            name_val = name_val.translate(self._SEPARATOR_TABLE)
        # Always store the name: a long name without a separator is still a name.
        self.res["name"] = RecItem(name_val, conf)

    def id(self):
        """Extract the 18-character ID card number.

        Every digit run in every row is considered, so an ID that follows a
        shorter number on the same row is still found. A trailing ``x`` or
        ``×`` is normalised to ``X``.
        """
        # NOTE(review): Parser.__init__ strips Latin letters and '×', so an ID
        # ending in X arrives here as 17 digits and is not accepted.
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]
            for candidate in self._ID_PATTERN.findall(txt):
                if len(candidate) == 18:
                    normalised = candidate.replace('x', "X").replace('×', "X")
                    self.res['id'] = RecItem(normalised, conf)
                    return

    def language(self):
        """Set the language field to the constant "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Extract the exam level.

        Raises:
            Exception: if no row contains "四级" or "六级".
        """
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]

            if "四级" in txt:
                self.res['level'] = RecItem("CET4", conf)
                return
            if "六级" in txt:
                self.res['level'] = RecItem("CET6", conf)
                return
        raise Exception("四六级无法识别")

    def exam_time(self):
        """Extract the exam date from the text following "时间".

        A row that contains "时间" but not two digit runs is skipped, so the
        field stays empty instead of the parse failing with ``IndexError``.
        """
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]

            if "时间" not in txt:
                continue
            try:
                value = self.to_data(txt.split("时间")[-1])
            except IndexError:
                continue
            self.res["exam_time"] = RecItem(value, conf)
            return

    def score(self):
        """Extract the total score.

        Uses the first row containing "时间" and takes the three characters
        that follow the last "月" in it.
        """
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]

            if "时间" in txt:
                self.res["score"] = RecItem(txt.split("月")[-1][:3], conf)
                return

    def to_data(self, txt):
        """Format the first two digit runs of ``txt`` as ``"<year>年<month>月"``.

        Only the last four digits of the first run are used as the year.

        Raises:
            IndexError: if ``txt`` contains fewer than two digit runs.
        """
        date_in = re.findall(r"\d+", txt)
        return f'{date_in[0][-4:]}年{date_in[1]}月'

    def parse(self):
        """Run every extractor and return ``{key: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}


class TEMParser(Parser):
    """Field extractor for TEM-4 / TEM-8 documents.

    Every extractor scans the rows built by ``Parser.__init__`` (``row[-1]``
    is the ``[text, confidence]`` summary, ``row[:-1]`` the OCR boxes) and
    stores a ``RecItem`` in ``self.res``. A field that cannot be located keeps
    its default, except ``level``, which raises.
    """

    # Dot-like characters that are normalised to the middle dot U+00B7 in
    # long names. The last entry is U+10101, which needs the 8-digit \U escape.
    _NAME_SEPARATORS = (
        "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
        "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
        "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
    )
    _SEPARATOR_TABLE = str.maketrans(dict.fromkeys(_NAME_SEPARATORS, "\u00B7"))

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Normalise ``ocr_results`` via ``Parser.__init__``."""
        Parser.__init__(self, ocr_results)

    def _find_name(self):
        """Return ``(text, confidence)`` for the first "同学" hit, else ``('', 0.)``."""
        for row in self.result:
            boxes = row[:-1]  # the last cell is the row summary, not a box
            for idx, box in enumerate(boxes):
                if '同学' not in box.txt:
                    continue
                if idx > 0:
                    previous = boxes[idx - 1]
                    return previous.txt, previous.conf
                # No preceding box in this row: use the text in front of
                # "同学" inside the same box rather than indexing row[-1],
                # which is the summary list and has no ``txt`` attribute.
                return box.txt.split('同学')[0], box.conf
        return '', 0.

    def name(self):
        """Extract the candidate's name.

        The name is the text of the box preceding the first box that contains
        "同学", reported with that box's confidence. Names of five or more
        characters have their dot-like separators normalised to U+00B7.
        """
        name_val, conf = self._find_name()
        if len(name_val) >= 5:
            # NOTE(review): Parser.__init__ filters out everything except Han
            # characters, digits and '+', so separators only survive to this
            # point if that filter is relaxed.
            name_val = name_val.translate(self._SEPARATOR_TABLE)
        # Always store the name: a long name without a separator is still a name.
        self.res["name"] = RecItem(name_val, conf)

    def id(self):
        """Set the ID field to an empty string with confidence 1."""
        self.res['id'] = RecItem("", 1.)

    def language(self):
        """Set the language field to the constant "英语" with confidence 1."""
        self.res['language'] = RecItem("英语", 1.)

    def level(self):
        """Extract the exam level.

        Raises:
            Exception: if no row matches any TEM-4 or TEM-8 marker.
        """
        # NOTE(review): Parser.__init__ strips Latin letters from the row
        # text, so the "TEM4" / "TEM8" tests cannot match as things stand;
        # only the "基础" / "高年级" markers are effective.
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]

            if "TEM4" in txt or "基础" in txt:
                self.res['level'] = RecItem("TEM4", conf)
                return
            if "TEM8" in txt or "高年级" in txt:
                self.res['level'] = RecItem("TEM8", conf)
                return
        raise Exception("专四专八无法识别")

    def exam_time(self):
        """Extract the exam date.

        The date is read from the text after the last "于" in a row, or,
        failing that, from the text before "教育部全国". A matching row that
        does not hold two digit runs is skipped, so the field stays empty
        instead of the parse failing with ``IndexError``.
        """
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]

            if "于" in txt:
                candidate = txt.split("于")[-1]
            elif '教育部全国' in txt:
                candidate = txt.split("教育部全国")[0]
            else:
                continue
            try:
                value = self.to_data(candidate)
            except IndexError:
                continue
            self.res["exam_time"] = RecItem(value, conf)
            return

    def to_data(self, txt):
        """Format the first two digit runs of ``txt`` as ``"<year>年<month>月"``.

        Only the last four digits of the first run are used as the year.

        Raises:
            IndexError: if ``txt`` contains fewer than two digit runs.
        """
        date_in = re.findall(r"\d+", txt)
        return f'{date_in[0][-4:]}年{date_in[1]}月'

    def score(self):
        """Extract the grade.

        The two characters following the last "成绩" in a row are inspected;
        either character of a grade word is enough to select it, which
        tolerates one misread character. Grades are tested in the order
        合格, 良好, 优秀.
        """
        for res in self.result:
            txt = res[-1][0]
            conf = res[-1][1]

            if "成绩" not in txt:
                continue
            grade = txt.split("成绩")[-1][:2]
            if '合' in grade or '格' in grade:
                self.res["score"] = RecItem('合格', conf)
                return
            if '良' in grade or '好' in grade:
                self.res["score"] = RecItem('良好', conf)
                return
            if '优' in grade or '秀' in grade:
                self.res["score"] = RecItem('优秀', conf)
                return

    def parse(self):
        """Run every extractor and return ``{key: {"text", "confidence"}}``."""
        self.name()
        self.id()
        self.language()
        self.level()
        self.exam_time()
        self.score()
        return {key: self.res[key].to_dict() for key in self.keys}