# sxwl_DL / hr-ocr-idcard
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult


@dataclass
class RecItem:
    """A recognised field value together with its confidence score."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a plain ``{"text", "confidence"}`` dict.

        The text is stripped of surrounding whitespace.  The confidence is
        passed through ``np.nan_to_num`` (NaN becomes 0.0) and then coerced
        to a built-in ``float`` so that NumPy scalar types such as
        ``np.float32`` -- which ``json`` cannot serialise -- never leak out.
        """
        return {
            "text": self.text.strip(),
            "confidence": float(np.nan_to_num(self.confidence)),
        }


class Parser:
    """Common state shared by the ID-card side parsers.

    Attributes set by ``__init__``:
        result: the OCR rows handed in by the caller, stored as given.
        keys:   names of every field a parser may fill in.
        res:    mapping from field name to ``RecItem``; every name in
                ``keys`` starts out with an empty ``RecItem``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        self.result = ocr_results
        self.keys = ["name", "id", "ethnicity", "gender", "birthday",
                     "address", "address_province", "address_city", "address_region", "address_detail", "expire_date"]
        # Pre-populate every known field so lookups never rely on the
        # defaultdict factory for the documented keys.
        self.res = defaultdict(RecItem, ((field, RecItem()) for field in self.keys))

    def parse(self):
        """Return the field mapping unchanged; subclasses do the real work."""
        return self.res


class FrontParser(Parser):
    """Extract the personal fields from the OCR rows of an ID card.

    ``parse`` runs the steps in a fixed order: ``card_no`` locates the ID
    number and trims ``self.result`` to the rows that precede it, after
    which row 0 is read as the name row, row 1 as the gender/ethnicity row,
    row 2 as the birth-date row and rows 2+ as address candidates.
    """

    _log = logging.getLogger(__name__)

    # Candidate ID number: a run of digits optionally followed by the check
    # letter as OCR may render it (X, x or the multiplication sign).
    _ID_CANDIDATE = re.compile(r'\d{10,18}[Xx×]*')
    # A usable ID number: 17 digits plus a digit or 'X' check character.
    _ID_COMPLETE = re.compile(r'\d{17}[\dX]')
    # "<yyyy><hanzi><m><hanzi><d>" as printed on the birth-date row.
    _BIRTH = re.compile(r'.*(\d{4})[\u4E00-\u9FA5]+(\d{1,2})[\u4E00-\u9FA5]+(\d{1,2})')
    _HANZI = re.compile(r'[\u4E00-\u9FA5]+')
    # Leading name label (possibly repeated or partly misread) followed by
    # the name itself, captured in group 1.
    _NAME_LABEL = re.compile(r'.*?(?:姓\s*名|姓.|名)+(.*)')

    # Dot-like characters that may separate the parts of a long name.
    _NAME_DOTS = ("\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
                  "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
                  "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101")
    _DOT_TABLE = str.maketrans(dict.fromkeys(_NAME_DOTS, '\u00B7'))
    _PUNCT_TABLE = str.maketrans('', '', punctuation)

    # Known OCR misreadings in addresses and their corrections.
    _ADDRESS_FIXES = (('呼呼', '呼'), ('霸桥', '灞桥'),
                      ('漳尔市', '淖尔市'), ('屹旦', '圪旦'), ('营家村', '菅家村'),
                      ('四四川', '四川'), ('止口', ''), ('装柏村', '裴柏村'),
                      ('安安徽', '安徽'), ('吃梁村', '圪梁村'), ('中熬本台', '中敖本台'))

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)
        self.id_ok = True

    def _row_text(self, idx):
        """Return ``(joined text, mean confidence)`` for row *idx*.

        A missing or empty row yields ``('', 0.0)`` instead of raising
        ``IndexError`` or averaging an empty list.
        """
        if idx >= len(self.result) or not self.result[idx]:
            return '', 0.0
        row = self.result[idx]
        return ''.join(r.txt for r in row), np.mean([r.conf for r in row])

    def birth(self):
        """Fill in the birthday, and the gender when no full ID was found.

        With an 18-character ID the birthday is taken from characters 6-13
        of the ID.  Otherwise the birthday is parsed from row 2 and the
        gender from row 1 ('男' if that character is present, else '女').
        """
        id_text = self.res["id"].text
        if len(id_text) == 18:
            # 342423 2001  0  2    1  5    6552
            # 012345 6789  10 11   12 13   14
            date = id_text[6:10] + "年" + id_text[10:12] + "月" + id_text[12:14] + "日"
            self.res["birthday"] = RecItem(date, self.res['id'].confidence)
            return

        # Birth date row.
        txt, conf = self._row_text(2)
        found = self._BIRTH.match(txt)
        if found:
            year, month, day = found.groups()
            self.res['birthday'] = RecItem(f'{year}年{month}月{day}日', conf)

        # Gender row; skipped entirely when the row is missing or empty.
        txt, conf = self._row_text(1)
        if txt:
            self.res['gender'] = RecItem('男' if '男' in txt else '女', conf)

    def card_no(self):
        """Locate the ID number and trim ``self.result`` around it.

        The first OCR fragment containing a candidate number decides which
        rows are kept: if it sits in row 0 or 1 the rows after it are kept
        in reversed order (presumably the card was scanned upside down --
        TODO confirm); otherwise the rows before it are kept.  The ID and
        the gender derived from its 17th digit (odd -> '男') are stored only
        when the candidate is a complete 18-character number.
        """
        for idx, row in enumerate(self.result):
            for r in row:
                found = self._ID_CANDIDATE.findall(r.txt)
                if not found:
                    continue

                if idx < 2:
                    self.result = self.result[idx + 1:]
                    self.result.reverse()
                else:
                    self.result = self.result[:idx]
                if self._log.isEnabledFor(logging.DEBUG):
                    self._log.debug('rows kept after locating the ID number: %s',
                                    [''.join(item.txt for item in kept) for kept in self.result])

                candidate = found[0].replace('×', 'X').replace('x', 'X')
                # Validate the shape before indexing the 17th character so
                # a malformed match cannot reach int() with a letter.
                if self._ID_COMPLETE.fullmatch(candidate):
                    self.res["id"].text = candidate
                    self.res["id"].confidence = r.conf
                    self.res["gender"].text = "男" if int(candidate[16]) % 2 else "女"
                    self.res["gender"].confidence = r.conf
                return

    @staticmethod
    def extract_zhon(txt):
        """Return the first run of CJK ideographs in *txt*, or ``None``."""
        found = FrontParser._HANZI.search(txt)
        return found.group() if found else None

    def _set_name(self, name_val, conf):
        """Store *name_val* as the name.

        Values shorter than five characters are stored as they are.  Longer
        values are stored only when they contain a separator dot; every dot
        variant is then normalised to U+00B7 and all segments are kept.
        A long value without any dot is left unset.
        """
        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return
        normalised = name_val.translate(self._DOT_TABLE)
        if '\u00B7' in normalised:
            self.res['name'] = RecItem(normalised, conf)

    def name(self):
        """Read the name from row 0.

        When the row consists of several fragments, every fragment that
        contains '姓' or '名' is rewritten to the label '姓名'.  The name is
        whatever follows the leading label(s), so a name that itself
        contains '名' is no longer cut at that character.
        """
        if not self.result or not self.result[0]:
            return
        row = self.result[0]
        if len(row) > 1:
            for r in row:
                if '姓' in r.txt or '名' in r.txt:
                    r.txt = '姓名'
        txt, conf = self._row_text(0)
        found = self._NAME_LABEL.match(txt)
        if found:
            self._set_name(found.group(1), conf)

    def national(self):
        """Read the ethnicity from row 1: the text after '民?' or after '族'."""
        txt, conf = self._row_text(1)
        for pattern in (r'.*民.', r'.*族'):
            parts = re.split(pattern, txt)
            if len(parts) == 2:
                self.res['ethnicity'] = RecItem(parts[-1], conf)
                return

    def address(self):
        """Assemble the address from rows 2 onward and split it into parts.

        Fragments containing '性别', '出生', '民族' or '年' are skipped and
        Chinese punctuation is removed.  Everything up to the last '址' is
        dropped, known OCR misreadings are corrected, and ``split_addr``
        then fills the province/city/region/detail fields.

        Raises:
            Exception: with message '无法识别' when no fragment qualifies.
        """
        parts = []
        confs = []
        for row in self.result[2:]:
            for r in row:
                txt = r.txt
                if any(label in txt for label in ('性别', '出生', '民族', '年')):
                    continue
                txt = txt.translate(self._PUNCT_TABLE)
                # Only the first kept fragment may still carry the label.
                if "址" in txt and not parts:
                    txt = txt.split("址")[-1]
                parts.append(txt)
                confs.append(r.conf)

        if not parts:
            raise Exception('无法识别')

        txt = "".join(parts).split("址")[-1]
        for wrong, right in self._ADDRESS_FIXES:
            txt = txt.replace(wrong, right)

        self.res["address"] = RecItem(txt, np.mean(confs))
        self.split_addr()

    @staticmethod
    def _as_text(value):
        """Return *value* if it is a string, else '' (covers None and NaN)."""
        return value if isinstance(value, str) else ''

    def split_addr(self):
        """Split the stored address into province, city, region and detail.

        The first four columns of the ``cpca.transform`` result are used;
        any cell that is not a string is treated as empty so no field ever
        holds ``None``.  If the detail contains '旗', the text up to and
        including the first '旗' replaces the region and the remainder
        becomes the detail.
        """
        address = self.res["address"]
        conf = address.confidence
        df = cpca.transform([address.text])
        province, city, region, detail = (self._as_text(df.iloc[0, col]) for col in range(4))
        self._log.debug('province: %s, city: %s, region: %s, detail: %s',
                        province, city, region, detail)

        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        if "旗" in detail:
            # partition keeps everything after the first '旗', so later
            # occurrences stay in the detail.
            banner, _, rest = detail.partition("旗")
            region, detail = banner + "旗", rest
        self.res["address_region"] = RecItem(region, conf)
        self.res["address_detail"] = RecItem(detail, conf)

    def parse(self):
        """Run every extraction step and return ``{field: item.to_dict()}``."""
        self.card_no()
        self.name()
        self.national()
        self.birth()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}


class BackParser(Parser):
    """Extract the validity period from the OCR rows of an ID card."""

    # Patterns are applied after every '.' has been removed from the text.
    # Raw strings avoid the invalid-escape warnings of the old literals.
    _DATED = re.compile(r'\d{8}-\d{4}')
    _PERMANENT = re.compile(r'\d{8}-长期')

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def expire_date(self):
        """Store the validity period found in the first matching fragment.

        Two shapes are recognised: ``YYYYMMDD-YYYY...`` and
        ``YYYYMMDD-长期``.  For the dated shape only the end year is read
        from the text; the end month and day are copied from the start
        date, giving ``YYYYMMDD-YYYYMMDD``.

        Raises:
            Exception: with message '无法识别' when no fragment matches.
        """
        for row in self.result:
            for r in row:
                txt = r.txt.replace('.', '')
                dated = self._DATED.search(txt)
                if dated:
                    span = dated.group()
                    self.res["expire_date"] = RecItem(span + span[4:8], r.conf)
                    return
                permanent = self._PERMANENT.search(txt)
                if permanent:
                    self.res["expire_date"] = RecItem(permanent.group(), r.conf)
                    return
        raise Exception('无法识别')

    def parse(self):
        """Run the extraction and return ``{field: item.to_dict()}``.

        Raises:
            Exception: with message '无法识别' when no validity period
                could be read.
        """
        self.expire_date()
        if not self.res["expire_date"].text:
            raise Exception("无法识别")
        return {key: self.res[key].to_dict() for key in self.keys}