# Source repository: sxwl_DL / hr-ocr-idcard
							import re
import string
from dataclasses import dataclass
from collections import defaultdict
import numpy as np
import cpca


@dataclass
class RecItem:
    """One recognised field: the extracted text plus its confidence score."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a ``{"text", "confidence"}`` dictionary.

        The confidence is passed through ``np.nan_to_num`` so that a NaN
        score is reported as 0 instead of NaN.
        """
        cleaned_confidence = np.nan_to_num(self.confidence)
        return dict(text=self.text, confidence=cleaned_confidence)


class Parser(object):
    """Base class for the ID-card parsers.

    Stores the recognised text lines together with their confidences and
    prepares the result mapping with one empty ``RecItem`` per standard
    field. Subclasses fill the mapping in their own ``parse``.
    """

    def __init__(self, txts, confs):
        """Store the inputs and pre-populate the result fields.

        :param txts: recognised text lines.
        :param confs: confidence of each line; must be as long as ``txts``.
        """
        self.result, self.confs = txts, confs
        assert len(self.result) == len(self.confs), 'result and confs do not match'
        # Unknown keys still yield an empty RecItem on first access.
        self.res = defaultdict(RecItem)
        for field_name in (
            "Name",
            "IDNumber",
            "Address",
            "Gender",
            "Nationality",
            "Birth",
            "expire_date",
        ):
            self.res[field_name] = RecItem()

    def parse(self):
        """Return the result mapping unchanged; the base class extracts nothing."""
        return self.res

    @property
    def confidence(self):
        """Overall confidence; the base class always reports 0."""
        return 0.


class FrontParser(Parser):
    """Extract the fields printed on the front side of a Chinese ID card.

    The constructor normalises every OCR line (spaces and ASCII punctuation
    removed) into ``self.result``. ``parse`` then runs the individual
    extractors in a fixed order; several of them overwrite a consumed line
    with the placeholder ``"temp"`` so later extractors do not reuse it.
    """

    # 18-character citizen ID number: 17 digits plus a digit or X check char.
    _ID_NUMBER_RE = re.compile(r"\d{17}[\dXx]")
    # Name printed right after its label on a short line.
    _SHORT_NAME_RE = re.compile(r"姓名([\u4e00-\u9fa5]{1,4})")
    # Ethnicity printed right after its label.
    _NATION_RE = re.compile(r"民族([\u4e00-\u9fa5]+)")
    # Fallback guess for a bare name line.
    _NAME_GUESS_RE = re.compile(r"[\u4e00-\u9fa5]{2,4}")
    # Validity period, either "start-end" or "start-长期" (dots already removed).
    _EXPIRE_PATTERNS = (
        re.compile(r"\d{8}-\d{8}"),
        re.compile(r"\d{8}-长期"),
    )

    # Dot-like characters OCR may return for the separator inside a name.
    # The last one is U+10101, which needs the 8-digit \U escape.
    _NAME_DOTS = (
        "\u2E31\u2218\u2219\u22C5\u25E6\u2981"
        "\u00B7\u0387\u05BC\u16EB\u2022\u2027"
        "\u2E30\uFF0E\u30FB\uFF65\U00010101"
    )
    # Maps every separator variant to the canonical middle dot U+00B7.
    _NAME_DOT_TABLE = str.maketrans(_NAME_DOTS, "\u00B7" * len(_NAME_DOTS))

    # A line containing any of these characters is treated as part of the address.
    _ADDRESS_KEYWORDS = (
        "址", "省", "市", "县", "街", "乡", "村",
        "镇", "区", "城", "组", "旗", "号",
    )
    # Field labels that rule a short line out as a guessed name.
    _FIELD_LABELS = ("性别", "姓名", "民族", "住址", "出生", "号码", "身份")

    def __init__(self, txts, confs):
        """Store the inputs and build the normalised line list.

        :param txts: recognised text lines.
        :param confs: confidence of each line; must be as long as ``txts``.
        """
        Parser.__init__(self, txts, confs)
        # Keep a copy with only spaces removed: the validity period needs the
        # '-' separator, which the punctuation stripping below deletes.
        self._raw_txts = [line.replace(" ", "") for line in txts]
        strip_punctuation = str.maketrans("", "", string.punctuation)
        self.result = [line.translate(strip_punctuation) for line in self._raw_txts]

    @staticmethod
    def _drop_leading_zero(two_digits):
        """Return ``two_digits`` without a single leading ``"0"``."""
        return two_digits[1:] if two_digits.startswith("0") else two_digits

    def birth(self):
        """Derive the birth date from characters 7-14 of the ID number.

        Does nothing unless an 18-character ID number has been stored. The
        date is written as ``YYYY年M月D日`` without zero padding and carries
        the confidence of the ID number it was derived from.
        """
        id_item = self.res["IDNumber"]
        number = id_item.text
        if len(number) != 18:
            return
        month = self._drop_leading_zero(number[10:12])
        day = self._drop_leading_zero(number[12:14])
        self.res["Birth"].text = f"{number[6:10]}年{month}月{day}日"
        self.res["Birth"].confidence = id_item.confidence

    def card_no(self):
        """Find the 18-character ID number and derive the gender from it.

        The first line containing 17 digits followed by a digit or ``X``/``x``
        wins. Lines without a complete number are skipped rather than ending
        the search. The 17th character is odd for male, even for female.
        """
        for txt, conf in zip(self.result, self.confs):
            found = self._ID_NUMBER_RE.search(txt)
            if found is None:
                continue
            number = found.group()
            self.res["IDNumber"].text = number
            self.res["IDNumber"].confidence = conf
            self.res["Gender"].text = "男" if int(number[16]) % 2 else "女"
            self.res["Gender"].confidence = conf
            break

    def full_name(self):
        """Extract the holder's name from the line carrying the name label.

        Short lines (fewer than 7 characters) must have 1-4 Chinese characters
        after the label. On longer lines everything after the label is taken
        and every dot-like separator is normalised to U+00B7, keeping all
        parts of the name. A labelled line without a usable name is skipped.
        The consumed line is replaced by ``"temp"`` so it cannot be mistaken
        for an address line.
        """
        for i, (txt, conf) in enumerate(zip(self.result, self.confs)):
            if "姓名" not in txt:
                continue
            if len(txt) < 7:
                found = self._SHORT_NAME_RE.search(txt)
                name = found.group(1) if found else ""
            else:
                name = txt.partition("姓名")[2].translate(self._NAME_DOT_TABLE)
            if not name:
                # Label without a name on this line; keep looking.
                continue
            self.res["Name"].text = name
            self.res["Name"].confidence = conf
            self.result[i] = "temp"  # keep the name line out of the address
            break

    def gender(self):
        """Fallback gender lookup, used only when the ID number gave none.

        Takes the first line containing the male or the female character,
        checking male first on each line.
        """
        if self.res["Gender"].text:
            return
        for txt, conf in zip(self.result, self.confs):
            for label in ("男", "女"):
                if label in txt:
                    self.res["Gender"] = RecItem(label, conf)
                    return

    def national(self):
        """Extract the ethnicity: the Chinese characters following the label.

        Example line: ``性别女民族汉``.
        """
        for txt, conf in zip(self.result, self.confs):
            found = self._NATION_RE.search(txt)
            if found:
                self.res["Nationality"] = RecItem(found.group(1), conf)
                break

    def address(self):
        """Collect the address from every line containing an address keyword.

        Lines that start the address (address label, a province, or a city
        when the current first fragment has none) are put in front; all other
        fragments are appended in reading order. Consumed lines are replaced
        by ``"temp"``. The confidence is the mean over the contributing lines.
        """
        fragments = []
        fragment_confs = []
        for i, (line, conf) in enumerate(zip(self.result, self.confs)):
            txt = line.replace("号码", "")
            if "公民" in txt:
                # The ID-number label line is never part of the address.
                continue
            if not any(keyword in txt for keyword in self._ADDRESS_KEYWORDS):
                continue
            starts_address = (
                "址" in txt
                or "省" in txt
                or ("市" in txt and bool(fragments) and "市" not in fragments[0])
            )
            if starts_address:
                fragments.insert(0, txt.split("址")[-1])
            else:
                fragments.append(txt)
            fragment_confs.append(conf)
            self.result[i] = "temp"
        if fragments:
            self.res["Address"].text = "".join(fragments)
            self.res["Address"].confidence = np.mean(fragment_confs)

    def split_addr(self):
        """Split the collected address into province, city, region and detail.

        Uses ``cpca.transform``; the first four columns of its result are read
        positionally as province, city, region and remaining address.
        NOTE(review): this assumes the cpca column order - confirm against the
        installed cpca version. When the remaining address contains ``旗``,
        the text up to and including the first ``旗`` becomes the region.
        """
        address = self.res["Address"]
        if not address.text:
            return
        conf = address.confidence
        df = cpca.transform([address.text])
        province = df.iloc[0, 0]
        city = df.iloc[0, 1]
        region = df.iloc[0, 2]
        detail = df.iloc[0, 3]
        # Guard against a non-string (missing) detail value before searching it.
        if isinstance(detail, str) and "旗" in detail:
            banner, _, remainder = detail.partition("旗")
            region, detail = banner + "旗", remainder
        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        self.res["address_region"] = RecItem(region, conf)
        self.res["address_detail"] = RecItem(detail, conf)

    def expire_date(self):
        """Find a validity period (``YYYYMMDD-YYYYMMDD`` or ``YYYYMMDD-长期``).

        Searches the lines that still contain punctuation, because the
        normalised lines have lost the ``-`` separator. Dots are removed
        before matching.
        """
        for raw, conf in zip(self._raw_txts, self.confs):
            txt = raw.replace('.', '')
            for pattern in self._EXPIRE_PATTERNS:
                found = pattern.search(txt)
                if found:
                    self.res["expire_date"] = RecItem(found.group(), conf)
                    return

    def predict_name(self):
        """Guess the name when no labelled name line was found.

        Used when the OCR did not return the label and the name on one line.
        Takes the first line of 2-4 characters that contains no field label
        and holds 2-4 consecutive Chinese characters. This is a heuristic and
        may pick a wrong line.
        """
        if len(self.res['Name'].text) > 1:
            return
        for txt, conf in zip(self.result, self.confs):
            if not 1 < len(txt) < 5:
                continue
            if any(label in txt for label in self._FIELD_LABELS):
                continue
            found = self._NAME_GUESS_RE.search(txt)
            if found:
                self.res["Name"] = RecItem(found.group(), conf)
                break

    @property
    def confidence(self):
        """Mean confidence over all input lines."""
        return np.mean(self.confs)

    def parse(self):
        """Run all extractors and return the result mapping.

        The order matters: ``full_name`` must blank the name line before
        ``address`` runs, and ``card_no`` must run before ``birth`` and the
        ``gender`` fallback.

        :raises Exception: when no ID number could be recognised.
        """
        self.full_name()
        self.national()
        self.card_no()
        self.address()
        self.split_addr()
        self.birth()
        self.gender()
        self.expire_date()
        self.predict_name()
        if not self.res["IDNumber"].text:
            raise Exception("没有识别到身份证号")
        return self.res


class BackParser(Parser):
    """Extract the validity period printed on the back side of an ID card."""

    # Validity period after dots are removed: "start-end" or "start-长期".
    # Raw strings avoid the invalid "\d" escape of a plain string literal.
    _EXPIRE_PATTERNS = (
        re.compile(r"\d{8}-\d{8}"),
        re.compile(r"\d{8}-长期"),
    )

    def __init__(self, txts, confs):
        """Store the recognised lines and their confidences.

        :param txts: recognised text lines.
        :param confs: confidence of each line; must be as long as ``txts``.
        """
        Parser.__init__(self, txts, confs)

    def expire_date(self):
        """Store the first validity period found in the recognised lines.

        Dots and spaces are removed from each line before matching, so
        ``2010.01.01 - 2030.01.01`` is stored as ``20100101-20300101``.
        On each line the dated form is tried before the open-ended form.
        """
        for txt, conf in zip(self.result, self.confs):
            compact = txt.replace('.', '').replace(' ', '')
            for pattern in self._EXPIRE_PATTERNS:
                found = pattern.search(compact)
                if found:
                    self.res["expire_date"] = RecItem(found.group(), conf)
                    return

    @property
    def confidence(self):
        """Mean confidence over all input lines."""
        return np.mean(self.confs)

    def parse(self):
        """Extract the validity period and return the result mapping.

        :raises Exception: when no validity period could be recognised.
        """
        self.expire_date()
        if not self.res["expire_date"].text:
            raise Exception("无法识别")
        return self.res