"""Rule-based field extraction from OCR text lines of Chinese ID cards.

``FrontParser`` extracts name, ID number, gender, ethnicity, birthday and
address; ``BackParser`` extracts the validity period.  Both take parallel
sequences of recognised text lines and per-line confidences.
"""
import logging
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Sequence

import numpy as np

try:
    import cpca
except ImportError:  # pragma: no cover - depends on the runtime environment
    # Only FrontParser.split_addr uses cpca.  Keeping the module importable
    # without it lets BackParser work; split_addr raises when it is needed.
    cpca = None

logger = logging.getLogger(__name__)

# Value written over consumed lines so later heuristics ignore them.
_PLACEHOLDER = "temp"

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

_ID_WITH_X_RE = re.compile(r"\d*[Xx]")
_ID_DIGITS_RE = re.compile(r"\d{16,18}")
_SHORT_NAME_RE = re.compile(r"姓名[\u4e00-\u9fa5]{1,4}")
_ETHNICITY_RE = re.compile(r".*民族([\u4e00-\u9fa5]+)")
_GUESSED_NAME_RE = re.compile(r"[\u4e00-\u9fa5]{2,4}")
_DATE_RANGE_RE = re.compile(r"\d{8}-\d{8}")
_DATE_LONG_TERM_RE = re.compile(r"\d{8}-长期")

# Dot-like characters OCR may return for the separator inside a name.
# They are all normalised to U+00B7 (middle dot).
_NAME_SEPARATORS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
    "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
    "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
)

# A line containing any of these characters is treated as part of the address.
_ADDRESS_MARKERS = (
    "址", "省", "市", "县", "街", "乡", "村", "镇", "区", "城", "组", "旗", "号",
)

# Field labels that disqualify a short line from being guessed as the name.
_NAME_EXCLUDED_LABELS = ("性别", "姓名", "民族", "住址", "出生", "号码", "身份")


def _strip_leading_zero(two_digits: str) -> str:
    """Drop one leading ``"0"`` from a two-character month/day string."""
    return two_digits[1:] if two_digits.startswith("0") else two_digits


def _as_text(value) -> str:
    """Return *value* if it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ""


@dataclass
class RecItem:
    """One recognised field: its text and the confidence attached to it."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self) -> dict:
        """Return the item as a plain dict.

        NaN confidences become ``0.0`` (via ``np.nan_to_num``) and the value
        is converted to a built-in ``float`` so NumPy scalars such as
        ``np.float32`` do not leak into the output.
        """
        return {
            "text": self.text,
            "confidence": float(np.nan_to_num(self.confidence)),
        }


class Parser:
    """Common state and helpers shared by the front and back parsers.

    Args:
        txts: recognised text lines.
        confs: confidences, one per entry of ``txts``.

    Raises:
        AssertionError: if ``txts`` and ``confs`` differ in length.
    """

    def __init__(self, txts: Sequence[str], confs: Sequence[float]):
        self.result = txts
        self.confs = confs
        # Kept as an assert so callers still see AssertionError on mismatch.
        assert len(self.result) == len(self.confs), 'result and confs do not match'
        # Untouched copy of the input: subclasses may sanitise or overwrite
        # ``self.result``, but expire_date needs the original '-' separator.
        self.raw_result = list(txts)
        self.res = defaultdict(RecItem)
        self.keys = [
            "name", "id", "ethnicity", "gender", "birthday",
            "address", "address_province", "address_city",
            "address_region", "address_detail", "expire_date",
        ]
        for key in self.keys:
            self.res[key] = RecItem()

    def parse(self):
        """Return the (empty) result mapping; subclasses do the real work."""
        return self.res

    @property
    def confidence(self):
        """Overall confidence; the base parser always reports ``0.``."""
        return 0.

    def expire_date(self):
        """Find the validity period ``YYYYMMDD-YYYYMMDD`` or ``YYYYMMDD-长期``.

        Dots are removed from each raw line before matching.  The first line
        that matches wins; within a line the fixed-range form is tried first.
        The raw input is scanned because a sanitised line no longer contains
        the ``-`` both patterns require.
        """
        for txt, conf in zip(self.raw_result, self.confs):
            txt = txt.replace('.', '')
            for pattern in (_DATE_RANGE_RE, _DATE_LONG_TERM_RE):
                found = pattern.search(txt)
                if found:
                    self.res["expire_date"] = RecItem(found.group(0), conf)
                    return

    def _fields_as_dict(self) -> Dict[str, dict]:
        """Serialise every field listed in ``self.keys``."""
        return {key: self.res[key].to_dict() for key in self.keys}


class FrontParser(Parser):
    """Parser for the lines recognised on the front (portrait) side."""

    def __init__(self, txts: Sequence[str], confs: Sequence[float]):
        super().__init__(txts, confs)
        # Work on a copy with spaces and ASCII punctuation removed.  The
        # length is unchanged, so it still lines up with ``self.confs``.
        self.result = [
            line.replace(" ", "").translate(_PUNCT_TABLE) for line in txts
        ]

    def birth(self):
        """Derive the birthday text from characters 6-13 of an 18-char ID.

        Layout: ``342423 2001 02 15 6552`` -> ``2001年2月15日`` (one leading
        zero is dropped from month and day).  Does nothing when no 18-char
        ID has been stored.
        """
        id_text = self.res["id"].text
        if len(id_text) != 18:
            return
        year = id_text[6:10]
        month = _strip_leading_zero(id_text[10:12])
        day = _strip_leading_zero(id_text[12:14])
        self.res["birthday"].text = f"{year}年{month}月{day}日"

    def card_no(self):
        """Extract the 18-character ID number and the gender it encodes.

        The first line whose first candidate is exactly 18 characters long
        is used.  The 17th character decides the gender: odd is male, even
        is female.
        """
        for txt, conf in zip(self.result, self.confs):
            if "X" in txt or "x" in txt:
                candidates = _ID_WITH_X_RE.findall(txt)
            else:
                candidates = _ID_DIGITS_RE.findall(txt)
            if candidates and len(candidates[0]) == 18:
                number = candidates[0]
                self.res["id"].text = number
                self.res["id"].confidence = conf
                self.res["gender"].text = "男" if int(number[16]) % 2 else "女"
                self.res["gender"].confidence = conf
                break

    def full_name(self):
        """Extract the name from a line carrying the ``姓名`` label.

        Lines shorter than 7 characters are matched as label plus 1-4
        Chinese characters, and the first such match ends the search.
        Longer lines are taken as label plus a name that may contain a
        dot-like separator, which is normalised to U+00B7.  A consumed line
        is replaced by a placeholder so it cannot be mistaken for part of
        the address later.
        """
        for i, txt in enumerate(self.result):
            if "姓名" not in txt:
                continue
            if len(txt) < 7:
                found = _SHORT_NAME_RE.findall(txt)
                if found:
                    self.res["name"].text = found[0].split("姓名")[-1]
                    self.res["name"].confidence = self.confs[i]
                    self.result[i] = _PLACEHOLDER
                    break
            else:
                # Everything after the label, wherever the label sits.
                name = txt.split("姓名", 1)[1]
                for sep in _NAME_SEPARATORS:
                    if sep in name:
                        # Keep the first two parts, as the separator is
                        # expected to occur once.
                        name = "\u00B7".join(name.split(sep)[:2])
                self.res["name"].text = name
                self.res["name"].confidence = self.confs[i]
                self.result[i] = _PLACEHOLDER

    def gender(self):
        """Fallback gender detection, e.g. from a line like ``性别女民族汉``.

        Skipped when a gender is already stored.  Otherwise the first line
        containing ``男`` or ``女`` decides.
        """
        if self.res["gender"].text:
            return
        for txt, conf in zip(self.result, self.confs):
            if "男" in txt:
                self.res["gender"] = RecItem("男", conf)
                break
            if "女" in txt:
                self.res["gender"] = RecItem("女", conf)
                break

    def national(self):
        """Extract the ethnicity: the Chinese characters following ``民族``."""
        for txt, conf in zip(self.result, self.confs):
            found = _ETHNICITY_RE.search(txt)
            if found:
                # Capture after the label rather than splitting on "族", so
                # a value that itself ends in "族" is not reduced to "".
                self.res["ethnicity"] = RecItem(found.group(1), conf)
                break

    def address(self):
        """Assemble the address from every line containing an address marker.

        Lines that look like the start of the address (contain ``址`` or
        ``省``, or contain ``市`` while the current first part does not) are
        put in front; all others are appended in reading order.  Lines with
        ``公民`` (the ID-number label) are ignored.  Consumed lines are
        replaced by a placeholder.  The stored confidence is the mean of the
        contributing lines.
        """
        parts: List[str] = []
        confs: List[float] = []
        for i, raw in enumerate(self.result):
            txt = raw.replace("号码", "")
            if "公民" in txt:
                txt = _PLACEHOLDER
            if not any(marker in txt for marker in _ADDRESS_MARKERS):
                continue
            starts_address = (
                "址" in txt
                or "省" in txt
                or ("市" in txt and parts and "市" not in parts[0])
            )
            if starts_address:
                parts.insert(0, txt.split("址")[-1])
            else:
                parts.append(txt)
            confs.append(self.confs[i])
            self.result[i] = _PLACEHOLDER
        if parts:
            self.res["address"].text = "".join(parts)
            self.res["address"].confidence = np.mean(confs)

    def split_addr(self):
        """Split the stored address into province, city, region and detail.

        Uses ``cpca.transform``; the first four columns of its first row are
        read as province, city, region and detail.  Non-string cells are
        stored as ``""``.  When the detail contains ``旗``, the region
        becomes the text up to and including the first ``旗`` and the detail
        the text after the last one.  Does nothing when no address is stored.

        Raises:
            ImportError: if an address is present but ``cpca`` is missing.
        """
        address = self.res["address"]
        if not address.text:
            return
        if cpca is None:
            raise ImportError("cpca is required to split the address")
        conf = address.confidence
        df = cpca.transform([address.text])
        province, city, region, detail = (
            _as_text(df.iloc[0, col]) for col in range(4)
        )
        logger.debug(
            "province: %s, city: %s, region: %s, detail: %s",
            province, city, region, detail,
        )
        if "旗" in detail:
            pieces = detail.split("旗")
            region = pieces[0] + "旗"
            detail = pieces[-1]
        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        self.res["address_region"] = RecItem(region, conf)
        self.res["address_detail"] = RecItem(detail, conf)

    def predict_name(self):
        """Guess the name when no ``姓名``-labelled line produced one.

        Takes the first 2-4 character line that carries none of the known
        field labels and contains a run of 2-4 Chinese characters.  This is
        a heuristic and may pick up unrelated short lines.
        """
        if len(self.res['name'].text) > 1:
            return
        for txt, conf in zip(self.result, self.confs):
            if not 1 < len(txt) < 5:
                continue
            if any(label in txt for label in _NAME_EXCLUDED_LABELS):
                continue
            found = _GUESSED_NAME_RE.findall(txt)
            if found:
                self.res["name"] = RecItem(found[0], conf)
                break

    @property
    def confidence(self):
        """Mean of all per-line confidences."""
        return np.mean(self.confs)

    def parse(self) -> Dict[str, dict]:
        """Run every extraction step and return ``{field: RecItem dict}``.

        The order matters: the name and ID steps run before the address so
        their lines are not taken as address parts, and the birthday needs
        the ID number.

        Raises:
            Exception: if no ID number could be recognised.
        """
        self.full_name()
        self.national()
        self.card_no()
        self.address()
        self.split_addr()
        self.birth()
        self.gender()
        self.expire_date()
        self.predict_name()
        if not self.res["id"].text:
            raise Exception("没有识别到身份证号")
        return self._fields_as_dict()


class BackParser(Parser):
    """Parser for the back side: only the validity period is extracted."""

    def __init__(self, txts: Sequence[str], confs: Sequence[float]):
        super().__init__(txts, confs)

    @property
    def confidence(self):
        """Mean of all per-line confidences."""
        return np.mean(self.confs)

    def parse(self) -> Dict[str, dict]:
        """Extract the validity period and return ``{field: RecItem dict}``.

        Raises:
            Exception: if no validity period could be recognised.
        """
        self.expire_date()
        if not self.res["expire_date"].text:
            raise Exception("无法识别")
        return self._fields_as_dict()