import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult

logger = logging.getLogger(__name__)

# A run of 10-18 digits optionally followed by check-character look-alikes.
# Deliberately loose: any such run marks the row that holds the ID number,
# even when the number itself turns out to be unusable.
_ID_CANDIDATE_RE = re.compile(r"\d{10,18}[Xx×]*")
# Strict shape of a usable 18-character ID number (after X normalisation).
_ID_VALID_RE = re.compile(r"\d{17}[\dX]")
# "<year><hanzi><month><hanzi><day>", e.g. 2001年2月15日.
_BIRTH_RE = re.compile(
    r".*(\d{4})[\u4E00-\u9FA5]+(\d{1,2})[\u4E00-\u9FA5]+(\d{1,2})"
)
_HANZI_RUN_RE = re.compile(r"[\u4E00-\u9FA5]+")
_EXPIRE_RANGE_RE = re.compile(r"\d{8}-\d{4}")
_EXPIRE_LONG_TERM_RE = re.compile(r"\d{8}-长期")

# Dot-like characters that may appear between the parts of a long name.
# The last entry is U+10101; it needs the 8-digit \U escape because
# "\u10101" would be parsed as U+1010 followed by the character "1".
_NAME_SEPARATORS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
    "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
    "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
)

# Deletes every character of zhon's ``punctuation`` in a single pass.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)

# Known OCR misreadings in addresses, applied in order as (wrong, right).
_ADDRESS_FIXES = (
    ('呼呼', '呼'),
    ('霸桥', '灞桥'),
    ('漳尔市', '淖尔市'),
    ('屹旦', '圪旦'),
    ('营家村', '菅家村'),
    ('四四川', '四川'),
    ('止口', ''),
    ('装柏村', '裴柏村'),
    ('安安徽', '安徽'),
    ('吃梁村', '圪梁村'),
    ('中熬本台', '中敖本台'),
)


@dataclass
class RecItem:
    """One recognised field: its text and the confidence attached to it."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return ``{"text", "confidence"}`` with text stripped and NaN zeroed."""
        return {
            "text": self.text.strip(),
            "confidence": np.nan_to_num(self.confidence),
        }


class Parser(object):
    """Base class holding the OCR rows and one ``RecItem`` per output key."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Store ``ocr_results`` and pre-populate every key with an empty item."""
        self.result = ocr_results
        self.res = defaultdict(RecItem)
        self.keys = [
            "name", "id", "ethnicity", "gender", "birthday", "address",
            "address_province", "address_city", "address_region",
            "address_detail", "expire_date",
        ]
        for key in self.keys:
            self.res[key] = RecItem()

    def parse(self):
        """Return the raw key -> ``RecItem`` mapping."""
        return self.res

    def _row_text_and_conf(self, idx):
        """Return the joined text and the mean confidence of row ``idx``.

        Raises ``IndexError`` when the row does not exist.
        """
        row = self.result[idx]
        text = ''.join(cell.txt for cell in row)
        return text, np.mean([cell.conf for cell in row])


class FrontParser(Parser):
    """Parser for the front side: ID number, name, ethnicity, birth, address."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)
        self.id_ok = True

    def birth(self):
        """Fill ``birthday`` (and, in the fallback path, ``gender``).

        With an 18-character ID already stored, the birthday is sliced out of
        it (characters 6-13 are YYYYMMDD). Otherwise the birthday is parsed
        from row 2 and the gender from row 1 of the OCR text.
        """
        id_text = self.res["id"].text
        if len(id_text) == 18:
            # 342423 2001 02 15 6552
            # 0....5 6..9 10 12 14..
            date = f"{id_text[6:10]}年{id_text[10:12]}月{id_text[12:14]}日"
            self.res["birthday"] = RecItem(date, self.res['id'].confidence)
            return

        # Birth date from the printed text.
        txt, conf = self._row_text_and_conf(2)
        found = _BIRTH_RE.match(txt)
        if found:
            year, month, day = found.groups()
            self.res['birthday'] = RecItem(f'{year}年{month}月{day}日', conf)

        # Gender from the printed text; anything that is not 男 counts as 女.
        txt, conf = self._row_text_and_conf(1)
        self.res['gender'] = RecItem('男' if '男' in txt else '女', conf)

    def card_no(self):
        """Locate the ID number and trim ``self.result`` to the other rows.

        The first cell containing a 10-18 digit run marks the ID row. When it
        is one of the first two rows, the rows after it are kept in reversed
        order (presumably an upside-down card - TODO confirm); otherwise the
        rows before it are kept. ``id`` and ``gender`` are only filled when
        the candidate is a well-formed 18-character number; the gender comes
        from the parity of the 17th character.
        """
        for idx, row in enumerate(self.result):
            for cell in row:
                candidates = _ID_CANDIDATE_RE.findall(cell.txt)
                logger.debug("id candidates: %s", candidates)
                if not candidates:
                    continue

                if idx < 2:
                    self.result = self.result[idx + 1:]
                    self.result.reverse()
                else:
                    self.result = self.result[:idx]
                if logger.isEnabledFor(logging.DEBUG):
                    for remaining in self.result:
                        logger.debug(
                            "row kept after id split: %s",
                            ''.join(c.txt for c in remaining),
                        )

                id_text = candidates[0].replace('×', 'X').replace('x', 'X')
                # fullmatch guarantees that position 16 is a digit, so the
                # int() below cannot fail on input such as 16 digits + "XX".
                if _ID_VALID_RE.fullmatch(id_text):
                    self.res["id"].text = id_text
                    self.res["id"].confidence = cell.conf
                    self.res["gender"].text = (
                        "男" if int(id_text[16]) % 2 else "女"
                    )
                    self.res["gender"].confidence = cell.conf
                return
        # raise Exception('无法识别')

    @staticmethod
    def extract_zhon(txt):
        """Return the first run of Chinese characters in ``txt``, or ``None``."""
        runs = _HANZI_RUN_RE.findall(txt)
        return runs[0] if runs else None

    def _set_name(self, name_val, conf):
        """Store ``name_val`` as the name.

        Values shorter than five characters are stored as-is. Longer values
        are stored only when they contain one of ``_NAME_SEPARATORS``, in
        which case that separator is normalised to U+00B7 and every part of
        the name is kept. Longer values without a separator are ignored.
        """
        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return
        for sep in _NAME_SEPARATORS:
            if sep in name_val:
                parts = name_val.split(sep)
                self.res['name'] = RecItem('\u00B7'.join(parts), conf)
                return

    def name(self):
        """Fill ``name`` from row 0 of the (already trimmed) OCR rows."""
        first_row = self.result[0]
        if len(first_row) > 1:
            # A cell holding only part of the label is normalised to the
            # full label so the split patterns below can find it.
            for cell in first_row:
                if '姓' in cell.txt or '名' in cell.txt:
                    cell.txt = '姓名'
        txt, conf = self._row_text_and_conf(0)

        # Both patterns are tried; when both match, the second one wins.
        for label_pattern in ('.*姓.', '.*名'):
            pieces = re.split(label_pattern, txt)
            if len(pieces) == 2:
                self._set_name(pieces[-1], conf)

    def national(self):
        """Fill ``ethnicity`` with the text following the label in row 1."""
        txt, conf = self._row_text_and_conf(1)
        for label_pattern in ('.*民.', '.*族'):
            pieces = re.split(label_pattern, txt)
            if len(pieces) == 2:
                self.res['ethnicity'] = RecItem(pieces[-1], conf)
                return

    def address(self):
        """Fill ``address`` from rows 2 onward, then split it into parts.

        Cells containing the gender / birth / ethnicity labels or ``年`` are
        skipped, punctuation is removed, and known OCR misreadings are
        corrected.

        Raises:
            Exception: when no address text is left.
        """
        fragments = []
        confs = []
        for row in self.result[2:]:
            for cell in row:
                txt = cell.txt
                if any(mark in txt for mark in ('性别', '出生', '民族', '年')):
                    continue
                txt = txt.translate(_PUNCTUATION_TABLE)
                if "址" in txt and not fragments:
                    # Drop the label from the first address fragment.
                    txt = txt.split("址")[-1]
                fragments.append(txt)
                confs.append(cell.conf)

        if not fragments:
            raise Exception('无法识别')

        txt = "".join(fragments).split("址")[-1]
        for wrong, right in _ADDRESS_FIXES:
            txt = txt.replace(wrong, right)
        self.res["address"] = RecItem(txt, np.mean(confs))
        self.split_addr()

    def split_addr(self):
        """Split ``address`` into province / city / region / detail via cpca."""
        address = self.res["address"]
        logger.debug("address: %s", address.text)
        conf = address.confidence

        df = cpca.transform([address.text])
        df = df.replace([None], [''], regex=True)
        # Every column is read after the replacement above, so a missing
        # province is not passed on as None.
        province = df.iloc[0, 0]
        city = df.iloc[0, 1]
        region = df.iloc[0, 2]
        detail = df.iloc[0, 3]
        logger.debug(
            "province: %s, city: %s, region: %s, detail: %s",
            province, city, region, detail,
        )

        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        if detail and "旗" in detail:
            # Text up to the first 旗 becomes the region; everything after
            # it stays in the detail, including any later 旗.
            head, _, tail = detail.partition("旗")
            self.res["address_region"] = RecItem(head + "旗", conf)
            self.res["address_detail"] = RecItem(tail, conf)
        else:
            self.res["address_region"] = RecItem(region, conf)
            self.res["address_detail"] = RecItem(detail, conf)

    def parse(self):
        """Run every front-side extractor and return ``{key: item dict}``."""
        self.card_no()
        self.name()
        self.national()
        self.birth()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}


class BackParser(Parser):
    """Parser for the back side: validity period."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def expire_date(self):
        """Fill ``expire_date`` from the first cell holding a validity period.

        Dots are removed first. ``YYYYMMDD-YYYY...`` is stored as the start
        date, a dash, the end year, and the start date's month and day;
        ``YYYYMMDD-长期`` is stored verbatim.

        Raises:
            Exception: when no cell matches either form.
        """
        for row in self.result:
            for cell in row:
                txt = cell.txt.replace('.', '')
                ranges = _EXPIRE_RANGE_RE.findall(txt)
                if ranges:
                    first = ranges[0]
                    self.res["expire_date"] = RecItem(
                        first + first[4:8], cell.conf
                    )
                    return
                long_term = _EXPIRE_LONG_TERM_RE.findall(txt)
                if long_term:
                    self.res["expire_date"] = RecItem(long_term[0], cell.conf)
                    return
        raise Exception('无法识别')

    def parse(self):
        """Extract the validity period and return ``{key: item dict}``.

        Raises:
            Exception: when the validity period could not be recognised.
        """
        self.expire_date()
        if not self.res["expire_date"].text:
            raise Exception("无法识别")
        return {key: self.res[key].to_dict() for key in self.keys}