"""Field parsers for OCR results of ID cards (front and back side).

Each parser receives the OCR output as a list of rows, every row being a list
of ``OcrResult`` boxes (the code reads their ``txt``, ``conf`` and ``center``
attributes), and fills a mapping of field name -> ``RecItem``.
"""
import logging
import math
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult

logger = logging.getLogger(__name__)

# Message carried by every recognition failure (kept verbatim for callers).
_UNRECOGNIZED = '无法识别'

# Dot-like characters that OCR may produce in place of the middle dot used
# inside long names; all of them are normalized to U+00B7.
_NAME_SEPARATORS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981", "\u00B7",
    "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027", "\u2E30", "\uFF0E",
    "\u30FB", "\uFF65",
    # Needs the 8-digit escape: "\u10101" would be U+1010 followed by "1".
    "\U00010101",
)
_NAME_SEPARATOR_RE = re.compile(
    "[" + re.escape("".join(_NAME_SEPARATORS)) + "]"
)
_MIDDLE_DOT = "\u00B7"

# A text box is treated as part of the address when it contains one of these.
_ADDRESS_KEYWORDS = (
    "址", "省", "市", "县", "街", "乡", "村", "镇",
    "区", "城", "组", "旗", "号", "户", "室",
)
# Boxes containing one of these labels are never part of the address.
_NON_ADDRESS_LABELS = ('性别', '出生', '民族')
# Translation table that deletes every CJK punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

_ID_WITH_X_RE = re.compile(r"\d*[Xx]")
_ID_DIGITS_RE = re.compile(r"\d{16,18}")
_EXPIRE_RANGE_RE = re.compile(r'\d{8}-\d{4}')
_EXPIRE_LONG_TERM_RE = re.compile(r'\d{8}-长期')


class ParseError(Exception):
    """Raised when a required field cannot be located in the OCR output."""


def _nearest_neighbor(row, index):
    """Return the box in *row* whose center is closest to ``row[index]``.

    Returns ``None`` when the row holds no other box. On equal distances the
    earliest box in the row wins.
    """
    anchor = np.array(row[index].center)
    best, best_dist = None, math.inf
    for k, other in enumerate(row):
        if k == index:
            continue
        delta = anchor - np.array(other.center)
        dist = math.hypot(delta[0], delta[1])
        if dist < best_dist:
            best, best_dist = other, dist
    return best


def _text_or_empty(value):
    """Return *value* if it is a string, else ``''`` (e.g. for ``None``)."""
    return value if isinstance(value, str) else ''


def _normalize_name(raw):
    """Normalize the separator dots of a long name to U+00B7.

    Names shorter than five characters are returned untouched. Longer names
    are split on any known dot-like character and re-joined with U+00B7; a
    long name without any separator is returned as recognized.
    """
    if len(raw) < 5:
        return raw
    return _MIDDLE_DOT.join(_NAME_SEPARATOR_RE.split(raw))


@dataclass
class RecItem:
    """One recognized field: its text and the OCR confidence."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return ``{"text", "confidence"}`` with stripped text.

        NaN confidences are mapped by ``np.nan_to_num`` and the result is
        converted to a plain ``float`` so it serializes regardless of the
        numpy dtype the OCR engine produced.
        """
        return {
            "text": self.text.strip(),
            "confidence": float(np.nan_to_num(self.confidence)),
        }


class Parser(object):
    """Base class holding the OCR rows and the table of result fields."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        self.result = ocr_results
        self.res = defaultdict(RecItem)
        self.keys = ["name", "id", "ethnicity", "gender", "birthday",
                     "address", "address_province", "address_city",
                     "address_region", "address_detail", "expire_date"]
        # Pre-populate every key so the output always has the same shape.
        for key in self.keys:
            self.res[key] = RecItem()

    def parse(self):
        """Return the (still empty) result table; subclasses override this."""
        return self.res


class FrontParser(Parser):
    """Parser for the front side: id, name, ethnicity, birthday, address."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def birth(self):
        """Derive the birthday from characters 6..13 of an 18-char ID number.

        Does nothing when the stored ID number is not 18 characters long.
        """
        id_text = self.res["id"].text
        if len(id_text) != 18:
            return
        # 342423 2001 02 15 6552 -> [6:10] year, [10:12] month, [12:14] day
        date = f"{id_text[6:10]}年{id_text[10:12]}月{id_text[12:14]}日"
        self.res["birthday"] = RecItem(date, self.res["id"].confidence)

    def card_no(self):
        """Locate the ID card number and derive the gender from it.

        On success ``self.result`` is narrowed to the rows holding the other
        fields: when the number is found in one of the first two rows, the
        rows after it are kept in reverse order; otherwise the rows before it
        are kept.

        Raises:
            ParseError: if no 18-character ID number is found.
        """
        for idx, row in enumerate(self.result):
            for box in row:
                txt = box.txt
                if "X" in txt or "x" in txt:
                    candidates = _ID_WITH_X_RE.findall(txt)
                else:
                    candidates = _ID_DIGITS_RE.findall(txt)
                id_no = next((c for c in candidates if len(c) == 18), None)
                if id_no is None:
                    continue

                self.res["id"].text = id_no
                self.res["id"].confidence = box.conf
                # 17th digit: odd -> male, even -> female.
                self.res["gender"].text = "男" if int(id_no[16]) % 2 else "女"
                self.res["gender"].confidence = box.conf

                if idx < 2:
                    self.result = self.result[idx + 1:]
                    self.result.reverse()
                else:
                    self.result = self.result[:idx]
                # Debug level only: these rows contain personal data.
                logger.debug("rows kept after locating the ID number: %s",
                             self.result)
                return
        raise ParseError(_UNRECOGNIZED)

    def name(self):
        """Extract the holder's name.

        The first box containing '姓' or '名' decides the result:

        * if its row holds other boxes, the nearest one is the name;
        * otherwise, if the box text is longer than three characters, the
          text after "姓名" is the name (label and value in a single box).

        The stored confidence is that of the box supplying the name text.

        Raises:
            ParseError: if no name can be located.
        """
        for row in self.result:
            for j, box in enumerate(row):
                txt = box.txt
                if '姓' not in txt and '名' not in txt:
                    continue
                if len(row) > 1:
                    nearest = _nearest_neighbor(row, j)
                    if nearest is None:
                        continue
                    self.res["name"] = RecItem(_normalize_name(nearest.txt),
                                               nearest.conf)
                    return
                if len(txt) > 3:
                    merged = txt.split("姓名")[-1]
                    self.res["name"] = RecItem(_normalize_name(merged),
                                               box.conf)
                    return
        raise ParseError(_UNRECOGNIZED)

    def national(self):
        """Extract the ethnicity.

        The first usable box containing '族' decides the result:

        * a short box (fewer than three characters) is a bare label, so the
          nearest other box in the row supplies the value; a label standing
          alone in its row is skipped;
        * a longer box holds label and value together, so the text after the
          last '族' is the value.

        Leaves the field empty when nothing matches.
        """
        for row in self.result:
            for j, box in enumerate(row):
                txt = box.txt
                if '族' not in txt:
                    continue
                if len(txt) < 3:
                    # Label and value in separate boxes.
                    nearest = _nearest_neighbor(row, j)
                    if nearest is None:
                        continue
                    self.res["ethnicity"] = RecItem(nearest.txt, nearest.conf)
                    return
                # Label and value in one box.
                self.res["ethnicity"] = RecItem(txt.split("族")[-1], box.conf)
                return

    def address(self):
        """Assemble the address from the rows after the first two.

        Boxes carrying a gender / birth / ethnicity label are ignored. CJK
        punctuation is removed, and every remaining box that contains an
        address keyword contributes its text (only the part after the last
        '址' when that label is present). The confidence is the mean of the
        contributing boxes. On success ``split_addr`` is called.

        Raises:
            ParseError: if no address text is found, or from ``split_addr``.
        """
        parts = []
        confs = []
        for row in self.result[2:]:
            for box in row:
                txt = box.txt
                if any(label in txt for label in _NON_ADDRESS_LABELS):
                    continue
                txt = txt.translate(_PUNCTUATION_TABLE)
                if not any(key in txt for key in _ADDRESS_KEYWORDS):
                    continue
                parts.append(txt.split("址")[-1] if "址" in txt else txt)
                confs.append(box.conf)

        if not parts:
            raise ParseError(_UNRECOGNIZED)
        self.res["address"] = RecItem("".join(parts), float(np.mean(confs)))
        self.split_addr()

    def split_addr(self):
        """Split the stored address into province / city / region / detail.

        Uses ``cpca.transform`` and reads the first four columns of its first
        row. Fields cpca could not resolve are stored as empty strings. When
        cpca found no region and the detail contains '旗', the text up to and
        including the first '旗' becomes the region and the rest the detail.

        Raises:
            ParseError: if the region or the detail ends up empty.
        """
        address = self.res["address"]
        conf = address.confidence
        frame = cpca.transform([address.text])
        province, city, region, detail = (
            _text_or_empty(frame.iloc[0, col]) for col in range(4)
        )
        # Debug level only: the address is personal data.
        logger.debug("province: %s, city: %s, region: %s, detail: %s",
                     province, city, region, detail)

        # Only fall back to the '旗' split when cpca found no region;
        # otherwise a detail such as "红旗路..." would overwrite a correct one.
        if not region and "旗" in detail:
            head, _, tail = detail.partition("旗")
            region, detail = head + "旗", tail

        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        self.res["address_region"] = RecItem(region, conf)
        self.res["address_detail"] = RecItem(detail, conf)

        if not region or not detail:
            raise ParseError(_UNRECOGNIZED)

    def parse(self):
        """Run every front-side extractor and return ``{key: item dict}``.

        ``card_no`` must run first because it narrows ``self.result`` for the
        other extractors, and ``birth`` depends on the ID number.

        Raises:
            ParseError: if the ID number, name or address cannot be parsed.
        """
        self.card_no()
        self.name()
        self.national()
        self.birth()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}


class BackParser(Parser):
    """Parser for the back side: validity period."""

    def __init__(self, ocr_results: List[List[OcrResult]]):
        super().__init__(ocr_results)

    def expire_date(self):
        """Extract the validity period from the first box that contains one.

        Dots are removed from the text first. Two forms are recognized:

        * ``YYYYMMDD-YYYY...``: stored as ``YYYYMMDD-YYYY`` followed by the
          month and day copied from the start date;
        * ``YYYYMMDD-长期``: stored as matched.

        Raises:
            ParseError: if no box contains either form.
        """
        for row in self.result:
            for box in row:
                txt = box.txt.replace('.', '')
                ranged = _EXPIRE_RANGE_RE.findall(txt)
                if ranged:
                    start_to_year = ranged[0]
                    self.res["expire_date"] = RecItem(
                        start_to_year + start_to_year[4:8], box.conf)
                    return
                long_term = _EXPIRE_LONG_TERM_RE.findall(txt)
                if long_term:
                    self.res["expire_date"] = RecItem(long_term[0], box.conf)
                    return
        raise ParseError(_UNRECOGNIZED)

    def parse(self):
        """Extract the validity period and return ``{key: item dict}``.

        Raises:
            ParseError: if the validity period cannot be parsed.
        """
        self.expire_date()
        if not self.res["expire_date"].text:
            raise ParseError("无法识别")
        return {key: self.res[key].to_dict() for key in self.keys}