import logging
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np

from core.line_parser import OcrResult

logger = logging.getLogger(__name__)

# Message carried by every "could not recognise" failure raised in this module.
_UNRECOGNIZED = '无法识别'

# 17 digits followed by a check character that is a digit or X/x.
_ID_PATTERN = re.compile(r"\d{17}[\dXx]")
# "姓名" label followed by one to seven CJK characters.
_NAME_PATTERN = re.compile(r"姓名[\u4e00-\u9fa5]{1,7}")
# Everything up to the last "民族" label, then the CJK run that follows it.
_ETHNICITY_PATTERN = re.compile(r".*民族([\u4e00-\u9fa5]+)")
# Tried in this order: a closed date range first, then the "长期" form.
_EXPIRE_PATTERNS = (re.compile(r"\d{8}-\d{8}"), re.compile(r"\d{8}-长期"))

# Dot-like characters that may stand in for the middle dot inside a name.
# They are tried in this order; the first one present in the text wins.
_NAME_SEPARATORS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
    "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
    "\u2E30", "\uFF0E", "\u30FB", "\uFF65",
    # Code points above U+FFFF need the 8-digit \U escape; "\u10101" would be
    # parsed as U+1010 followed by the character "1".
    "\U00010101",
)

# A cell is treated as part of the address when it contains any of these.
_ADDRESS_KEYWORDS = (
    "址", "省", "市", "县", "街", "乡", "村", "镇", "区", "城", "组", "旗", "号",
)


@dataclass
class RecItem:
    """One extracted field: its text and the confidence attached to it."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return the field as a dict with ``text`` and ``confidence`` keys.

        The confidence is passed through ``np.nan_to_num`` (so NaN becomes
        0.0) and converted to a built-in ``float``.
        """
        return {
            "text": self.text,
            "confidence": float(np.nan_to_num(self.confidence)),
        }


class Parser:
    """Base parser: stores the OCR rows and a table of empty result fields."""

    def __init__(self, ocr_results: List[OcrResult]):
        # Iterated as rows of cells; each cell is read through its ``txt``
        # and ``conf`` attributes.
        self.result = ocr_results
        self.res = defaultdict(RecItem)
        self.keys = [
            "name",
            "id",
            "ethnicity",
            "gender",
            "birthday",
            "address",
            "address_province",
            "address_city",
            "address_region",
            "address_detail",
            "expire_date",
        ]
        for key in self.keys:
            self.res[key] = RecItem()

    def parse(self):
        """Return the field table (a mapping of key to ``RecItem``)."""
        return self.res


class FrontParser(Parser):
    """Parser for the side of the card carrying name, ID number and address."""

    def __init__(self, ocr_results: List[OcrResult]):
        super().__init__(ocr_results)

    def birth(self):
        """Fill ``birthday`` from characters 6-13 of an 18-character ID.

        Does nothing when the stored ID is not exactly 18 characters long.
        """
        id_text = self.res["id"].text
        if len(id_text) != 18:
            return
        # e.g. 342423 2001 02 15 6552 -> year [6:10], month [10:12], day [12:14]
        date = f"{id_text[6:10]}年{id_text[10:12]}月{id_text[12:14]}日"
        self.res["birthday"] = RecItem(date, self.res["id"].confidence)

    def card_no(self):
        """Locate the ID number, derive the gender, and trim ``self.result``.

        The first cell containing 17 digits followed by a digit or X/x is
        taken as the ID number. Gender comes from the parity of the 17th
        character (odd -> "男", even -> "女").

        Raises:
            Exception: with message '无法识别' when no cell holds an ID number.
        """
        for idx, row in enumerate(self.result):
            for cell in row:
                match = _ID_PATTERN.search(cell.txt)
                if match is None:
                    continue
                number = match.group()
                self.res["id"].text = number
                self.res["id"].confidence = cell.conf
                self.res["gender"].text = "男" if int(number[16]) % 2 else "女"
                self.res["gender"].confidence = cell.conf
                if idx < 2:
                    # ID number found in the first two rows: keep the rows
                    # after it, in reverse order.
                    # NOTE(review): presumably this handles a card read
                    # upside down - confirm against the caller.
                    self.result = self.result[idx + 1:]
                    self.result.reverse()
                else:
                    # Otherwise keep only the rows above the ID number.
                    self.result = self.result[:idx]
                return
        raise Exception(_UNRECOGNIZED)

    def name(self):
        """Extract the holder's name from the first remaining row.

        A two-cell row is treated as label plus value: the first cell that
        does not look like the label is the name. A single-cell row is either
        the bare name or the "姓名" label fused with the name; in the fused
        case any dot-like separator is normalised to U+00B7.

        Raises:
            Exception: with message '无法识别' when no name can be extracted.
        """
        if not self.result:
            raise Exception(_UNRECOGNIZED)
        first_row = self.result[0]

        if len(first_row) == 2:
            for cell in first_row:
                looks_like_label = '姓' in cell.txt or (
                    '名' in cell.txt and len(cell.txt) < 3
                )
                if not looks_like_label:
                    self.res['name'] = RecItem(cell.txt, cell.conf)
                    return

        if len(first_row) == 1:
            txt = first_row[0].txt
            conf = first_row[0].conf
            if "姓名" not in txt:
                # No label in the cell: the whole cell is the name.
                self.res["name"] = RecItem(txt, conf)
                return

            # Text after the label, wherever the label sits in the cell.
            after_label = txt.split("姓名", 1)[1]
            for separator in _NAME_SEPARATORS:
                if separator in after_label:
                    parts = after_label.split(separator)
                    parts[0] = parts[0].replace('姓名', '')
                    # Keep every component so multi-part names survive.
                    self.res['name'] = RecItem('\u00B7'.join(parts), conf)
                    return

            match = _NAME_PATTERN.search(txt)
            if match is not None:
                self.res["name"] = RecItem(match.group().split("姓名")[-1], conf)
                return

        raise Exception(_UNRECOGNIZED)

    def national(self):
        """Extract the ethnicity from the second remaining row, if present.

        Best effort: leaves ``ethnicity`` untouched when there is no second
        row or no cell in it contains "民族" followed by CJK text.
        """
        if len(self.result) < 2:
            return
        for cell in self.result[1]:
            if cell is None:
                continue
            match = _ETHNICITY_PATTERN.search(cell.txt)
            if match is not None:
                self.res["ethnicity"] = RecItem(match.group(1), cell.conf)
                return

    def address(self):
        """Assemble the address from the fourth remaining row onwards.

        Cells containing any address keyword are concatenated in order; for a
        cell containing "址" only the text after the last "址" is kept. The
        confidence is the mean over the cells used. ``split_addr`` is then
        called to fill the province/city/region/detail fields.

        Raises:
            Exception: with message '无法识别' when no address cell is found
                (or when ``split_addr`` raises).
        """
        parts = []
        confs = []
        for row in self.result[3:]:
            for cell in row:
                txt = cell.txt
                if not any(keyword in txt for keyword in _ADDRESS_KEYWORDS):
                    continue
                # Without "址" the split leaves the text unchanged.
                parts.append(txt.split("址")[-1])
                confs.append(cell.conf)

        if not parts:
            raise Exception(_UNRECOGNIZED)
        self.res["address"] = RecItem("".join(parts), np.mean(confs))
        self.split_addr()

    def split_addr(self):
        """Split the stored address into province, city, region and detail.

        The address text is passed to ``cpca.transform`` and the first four
        columns of the first result row are used, in that order.

        Raises:
            Exception: with message '无法识别' when the region or the detail
                ends up empty.
        """
        conf = self.res["address"].confidence
        df = cpca.transform([self.res["address"].text])
        province = df.iloc[0, 0]
        city = df.iloc[0, 1]
        region = df.iloc[0, 2]
        detail = df.iloc[0, 3]
        logger.debug(
            "province: %s, city: %s, region: %s, detail: %s",
            province, city, region, detail,
        )

        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        if detail and "旗" in detail:
            # Take everything up to the first "旗" as the region and keep the
            # whole remainder as the detail.
            # NOTE(review): this also fires for a "旗" inside a street name
            # and overrides the region returned by cpca - confirm intended.
            banner, remainder = detail.split("旗", 1)
            self.res["address_region"] = RecItem(banner + "旗", conf)
            self.res["address_detail"] = RecItem(remainder, conf)
        else:
            self.res["address_region"] = RecItem(region, conf)
            self.res["address_detail"] = RecItem(detail, conf)

        if not self.res['address_region'].text or not self.res['address_detail'].text:
            raise Exception(_UNRECOGNIZED)

    def parse(self):
        """Run every extraction step and return ``{key: {"text", "confidence"}}``.

        ``card_no`` must run first because it trims ``self.result`` to the
        rows the later steps index into.

        Raises:
            Exception: with message '无法识别' when a mandatory step fails.
        """
        self.card_no()
        self.name()
        self.national()
        self.birth()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}


class BackParser(Parser):
    """Parser for the side of the card carrying the validity period."""

    def __init__(self, ocr_results: List[OcrResult]):
        super().__init__(ocr_results)

    def expire_date(self):
        """Find the validity period and store it in ``expire_date``.

        Dots are stripped from each cell first; the cell must then contain
        eight digits, a hyphen, and either eight digits or "长期".

        Raises:
            Exception: with message '无法识别' when no cell matches.
        """
        for row in self.result:
            for cell in row:
                txt = cell.txt.replace('.', '')
                for pattern in _EXPIRE_PATTERNS:
                    match = pattern.search(txt)
                    if match is not None:
                        self.res["expire_date"] = RecItem(match.group(), cell.conf)
                        return
        raise Exception(_UNRECOGNIZED)

    def parse(self):
        """Extract the validity period and return ``{key: {"text", "confidence"}}``.

        Raises:
            Exception: when the validity period cannot be recognised.
        """
        self.expire_date()
        # Defensive: expire_date() already raises when nothing matched.
        if not self.res["expire_date"].text:
            raise Exception("无法识别")
        return {key: self.res[key].to_dict() for key in self.keys}