"""Parsers that turn OCR rows of a household register into structured fields.

Two page layouts are handled: the front page (household category and address)
and the resident page (name, ID number, birthplace, native place, ...).
"""
import json
import logging
import math
import os
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np

from core.line_parser import OcrResult

logger = logging.getLogger(__name__)

# Administrative-division tree used to calibrate OCR'd place names.  Each node
# is read as a dict with a 'name' and a 'children' list
# (province -> city -> region).
with open('./core/areas.json', 'r', encoding='utf-8') as f:
    content = f.read()
areas = json.loads(content)

# Characters removed from every OCR box before any keyword matching.
# NOTE(review): the original replace chain listed ":" and "," twice, which
# suggests the full-width variants were intended; both widths are stripped.
_NOISE_TABLE = str.maketrans('', '', '|::,,【】「[] ')

# Household categories recognised on the front page.
_HOUSEHOLD_TYPES = ("家庭户", "集体户", "居民户", "农业户")

# Any of these characters marks a row as the address row.
_ADDRESS_MARKS = "住址省市县街"

# Dot-like characters OCR may produce for the separator in long names.
_NAME_DOTS = (
    "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981", "\u00B7",
    "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027", "\u2E30", "\uFF0E",
    "\u30FB", "\uFF65", "\U00010101",
)
_NAME_DOT_RE = re.compile("[" + "".join(_NAME_DOTS) + "]")

# 18-character resident ID: 17 digits followed by a digit or X/x.
_ID_RE = re.compile(r"\d{17}[\dXx]")

# Known OCR confusions between city names.
_CITY_FIXES = {"宜城市": "宣城市"}


def _as_text(value):
    """Return *value* when it is a string, otherwise ''.

    Missing address parts may arrive as None (or another non-string
    placeholder); treating them as '' keeps later string operations safe.
    """
    return value if isinstance(value, str) else ""


def _locate(text):
    """Run cpca on *text* and return (province, city, region, detail) strings."""
    df = cpca.transform([text])
    return tuple(_as_text(df.iloc[0, col]) for col in range(4))


def _similarity(candidate, name):
    """Score how well the OCR'd *candidate* matches the reference *name*.

    Equal lengths: number of positions holding the same character.
    Shorter candidate: number of its characters that occur anywhere in *name*.
    Longer candidate: 0.
    """
    if len(candidate) == len(name):
        return sum(a == b for a, b in zip(candidate, name))
    if len(candidate) < len(name):
        return sum(ch in name for ch in candidate)
    return 0


def _best_match(candidate, nodes, default):
    """Return the name of the best-scoring node, or *default* if none scores.

    Ties go to the later node.  A score of zero never replaces *default*, so
    an unrelated name is not picked when nothing matches.
    """
    best_name, best_score = default, 0
    for node in nodes:
        score = _similarity(candidate, node['name'])
        if score and score >= best_score:
            best_name, best_score = node['name'], score
    return best_name


def _first_containing(nodes, fragment):
    """Return the first node whose name contains *fragment*, or None.

    An empty *fragment* yields None rather than matching every node.
    """
    if not fragment:
        return None
    for node in nodes:
        if fragment in node['name']:
            return node
    return None


@dataclass
class RecItem:
    """One recognised field: its text and the OCR confidence behind it."""
    text: str = ''
    confidence: float = 0.

    def to_dict(self):
        """Return a plain dict; NaN confidence becomes 0.0 and the value is a
        built-in float so the result is JSON-serialisable."""
        return {"text": self.text,
                "confidence": float(np.nan_to_num(self.confidence))}


# Base class
class Parser(object):
    """Shared state and helpers for the page parsers.

    ``self.result`` holds one list per OCR row: the row's OCR boxes followed
    by a trailing ``[joined_text, confidence]`` summary.  The caller's row
    lists are copied, not extended; the ``txt`` of each box is still cleaned
    in place.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        self.keys = ['type', "address", 'address_province', 'address_city',
                     'address_region', 'address_detail', 'name', 'id',
                     'gender',
                     # birthplace
                     'birthplace', 'birthplace_province', 'birthplace_city',
                     'birthplace_region',
                     # native place
                     'native_place', 'native_place_province',
                     'native_place_city', 'native_place_region',
                     'blood_type', 'religion']
        self.res = defaultdict(RecItem)
        for key in self.keys:
            self.res[key] = RecItem()
        # Empty rows carry no text and are skipped.
        self.result = [self._prepare_row(row) for row in ocr_results if row]

    @staticmethod
    def _prepare_row(row):
        """Clean every box of *row* and append the ``[text, conf]`` summary.

        The confidence is folded pairwise with ``np.mean``, so later boxes
        weigh more than earlier ones (kept as in the original computation).
        """
        boxes = list(row)
        for box in boxes:
            box.txt = box.txt.translate(_NOISE_TABLE)
        conf = boxes[0].conf
        for box in boxes[1:]:
            conf = np.mean([conf, box.conf])
        boxes.append(["".join(box.txt for box in boxes), conf])
        return boxes

    def _rows(self):
        """Yield ``(joined_text, confidence)`` for every OCR row."""
        for row in self.result:
            text, conf = row[-1]
            yield text, conf

    def _last_after(self, keyword):
        """Return a RecItem with the text following *keyword* in the last row
        that contains it; an empty RecItem when no row does."""
        value, conf = '', 0.
        for text, row_conf in self._rows():
            if keyword in text:
                value, conf = text.split(keyword)[-1], row_conf
        return RecItem(value, conf)

    def parse(self):
        """Return the raw field mapping."""
        return self.res


# 1: front page of the household register
class FrontRegBookParser(Parser):
    """Extract the household category and the address from the front page."""

    def type_(self):
        """Find the household category and store it under ``type``.

        The first row whose joined text contains a known category is used.
        The stored text is the full text of the box holding the keyword; when
        the keyword is split across boxes, the keyword itself is stored.
        """
        for row in self.result:
            text, conf = row[-1]
            matched = next((t for t in _HOUSEHOLD_TYPES if t in text), None)
            if matched is None:
                continue
            # row[-1] is the summary list, not an OCR box, so it is excluded.
            for box in row[:-1]:
                if matched in box.txt:
                    self.res["type"] = RecItem(box.txt, conf)
                    return
            self.res["type"] = RecItem(matched, conf)
            return

    def address(self):
        """Find the address row, store it under ``address`` and split it.

        Raises:
            Exception: when no row yields a non-empty address.
        """
        address_txt, address_conf = '', 0.
        for text, conf in self._rows():
            if any(mark in text for mark in _ADDRESS_MARKS):
                # Anything from "民族" onward belongs to another field.
                address_txt, address_conf = text.split("民族")[0], conf
                break
        if not address_txt:
            raise Exception('无法识别')
        self.res["address"] = RecItem(address_txt, address_conf)
        self.split_addr()

    # Calibrate the region
    def cal_region(self, province, city, region, temp_region, areas):
        """Return the region of *city* that best matches *temp_region*.

        *region* is returned unchanged when the province or city cannot be
        found in *areas*, or when no region shares a character with
        *temp_region*.
        """
        province_node = _first_containing(areas, province)
        if province_node is None:
            return region
        city_node = _first_containing(province_node['children'], city)
        if city_node is None:
            return region
        return _best_match(temp_region, city_node['children'], region)

    # Calibrate the city
    def cal_city(self, province, city, temp_city, areas):
        """Return the city of *province* that best matches *temp_city*;
        *city* is returned unchanged when nothing matches."""
        province_node = _first_containing(areas, province)
        if province_node is None:
            return city
        return _best_match(temp_city, province_node['children'], city)

    # Calibrate the region when no city is known
    def cal_region_non_city(self, province, region, temp_region, areas):
        """Return the region that best matches *temp_region* among all cities
        of *province*; *region* is returned unchanged when nothing matches."""
        province_node = _first_containing(areas, province)
        if province_node is None:
            return region
        candidates = [node
                      for city_node in province_node['children']
                      for node in city_node['children']]
        return _best_match(temp_region, candidates, region)

    @staticmethod
    def _cut_region(street, drop_city_prefix):
        """Split a leading region ("...区" / "...县") off *street*.

        Returns ``(temp_region, remaining_street)``.  With *drop_city_prefix*
        the region text is taken after the last "市" in *street*.
        """
        temp_region = ''
        for suffix in ("区", "县"):
            if suffix in street:
                head = street.split("市")[-1] if drop_city_prefix else street
                temp_region = head.split(suffix)[0] + suffix
                street = street.split(suffix)[-1]
        return temp_region, street

    def split_addr(self):
        """Split ``address`` into province / city / region / detail fields.

        cpca does the first split; parts it misses are recovered from the
        remaining street text and calibrated against ``areas``.
        """
        pre_addr = self.res['address'].text
        # The "户主姓名" box is usually larger than the "住址" box, so it can
        # land on the address row, after the "住址" label.
        if "户主姓名" in pre_addr:
            pre_addr = pre_addr.split("户主姓名")[-1]
        conf = self.res["address"].confidence
        province, city, region, street = _locate(pre_addr)
        logger.debug('cpca split: province=%s city=%s region=%s detail=%s',
                     province, city, region, street)

        if province and city and region:
            # Everything was found: drop the repeated prefix from the street.
            if "区" in street:
                street = street.split("市")[-1].split("区")[-1]
            if "县" in street:
                street = street.split("市")[-1].split("县")[-1]
        elif not region:
            has_region_text = '区' in street or '县' in street
            if city:
                if has_region_text:
                    temp_region, street = self._cut_region(street, False)
                    region = self.cal_region(province, city, region,
                                             temp_region, areas)
            elif has_region_text:
                # Recover the region first, then let cpca infer the city.
                temp_region, street = self._cut_region(street, True)
                region = self.cal_region_non_city(province, region,
                                                  temp_region, areas)
                province, city, region, street = _locate(
                    province + city + region + street)
            elif '市' in street:
                temp_city = street.split('市')[0] + '市'
                street = street.split('市')[-1]
                city = self.cal_city(province, city, temp_city, areas)
        logger.debug('calibrated: province=%s city=%s region=%s detail=%s',
                     province, city, region, street)

        # Correct the city before storing it so that ``address_city`` and the
        # full ``address`` text agree.
        city = _CITY_FIXES.get(city, city)

        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        if street and "旗" in street:
            # Banner ("旗") regions are cut from the street text.
            self.res["address_region"] = RecItem(
                street.split("旗")[0] + "旗", conf)
            self.res["address_detail"] = RecItem(street.split("旗")[-1], conf)
        else:
            self.res["address_region"] = RecItem(region, conf)
            self.res["address_detail"] = RecItem(street, conf)
        self.res['address'].text = province + city + region + street

    def parse(self):
        """Run all front-page extractors and return ``{key: field dict}``."""
        self.type_()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}


# 0: resident page
class PeopleRegBookParser(Parser):
    """Extract personal fields from a resident page."""

    def full_name(self):
        """Find the name and store it under ``name``.

        A row qualifies when it contains "姓名" and has "名" among its first
        two characters.  Names of five or more characters have every dot-like
        separator normalised to U+00B7; without a separator they are stored
        unchanged.
        """
        name_val, conf = '', 0.
        for text, row_conf in self._rows():
            if "姓名" in text and "名" in text[:2]:
                name_val = (text.split("姓名")[-1]
                            .split("户主")[0].split("中主")[0])
                conf = row_conf
                break
        if len(name_val) >= 5:
            name_val = _NAME_DOT_RE.sub('\u00B7', name_val)
        self.res["name"] = RecItem(name_val, conf)

    def ethnicity(self):
        """Store the text after "族" from the last row containing "民族".

        The value goes under ``ethnicity``, which is not listed in
        ``self.keys`` and therefore is not part of ``parse()``'s output.
        """
        value, conf = '', 0.
        for text, row_conf in self._rows():
            if "民族" in text:
                value, conf = text.split("族")[-1], row_conf
        self.res["ethnicity"] = RecItem(value, conf)

    def card_no(self):
        """Find the 18-character ID number; store ``id`` and ``gender``.

        The gender comes from the 17th character (odd: "男", even: "女").

        Raises:
            Exception: when no row contains an 18-character ID number.
        """
        for text, conf in self._rows():
            match = _ID_RE.search(text)
            if match is None:
                continue
            number = match.group()
            gender = "男" if int(number[16]) % 2 else "女"
            self.res["id"] = RecItem(number, conf)
            self.res["gender"] = RecItem(gender, conf)
            return
        raise Exception('身份证号识别出错')

    def blood_type(self):
        """Store the text after "血型" under ``blood_type``."""
        self.res["blood_type"] = self._last_after("血型")

    def religion(self):
        """Store the text after "宗教信仰" under ``religion``."""
        self.res["religion"] = self._last_after("宗教信仰")

    def birthplace(self):
        """Take the first row containing "出生地", cut it before "民族"/"民",
        and split it into the ``birthplace*`` fields."""
        for text, conf in self._rows():
            if "出生地" in text:
                place_txt = text.split('民族')[0].split('民')[0]
                if place_txt:
                    self.res["birthplace"] = RecItem(place_txt, conf)
                    self.split_addr("birth")
                return

    def native_place(self):
        """Take the first row containing both "贯" and "出", cut it before
        "出生", and split it into the ``native_place*`` fields."""
        for text, conf in self._rows():
            if '贯' in text and '出' in text:
                place_txt = text.split('出生')[0]
                if place_txt:
                    self.res["native_place"] = RecItem(place_txt, conf)
                    self.split_addr("native")
                return

    def split_addr(self, place: str):
        """Split the stored *place* text into province / city / region / detail.

        *place* is "birth" or "native" (the full field names are accepted as
        well).  The stored text is replaced by the concatenated parts.
        """
        place = {"birth": "birthplace", "native": "native_place"}.get(
            place, place)
        conf = self.res[place].confidence
        province, city, region, detail = _locate(self.res[place].text)
        logger.debug('%s split: province=%s city=%s region=%s detail=%s',
                     place, province, city, region, detail)

        self.res[place + "_province"] = RecItem(province, conf)
        self.res[place + "_city"] = RecItem(city, conf)
        if detail and "旗" in detail:
            # Banner ("旗") regions are cut from the detail text.
            self.res[place + "_region"] = RecItem(
                detail.split("旗")[0] + "旗", conf)
            self.res[place + "_detail"] = RecItem(detail.split("旗")[-1], conf)
        else:
            self.res[place + "_region"] = RecItem(region, conf)
            self.res[place + "_detail"] = RecItem(detail, conf)
        self.res[place].text = province + city + region + detail

    def parse(self):
        """Run all resident-page extractors and return ``{key: field dict}``."""
        self.full_name()
        self.ethnicity()
        self.card_no()
        self.blood_type()
        self.religion()
        self.birthplace()
        self.native_place()
        return {k: self.res[k].to_dict() for k in self.keys}