123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518 |
- import json
- import math
- import re
- import string
- from dataclasses import dataclass
- from collections import defaultdict
- from typing import List
- from core.line_parser import OcrResult
- import numpy as np
- import cpca
- import os
# Load the province -> city -> region lookup tree once, at import time.
# The calibration helpers in this file read it as a list of nodes that each
# carry a 'name' and a 'children' list.
# The context manager guarantees the handle is closed (the previous bare
# open() leaked it); the explicit encoding keeps the Chinese place names
# readable regardless of the platform's default codec.
with open('./core/areas.json', 'r', encoding='utf-8') as f:
    content = f.read()
areas = json.loads(content)
@dataclass
class RecItem:
    """One recognised field: the extracted text plus its confidence score."""

    text: str = ""
    confidence: float = 0.0

    def to_dict(self):
        """Return the item as a plain ``{"text", "confidence"}`` mapping.

        The confidence goes through ``np.nan_to_num`` so that a NaN score is
        reported as 0 instead of leaking NaN into the output.
        """
        safe_confidence = np.nan_to_num(self.confidence)
        return dict(text=self.text, confidence=safe_confidence)
# Base class
class Parser(object):
    """Common base for the household-register page parsers.

    The constructor normalises the OCR output in place: every fragment's
    ``txt`` is stripped of noise characters, and each row gets one extra
    trailing element ``[merged_text, merged_conf]`` summarising the row.
    Subclasses read that summary through ``row[-1][0]`` / ``row[-1][1]``.

    Attributes:
        result: the (mutated) OCR rows passed to the constructor.
        res: ``defaultdict(RecItem)`` holding the parsed fields.
        keys: the field names that subclasses export from ``parse()``.
    """

    # Characters removed from every OCR fragment in a single pass.
    # NOTE(review): the original chained two identical ":" and two identical
    # "," replacements; the full-width forms are included here on the
    # assumption that this is what the duplicates were meant to cover.
    _NOISE_TABLE = str.maketrans("", "", "|:：,，【】「[] ")

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Clean the OCR rows and append a merged summary to each of them.

        Args:
            ocr_results: rows of OCR fragments; each fragment exposes a
                ``txt`` string and a ``conf`` score. The rows and fragments
                are modified in place.
        """
        self.result = ocr_results
        self.res = defaultdict(RecItem)
        self.keys = ['type', "address", 'address_province', 'address_city', 'address_region', 'address_detail',
                     'name', 'id', 'gender',
                     # birthplace
                     'birthplace', 'birthplace_province', 'birthplace_city', 'birthplace_region',
                     # native place (ancestral home)
                     'native_place', 'native_place_province', 'native_place_city', 'native_place_region',
                     'blood_type', 'religion']
        for key in self.keys:
            self.res[key] = RecItem()

        for row in self.result:
            if not row:
                # An empty row used to raise IndexError; give it an empty
                # summary so subclasses can still read row[-1].
                row.append(['', 0.])
                continue

            for fragment in row:
                fragment.txt = fragment.txt.translate(self._NOISE_TABLE)

            # Merge only after cleaning. Previously the first fragment's raw
            # text seeded the merged string, so its noise characters survived.
            merged_text = ''.join(fragment.txt for fragment in row)

            # Running pairwise mean, kept exactly as before: later fragments
            # weigh more than earlier ones (this is not the arithmetic mean).
            merged_conf = row[0].conf
            for fragment in row[1:]:
                merged_conf = np.mean([merged_conf, fragment.conf])

            row.append([merged_text, merged_conf])

    def parse(self):
        """Return the raw field mapping; subclasses override this."""
        return self.res
# 1: first page of the household register (户口本首页)
class FrontRegBookParser(Parser):
    """Extract the household type and the address from the register's first page."""

    # Household-type keywords looked for in each merged row.
    _TYPE_KEYWORDS = ("家庭户", "集体户", "居民户", "农业户")
    # A row containing any of these characters is taken as the address row.
    _ADDRESS_HINTS = ("住", "址", "省", "市", "县", "街")

    def type_(self):
        """
        Household type (户别).

        Scans the rows for one of the known household-type keywords and
        stores the text of the first fragment containing it under "type",
        with the row's merged confidence.
        """
        for row in self.result:
            merged_text, merged_conf = row[-1][0], row[-1][1]
            # row[-1] is the [text, conf] summary appended by Parser.__init__;
            # it has no .txt attribute, so only row[:-1] are real fragments.
            # Iterating the whole row used to raise AttributeError when the
            # keyword was split across two fragments.
            fragments = row[:-1]
            for keyword in self._TYPE_KEYWORDS:
                if keyword not in merged_text:
                    continue
                for fragment in fragments:
                    if keyword in fragment.txt:
                        self.res["type"] = RecItem(fragment.txt, merged_conf)
                        return
                # No single fragment holds the keyword: move on to the next row.
                break

    def address(self):
        """
        Address on the first page (首页住址).

        Takes the first row that looks like an address, keeps the part before
        "民族", stores it under "address" and splits it into its components.

        Raises:
            Exception: when no usable address text is found.
        """
        address_txt = ''
        address_conf = 0.
        for row in self.result:
            merged_text, merged_conf = row[-1][0], row[-1][1]
            if any(hint in merged_text for hint in self._ADDRESS_HINTS):
                address_txt = merged_text.split("民族")[0]
                address_conf = merged_conf
                break

        # The original tested `is not None`, which is always true for a str,
        # so this failure path could never be reached.
        if not address_txt:
            raise Exception('无法识别')

        self.res["address"] = RecItem(address_txt, address_conf)
        self.split_addr()

    @staticmethod
    def _match_score(candidate, name):
        """Score how well the OCR text `candidate` matches a reference `name`.

        Same length: number of positions holding the same character.
        Shorter candidate: number of candidate characters found anywhere in
        `name`. Longer candidate: 0.
        """
        if len(candidate) == len(name):
            return sum(1 for a, b in zip(candidate, name) if a == b)
        if len(candidate) < len(name):
            return sum(1 for ch in candidate if ch in name)
        return 0

    @classmethod
    def _closest_name(cls, candidate, nodes, default):
        """Return the 'name' of the best-scoring node, or `default` if `nodes` is empty."""
        best_name, best_score = default, 0
        for node in nodes:
            score = cls._match_score(candidate, node['name'])
            # `<=` keeps the original tie-break: the later entry wins.
            if best_score <= score:
                best_name, best_score = node['name'], score
        return best_name

    # Calibrate the region (district / county)
    def cal_region(self, province, city, region, temp_region, areas):
        """Snap the OCR region text to the closest region of a known city.

        Args:
            province: province text; matched as a substring of a node name.
            city: city text; matched as a substring of a node name.
            region: value returned when no province/city node matches.
            temp_region: region text cut out of the street string.
            areas: province -> city -> region tree ('name' / 'children').

        Returns:
            The best-matching region name, or `region` unchanged.
        """
        for province_node in areas:
            if province not in province_node['name']:
                continue
            for city_node in province_node['children']:
                if city in city_node['name']:
                    region = self._closest_name(temp_region, city_node['children'], region)
                    break
            break  # only the first matching province is considered
        return region

    # Calibrate the city
    def cal_city(self, province, city, temp_city, areas):
        """Snap the OCR city text to the closest city of the matching province.

        Returns `city` unchanged when no province node matches.
        """
        for province_node in areas:
            if province in province_node['name']:
                city = self._closest_name(temp_city, province_node['children'], city)
                break
        return city

    # Calibrate the region when the city is unknown
    def cal_region_non_city(self, province, region, temp_region, areas):
        """Snap the OCR region text to the closest region anywhere in the province.

        The best score is now tracked across all cities. It used to be reset
        for every city, so only the last city's regions could ever win.
        Returns `region` unchanged when no province node matches.
        """
        for province_node in areas:
            if province in province_node['name']:
                province_regions = (
                    region_node
                    for city_node in province_node['children']
                    for region_node in city_node['children']
                )
                region = self._closest_name(temp_region, province_regions, region)
                break
        return region

    @staticmethod
    def _unpack_location(df):
        """Return (province, city, region, street) from the first cpca row, '' for missing."""
        # DataFrame.replace returns a new frame; the original discarded it,
        # leaving None values that break the later `in` / `+` operations.
        df = df.replace([None], [''])
        return tuple(df.iloc[0, col] or "" for col in range(4))

    @staticmethod
    def _cut_region(street, strip_city=False):
        """Cut a leading "…区" / "…县" chunk off `street`.

        Returns (temp_region, remaining_street). With `strip_city`, anything
        up to the last "市" is dropped from the region chunk first.
        """
        temp_region = ''
        for suffix in ("区", "县"):
            if suffix in street:
                head = street.split("市")[-1] if strip_city else street
                temp_region = head.split(suffix)[0] + suffix
                street = street.split(suffix)[-1]
        return temp_region, street

    def split_addr(self):
        """Split the stored address into province / city / region / detail.

        Runs cpca on the address, calibrates whatever level cpca missed
        against the `areas` tree, stores the four "address_*" fields and
        rewrites "address" as the concatenation of the parts.
        """
        print(self.res['address'].text, '=======')
        pre_addr = self.res['address'].text
        # The "户主姓名" box is usually larger than the "住址" box, so it can
        # land on the address row, after the "住址" label.
        if "户主姓名" in pre_addr:
            pre_addr = pre_addr.split("户主姓名")[-1]
        conf = self.res["address"].confidence

        df = cpca.transform([pre_addr])
        print(df)
        province, city, region, street = self._unpack_location(df)
        print(f'pronvince: {province}, city: {city}, region: {region}, detail: {street}')
        print("+++++++++++++++++++++")

        if province and city and region:
            # Step 1: cpca found all three levels; only the street needs to
            # have the leftover city/region prefix removed.
            if "区" in street:
                street = street.split("市")[-1].split("区")[-1]
            if "县" in street:
                street = street.split("市")[-1].split("县")[-1]
            # Other suffixes (e.g. "旗") are handled further down.
            print("省市区都存在,只需要切割street中‘区’后面的内容")
            print(province, city, region, street)
            print("============================")
        elif not region:
            # Step 2: the region is missing.
            if city:
                # Province and city are known: calibrate the region.
                if '区' in street or '县' in street:
                    temp_region, street = self._cut_region(street)
                    region = self.cal_region(province, city, region, temp_region, areas)
                    print("cpca没有检测到‘区’,但是检测到省,市,并且street中含有‘区’,需要校准区")
                    print(province, city, region, street)
                    print("============================")
                else:
                    # For now there is nothing to cut from the street.
                    print("cpca没有检测到‘区’,但是检测到省,市,并且street中没有有‘区’,就不需要校准区")
            elif '区' in street or '县' in street:
                # Only the province is known: calibrate the region first and
                # let a second cpca pass recover the city from it.
                temp_region, street = self._cut_region(street, strip_city=True)
                region = self.cal_region_non_city(province, region, temp_region, areas)
                addr = province + city + region + street
                province, city, region, street = self._unpack_location(cpca.transform([addr]))
                print("cpca只检测到了省,但是street中有区,直接分割出区,再做cpca即可")
                print(province, city, region, street)
                print("============================")
            elif '市' not in street:
                # Neither city nor region is available: keep what we have.
                print("cpca只检测到了省,并且street中没有市,也没有区,直接跳过")
                print(province, city, region, street)
                print("============================")
            else:
                temp_city = street.split('市')[0] + '市'
                street = street.split('市')[-1]
                city = self.cal_city(province, city, temp_city, areas)
                print("cpca只检测到了省,并且street中有市,没有区,就只要校准city")
                print(province, city, region, street)
                print("============================")

        # Fix a known look-alike mix-up before anything is stored, so that
        # "address_city" and "address" agree. Previously only the full
        # address text received the correction.
        city_dic = {"宜城市": "宣城市"}
        if city in city_dic:
            city = city_dic[city]

        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        if street and "旗" in street:
            # Banner ("旗") level units are cut out of the street text.
            self.res["address_region"] = RecItem(street.split("旗")[0] + "旗", conf)
            self.res["address_detail"] = RecItem(street.split("旗")[-1], conf)
        else:
            self.res["address_region"] = RecItem(region, conf)
            self.res["address_detail"] = RecItem(street, conf)

        self.res['address'].text = province + city + region + street

    # Store and export
    def parse(self):
        """Run the extractors and return ``{key: {"text", "confidence"}}`` for all keys."""
        self.type_()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}
# 0: resident page (常住人口页)
class PeopleRegBookParser(Parser):
    """Extract the personal fields from a resident page of the household register."""

    # Dot-like characters that OCR may produce for the separator inside long
    # names. The last entry was written "\u10101", which Python reads as
    # U+1010 followed by the digit "1"; U+10101 needs the 8-digit escape.
    _NAME_DOTS = ("\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
                  "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
                  "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101")

    # 18-character ID: 17 digits followed by a digit or X/x.
    _ID_PATTERN = re.compile(r"\d{17}[\dXx]")

    def _rows(self):
        """Yield (merged_text, merged_conf) for every OCR row."""
        for row in self.result:
            yield row[-1][0], row[-1][1]

    def _value_after(self, label):
        """Return (text after `label`, row confidence) for the last row containing it.

        Returns ('', 0.) when no row contains the label. The confidence is
        taken from the matching row; it used to be whatever the final row
        happened to carry.
        """
        value, conf = '', 0.
        for text, row_conf in self._rows():
            if label in text:
                value = text.split(label)[-1]
                conf = row_conf
        return value, conf

    def full_name(self):
        """
        Name (姓名).

        Uses the first row where "姓名" occurs with "名" among the first two
        characters. Names shorter than 5 characters are stored as they are;
        longer ones are stored only when they contain a dot-like separator,
        which is normalised to U+00B7.
        """
        name_val = ''
        conf = 0.
        for text, row_conf in self._rows():
            if "姓名" in text and "名" in text[:2]:
                name_val = text.split("姓名")[-1].split("户主")[0].split("中主")[0]
                conf = row_conf
                break

        if len(name_val) < 5:
            self.res["name"] = RecItem(name_val, conf)
            return

        for dot in self._NAME_DOTS:
            if dot in name_val:
                parts = name_val.split(dot)
                self.res['name'] = RecItem(parts[0] + '\u00B7' + parts[1], conf)
                return
        # NOTE(review): a name of 5+ characters without a separator is left
        # unset, as before; this looks like a guard against OCR garbage, but
        # the intent should be confirmed.

    def ethnicity(self):
        """
        Ethnicity (民族), e.g. "民族汉".

        Splits on the full "民族" label; splitting on "族" alone returned an
        empty string for values that themselves end in "族".
        NOTE(review): "ethnicity" is not in `self.keys`, so `parse()` does
        not export it.
        """
        national_val, conf = self._value_after("民族")
        self.res["ethnicity"] = RecItem(national_val, conf)

    def card_no(self):
        """
        ID card number (身份证号码); the gender is derived from its 17th digit.

        Raises:
            Exception: when no row contains an 18-character ID number.
        """
        for text, conf in self._rows():
            # One search covers both numeric and X-terminated IDs. The old
            # code checked only the first findall hit, and its class [X|x]
            # also accepted a literal "|".
            match = self._ID_PATTERN.search(text)
            if match is None:
                continue
            id_no = match.group()
            self.res["id"].text = id_no
            self.res["id"].confidence = conf
            # Odd 17th digit -> male, even -> female.
            self.res["gender"].text = "男" if int(id_no[16]) % 2 else "女"
            self.res["gender"].confidence = conf
            # NOTE(review): this prints the ID number to stdout.
            print('---------------------')
            print(id_no)
            print('---------------------')
            return
        raise Exception('身份证号识别出错')

    def blood_type(self):
        """
        Blood type (血型).
        """
        blood_val, conf = self._value_after("血型")
        self.res["blood_type"] = RecItem(blood_val, conf)

    def religion(self):
        """
        Religious belief (宗教信仰).
        """
        religion_val, conf = self._value_after("宗教信仰")
        self.res["religion"] = RecItem(religion_val, conf)

    def birthplace(self):
        """
        Birthplace (出生地).

        Uses the first row containing "出生地", cut before "民族" / "民", and
        splits it into province / city / region when non-empty.
        """
        for text, conf in self._rows():
            if "出生地" in text:
                birth_place_txt = text.split('民族')[0].split('民')[0]
                if birth_place_txt:
                    self.res["birthplace"] = RecItem(birth_place_txt, conf)
                    self.split_addr("birth")
                return

    def native_place(self):
        """
        Native place (籍贯).

        Uses the first row containing both "贯" and "出", cut before "出生",
        and splits it into province / city / region when non-empty.
        """
        for text, conf in self._rows():
            if '贯' in text and '出' in text:
                native_place_txt = text.split('出生')[0]
                if native_place_txt:
                    self.res["native_place"] = RecItem(native_place_txt, conf)
                    self.split_addr("native")
                return

    def split_addr(self, place: str):
        """Split the stored `place` text into province / city / region / detail.

        Args:
            place: "birth" or "native" (mapped to the "birthplace" /
                "native_place" keys); any other value is used as the key
                itself.
        """
        place = {"birth": "birthplace", "native": "native_place"}.get(place, place)
        print(self.res[place].text, '=======')
        conf = self.res[place].confidence

        df = cpca.transform([self.res[place].text])
        df = df.replace([None], [''])
        province, city, region, detail = (df.iloc[0, col] for col in range(4))
        print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')

        self.res[place + "_province"] = RecItem(province, conf)
        self.res[place + "_city"] = RecItem(city, conf)
        if detail and "旗" in detail:
            # Banner ("旗") level units are cut out of the detail text.
            self.res[place + "_region"] = RecItem(detail.split("旗")[0] + "旗", conf)
            self.res[place + "_detail"] = RecItem(detail.split("旗")[-1], conf)
        else:
            self.res[place + "_region"] = RecItem(region, conf)
            self.res[place + "_detail"] = RecItem(detail, conf)
        self.res[place].text = province + city + region + detail

    def parse(self):
        """Run every extractor and return ``{key: {"text", "confidence"}}`` for all keys.

        Raises:
            Exception: propagated from `card_no` when no ID number is found.
        """
        self.full_name()
        self.ethnicity()
        self.card_no()
        self.blood_type()
        self.religion()
        self.birthplace()
        self.native_place()
        # TODO: an earlier draft copied the first-page address fields into
        # the native_place / birthplace fields here; that is not implemented.
        return {k: self.res[k].to_dict() for k in self.keys}
|