123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261 |
import logging
import math
import re
import string
from collections import defaultdict
from dataclasses import dataclass
from typing import List

import cpca
import numpy as np
from zhon.hanzi import punctuation

from core.line_parser import OcrResult
@dataclass
class RecItem:
    """A recognised field value paired with its OCR confidence."""

    # Recognised text; callers in this file may store None when a lookup
    # yields no value, so ``to_dict`` tolerates it.
    text: str = ''
    # Confidence score attached to ``text``.
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a plain ``{"text", "confidence"}`` dict.

        The text is stripped of surrounding whitespace (``None`` becomes
        ``""``). The confidence goes through ``np.nan_to_num`` so NaN becomes
        ``0.0`` and is then converted to a built-in ``float`` so the result
        does not carry NumPy scalar types.
        """
        text = '' if self.text is None else self.text
        return {
            "text": text.strip(),
            "confidence": float(np.nan_to_num(self.confidence)),
        }
class Parser(object):
    """Base class holding OCR rows and the table of extracted fields.

    Attributes:
        result: the OCR rows passed to the constructor.
        keys: names of every field a parser may fill in.
        res: mapping from field name to ``RecItem``; every name in ``keys``
            starts out as an empty ``RecItem``.
    """

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Store the OCR rows and pre-populate one empty item per field."""
        self.result = ocr_results
        self.keys = [
            "name", "id", "ethnicity", "gender", "birthday",
            "address", "address_province", "address_city",
            "address_region", "address_detail", "expire_date",
        ]
        self.res = defaultdict(RecItem)
        self.res.update((field, RecItem()) for field in self.keys)

    def parse(self):
        """Return the field table unchanged; subclasses do the real parsing."""
        return self.res
class FrontParser(Parser):
    """Extract the fields found on the front side of an ID card.

    ``parse`` runs the extractors in a fixed order: the ID number first (it
    also yields the gender and trims ``self.result`` to the rows used by the
    other extractors), then name, ethnicity, birthday and address.

    Each OCR box is read through its ``txt``, ``conf`` and ``center``
    attributes.
    """

    _logger = logging.getLogger(__name__)

    # 17 digits followed by a digit or an X/x check character.
    _ID_PATTERN = re.compile(r"\d{17}[\dXx]")

    # Dot-like characters that may separate the parts of a long name; all of
    # them are normalised to U+00B7 (middle dot).
    _NAME_SEPARATORS = (
        "\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
        "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
        "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\U00010101",
    )
    _NAME_SEPARATOR_TABLE = str.maketrans(dict.fromkeys(_NAME_SEPARATORS, "\u00B7"))

    # A box is treated as part of the address when it contains any of these.
    _ADDRESS_HINTS = (
        "址", "省", "市", "县", "街", "乡", "村", "镇",
        "区", "城", "组", "旗", "号", "户", "室",
    )

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Initialise the shared parser state from the OCR rows."""
        super().__init__(ocr_results)

    @staticmethod
    def _nearest_box(row, anchor):
        """Return the box in *row* whose centre is closest to ``row[anchor]``.

        Returns ``None`` when the row holds no other box.
        """
        origin = row[anchor].center
        best, best_dist = None, float("inf")
        for k, box in enumerate(row):
            if k == anchor:
                continue
            dist = math.hypot(box.center[0] - origin[0], box.center[1] - origin[1])
            if dist < best_dist:
                best, best_dist = box, dist
        return best

    def birth(self):
        """Derive the birthday from characters 6-13 of an 18-character ID.

        Does nothing when the stored ID is not exactly 18 characters long.
        The birthday inherits the ID's confidence.
        """
        id_text = self.res["id"].text
        if len(id_text) == 18:
            date = f"{id_text[6:10]}年{id_text[10:12]}月{id_text[12:14]}日"
            self.res["birthday"] = RecItem(date, self.res["id"].confidence)

    def card_no(self):
        """Locate the ID number, derive the gender and trim the row list.

        The first box containing an 18-character ID number wins. The gender
        comes from the parity of the 17th character (odd -> "男").

        ``self.result`` is then reduced to the rows used by the remaining
        extractors: when the ID row is one of the first two rows, the rows
        after it are kept in reverse order (presumably the card was scanned
        upside down - TODO confirm); otherwise the rows before it are kept.

        Raises:
            Exception: when no box contains an ID number.
        """
        for idx, row in enumerate(self.result):
            for box in row:
                match = self._ID_PATTERN.search(box.txt)
                if match is None:
                    continue
                number = match.group()
                self.res["id"].text = number
                self.res["id"].confidence = box.conf
                self.res["gender"].text = "男" if int(number[16]) % 2 else "女"
                self.res["gender"].confidence = box.conf
                if idx < 2:
                    self.result = self.result[idx + 1:]
                    self.result.reverse()
                else:
                    self.result = self.result[:idx]
                self._logger.debug("rows kept after locating the ID number: %s", self.result)
                return
        raise Exception('无法识别')

    def _locate_name(self):
        """Return ``(text, confidence)`` for the first name candidate, or None.

        A box containing "姓" or "名" is treated as the name label. When its
        row has other boxes, the nearest one is the name; when it is alone
        and longer than three characters, the text after "姓名" is the name.
        """
        for row in self.result:
            for j, box in enumerate(row):
                txt = box.txt
                if '姓' not in txt and '名' not in txt:
                    continue
                if len(row) > 1:
                    value = self._nearest_box(row, j)
                    if value is not None:
                        # Stop at the first hit so later rows (e.g. an address
                        # containing "名") cannot overwrite the name.
                        return value.txt, value.conf
                elif len(txt) > 3:
                    return txt.split("姓名")[-1], box.conf
        return None

    def name(self):
        """Extract the holder's name and the confidence of the box it came from.

        Names of five or more characters have every dot-like separator
        normalised to U+00B7; a long name without any separator is kept as
        recognised.

        Raises:
            Exception: when no name candidate is found.
        """
        found = self._locate_name()
        if found is None:
            raise Exception('无法识别')
        name_val, conf = found
        if len(name_val) >= 5:
            name_val = name_val.translate(self._NAME_SEPARATOR_TABLE)
        self.res["name"] = RecItem(name_val, conf)

    def national(self):
        """Extract the ethnicity from the first box containing "族".

        A short label (fewer than three characters) takes its value from the
        nearest box in the same row; a longer box is assumed to hold label and
        value together, and the text after the last "族" is used. The field is
        left untouched when nothing usable is found.
        """
        for row in self.result:
            for j, box in enumerate(row):
                txt = box.txt
                if '族' not in txt:
                    continue
                if len(txt) < 3:
                    # Label and value were recognised as separate boxes.
                    value = self._nearest_box(row, j)
                    if value is not None:
                        self.res["ethnicity"] = RecItem(value.txt, value.conf)
                    # A lone label carries no value; leave the field empty
                    # rather than storing the label text itself.
                    return
                # Label and value share one box.
                self.res["ethnicity"] = RecItem(txt.split("族")[-1], box.conf)
                return

    def address(self):
        """Assemble the address from the rows after the first two.

        Boxes mentioning "性别", "出生" or "民族" are skipped. Punctuation is
        removed from the remaining boxes, and those containing an address
        keyword are concatenated (only the text after "址" is kept when that
        label is present). The address confidence is the mean of the
        contributing boxes; ``split_addr`` is called afterwards.

        Raises:
            Exception: when no box looks like part of an address, or when
                ``split_addr`` cannot resolve a region and a detail.
        """
        strip_punctuation = str.maketrans('', '', punctuation)
        parts = []
        confs = []
        for row in self.result[2:]:
            for box in row:
                txt = box.txt
                if '性别' in txt or '出生' in txt or '民族' in txt:
                    continue
                txt = txt.translate(strip_punctuation)
                if not any(hint in txt for hint in self._ADDRESS_HINTS):
                    continue
                # split() returns the whole text when "址" is absent.
                parts.append(txt.split("址")[-1])
                confs.append(box.conf)
        if not parts:
            raise Exception('无法识别')
        self.res["address"] = RecItem("".join(parts), np.mean(confs))
        self.split_addr()

    def split_addr(self):
        """Split the stored address into province, city, region and detail.

        Uses the first four columns of the first row returned by
        ``cpca.transform``. When no region is resolved and the detail
        contains "旗", the text up to and including the first "旗" becomes
        the region and the rest stays as the detail. All parts reuse the
        address confidence.

        Raises:
            Exception: when the region or the detail ends up empty.
        """
        conf = self.res["address"].confidence
        df = cpca.transform([self.res["address"].text])
        province = df.iloc[0, 0]
        city = df.iloc[0, 1]
        region = df.iloc[0, 2]
        detail = df.iloc[0, 3]
        self._logger.debug(
            "address %r -> province: %s, city: %s, region: %s, detail: %s",
            self.res["address"].text, province, city, region, detail,
        )
        # Missing parts are stored as empty strings so RecItem.text stays a str.
        self.res["address_province"] = RecItem(province or '', conf)
        self.res["address_city"] = RecItem(city or '', conf)
        if not region and detail and "旗" in detail:
            # partition() splits on the first "旗" only, so a later "旗"
            # (e.g. in a street name) stays inside the detail.
            head, sep, tail = detail.partition("旗")
            region, detail = head + sep, tail
        self.res["address_region"] = RecItem(region, conf)
        self.res["address_detail"] = RecItem(detail, conf)
        if not self.res['address_region'].text or not self.res['address_detail'].text:
            raise Exception('无法识别')

    def parse(self):
        """Run every extractor and return ``{field: {"text", "confidence"}}``.

        Raises:
            Exception: propagated from ``card_no``, ``name`` or ``address``
                when a required field cannot be recognised.
        """
        self.card_no()
        self.name()
        self.national()
        self.birth()
        self.address()
        return {key: self.res[key].to_dict() for key in self.keys}
class BackParser(Parser):
    """Extract the validity period found on the back side of an ID card."""

    # Dash look-alikes (em dash, en dash, full-width hyphen, minus sign)
    # are mapped to the ASCII hyphen before matching.
    _DASH_TABLE = str.maketrans(dict.fromkeys("\u2014\u2013\uFF0D\u2212", "-"))
    # Start date (8 digits), hyphen, then the 4-digit end year.
    _FIXED_TERM = re.compile(r"\d{8}-\d{4}")
    # Start date (8 digits), hyphen, then the long-term marker.
    _LONG_TERM = re.compile(r"\d{8}-长期")

    def __init__(self, ocr_results: List[List[OcrResult]]):
        """Initialise the shared parser state from the OCR rows."""
        super().__init__(ocr_results)

    def expire_date(self):
        """Find the validity period in the first box that contains one.

        Dots are removed from each box's text before matching. For a
        fixed-term period the stored value is ``YYYYMMDD-YYYYMMDD``, where the
        end month and day are copied from the start date rather than read from
        the text. For a long-term card the matched ``YYYYMMDD-长期`` text is
        stored as is.

        Raises:
            Exception: when no box contains a validity period.
        """
        for row in self.result:
            for box in row:
                txt = box.txt.replace('.', '').translate(self._DASH_TABLE)
                match = self._FIXED_TERM.search(txt)
                if match:
                    span = match.group()
                    # span[4:8] is the MMDD part of the start date.
                    self.res["expire_date"] = RecItem(span + span[4:8], box.conf)
                    return
                match = self._LONG_TERM.search(txt)
                if match:
                    self.res["expire_date"] = RecItem(match.group(), box.conf)
                    return
        raise Exception('无法识别')

    def parse(self):
        """Extract the validity period and return ``{field: {"text", "confidence"}}``.

        Raises:
            Exception: when the validity period cannot be recognised.
        """
        self.expire_date()
        # Defensive check; expire_date() already raises when nothing matches.
        if not self.res["expire_date"].text:
            raise Exception("无法识别")
        return {key: self.res[key].to_dict() for key in self.keys}
|