123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249 |
import logging
import re
import string
from collections import defaultdict
from dataclasses import dataclass

import cpca
import numpy as np
@dataclass
class RecItem:
    """One recognised field: the extracted text and its confidence score."""

    text: str = ''
    confidence: float = 0.

    def to_dict(self) -> dict:
        """Return the item as a plain ``{"text": ..., "confidence": ...}`` dict."""
        return dict(text=self.text, confidence=self.confidence)
class Parser:
    """Base parser holding recognised text lines and their confidences.

    ``self.res`` maps field names to ``RecItem`` values. It is a
    ``defaultdict``, so looking up an unknown field yields an empty
    ``RecItem`` instead of raising ``KeyError``.
    """

    def __init__(self, txts, confs):
        """Store the inputs and pre-populate every known field as empty.

        Args:
            txts: recognised text lines.
            confs: one confidence value per entry of ``txts``.

        Raises:
            AssertionError: if ``txts`` and ``confs`` differ in length.
        """
        self.result, self.confs = txts, confs
        assert len(self.result) == len(self.confs), 'result and confs do not match'

        fields = defaultdict(RecItem)
        for key in (
            "Name",
            "IDNumber",
            "Address",
            "Gender",
            "Nationality",
            "Birth",
            "expire_date",
        ):
            fields[key] = RecItem()
        self.res = fields

    def parse(self):
        """Return the field mapping; the base class extracts nothing."""
        return self.res

    @property
    def confidence(self):
        """Overall confidence; the base class always reports ``0.0``."""
        return 0.0
class FrontParser(Parser):
    """Fill ``self.res`` from the OCR text lines of an ID card.

    Every field method scans ``self.result`` (the input lines with spaces
    and ASCII punctuation removed) and stores what it finds in ``self.res``.
    ``full_name`` and ``address`` overwrite the lines they consume with a
    placeholder so later steps ignore them, which makes the call order in
    :meth:`parse` significant.
    """

    _logger = logging.getLogger(__name__)

    # Placeholder written over a line once a field has consumed it.
    _CONSUMED = "temp"
    # A line containing any of these characters is treated as address text.
    _ADDRESS_MARKERS = (
        "址", "省", "市", "县", "街", "乡", "村", "镇", "区", "城", "组", "号",
    )
    # A line containing any of these labels cannot be a bare name.
    _NON_NAME_LABELS = ("性别", "姓名", "民族", "住址", "出生", "号码", "身份")

    _ID_WITH_X_RE = re.compile(r"\d*[Xx]")
    _ID_DIGITS_RE = re.compile(r"\d{16,18}")
    _NAME_RE = re.compile(r"名([\u4e00-\u9fa5]{1,4})")
    _NATION_RE = re.compile(r"民族([\u4e00-\u9fa5]+)")
    _BARE_NAME_RE = re.compile(r"[\u4e00-\u9fa5]{2,4}")
    # "YYYY.MM.DD-YYYY.MM.DD", or an end value of "长期" (long-term).
    _VALIDITY_RE = re.compile(
        r"\d{4}\.\d{2}\.\d{2}-(?:\d{4}\.\d{2}\.\d{2}|长期)"
    )

    def __init__(self, txts, confs):
        """Store the inputs and build the cleaned working copy of the lines.

        Args:
            txts: recognised text lines.
            confs: one confidence value per entry of ``txts``.

        Raises:
            AssertionError: if ``txts`` and ``confs`` differ in length
                (raised by ``Parser.__init__``).
        """
        super().__init__(txts, confs)
        # Keep the untouched lines: the validity-period pattern needs the
        # '.' and '-' characters that the cleaning step below removes.
        self._raw_txts = list(txts)
        strip_punct = str.maketrans("", "", string.punctuation)
        self.result = [t.replace(" ", "").translate(strip_punct) for t in txts]

    def birth(self):
        """Set ``Birth`` to the text after the last "生" of the first such line."""
        for txt, conf in zip(self.result, self.confs):
            if "生" in txt:
                self.res["Birth"] = RecItem(txt.split("生")[-1].strip(), conf)
                break

    def card_no(self):
        """Set ``IDNumber`` from the first 18-character number found.

        ``Gender`` is derived from the same number: an odd 17th digit gives
        "男", an even one gives "女".
        """
        for txt, conf in zip(self.result, self.confs):
            has_x = "X" in txt or "x" in txt
            pattern = self._ID_WITH_X_RE if has_x else self._ID_DIGITS_RE
            found = pattern.findall(txt)
            # Shorter matches are ignored and the scan continues.
            if found and len(found[0]) == 18:
                number = found[0]
                self.res["IDNumber"] = RecItem(number, conf)
                self.res["Gender"] = RecItem(
                    "男" if int(number[16]) % 2 else "女", conf
                )
                break

    def full_name(self):
        """Set ``Name`` to the 1-4 CJK characters following the first "名".

        The source line is then replaced by the placeholder so that the
        name cannot be mistaken for address text.
        """
        for i, txt in enumerate(self.result):
            if "名" not in txt or len(txt) <= 2:
                continue
            match = self._NAME_RE.search(txt)
            if match:
                # Use the captured group rather than splitting on "名", so a
                # name that itself contains "名" is not truncated.
                self.res["Name"] = RecItem(match.group(1), self.confs[i])
                self.result[i] = self._CONSUMED
                break

    def gender(self):
        """Set ``Gender`` from the first line containing "男" or "女".

        Does nothing when ``Gender`` is already filled (for example by
        :meth:`card_no`).
        """
        if self.res["Gender"].text:
            return
        for txt, conf in zip(self.result, self.confs):
            if "男" in txt:
                self.res["Gender"] = RecItem("男", conf)
                break
            if "女" in txt:
                self.res["Gender"] = RecItem("女", conf)
                break

    def national(self):
        """Set ``Nationality`` to the CJK run following "民族"."""
        for txt, conf in zip(self.result, self.confs):
            match = self._NATION_RE.search(txt)
            if match:
                # Captured group instead of split("族"), which returned an
                # empty string whenever the value itself ended in "族".
                self.res["Nationality"] = RecItem(match.group(1), conf)
                break

    def address(self):
        """Concatenate all address-looking lines into ``Address``.

        A line counts as address text when it contains one of
        ``_ADDRESS_MARKERS``. Lines containing "址" or "省" are moved to the
        front. The stored confidence is the mean over the contributing
        lines, and each contributing line is replaced by the placeholder.
        """
        parts = []
        confs = []
        for i, raw in enumerate(self.result):
            txt = raw.replace("号码", "")
            if "公民" in txt:
                # The ID-number label line is not address text.
                continue
            if not any(marker in txt for marker in self._ADDRESS_MARKERS):
                continue
            if "址" in txt or "省" in txt:
                parts.insert(0, txt.split("址")[-1])
            else:
                parts.append(txt)
            confs.append(self.confs[i])
            self.result[i] = self._CONSUMED
        if parts:
            self.res["Address"] = RecItem("".join(parts), float(np.mean(confs)))
        self._logger.debug("addr: %s", self.res["Address"])

    def split_addr(self):
        """Split ``Address`` into province/city/region/detail via ``cpca``.

        Reads the first four columns of the first row returned by
        ``cpca.transform`` and stores them under ``address_province``,
        ``address_city``, ``address_region`` and ``address_detail``, all
        with the confidence of ``Address``.
        """
        address = self.res["Address"]
        if not address.text:
            return
        conf = address.confidence
        df = cpca.transform([address.text])
        # NOTE(review): assumes the first four columns are province, city,
        # region and detail, in that order - confirm against the installed
        # cpca version.
        province, city, region, detail = (df.iloc[0, col] for col in range(4))
        self._logger.debug(
            "province: %s, city: %s, region: %s, detail: %s",
            province, city, region, detail,
        )
        self.res["address_province"] = RecItem(province, conf)
        self.res["address_city"] = RecItem(city, conf)
        self.res["address_region"] = RecItem(region, conf)
        self.res["address_detail"] = RecItem(detail, conf)

    def expire_date(self):
        """Set ``expire_date`` from a ``YYYY.MM.DD-YYYY.MM.DD`` period.

        The search runs on the original lines (spaces removed), because the
        cleaned lines no longer contain the '.' and '-' separators. When
        several lines match, the last one wins.
        """
        for raw, conf in zip(self._raw_txts, self.confs):
            match = self._VALIDITY_RE.search(raw.replace(" ", ""))
            if match:
                self.res["expire_date"] = RecItem(match.group(0), conf)

    def predict_name(self):
        """Guess ``Name`` when OCR did not return it next to its label.

        Takes the first line of 2-4 characters that carries none of the
        known field labels and contains a run of 2-4 CJK characters. This
        is a heuristic and is not called by :meth:`parse`.
        """
        if self.res["Name"].text:
            return
        for txt, conf in zip(self.result, self.confs):
            if not 1 < len(txt) < 5:
                continue
            if any(label in txt for label in self._NON_NAME_LABELS):
                continue
            found = self._BARE_NAME_RE.findall(txt)
            if found:
                self.res["Name"] = RecItem(found[0], conf)
                break

    @property
    def confidence(self):
        """Mean of all line confidences, or ``0.0`` when there are none."""
        if len(self.confs) == 0:
            return 0.
        return float(np.mean(self.confs))

    def parse(self):
        """Run every extractor and return the field mapping.

        The order matters: ``full_name`` and ``address`` blank out the lines
        they consume, and ``gender`` only runs its own search when
        ``card_no`` did not already derive a value.
        """
        self.full_name()
        self.national()
        self.card_no()
        self.address()
        self.split_addr()
        # predict_name() is deliberately left out of the pipeline.
        self.birth()
        self.gender()
        self.expire_date()
        return self.res
class BackParser(Parser):
    """Extract the validity period from OCR text lines.

    Only ``expire_date`` is filled; every other field of ``self.res`` keeps
    the empty default set up by ``Parser``.
    """

    _logger = logging.getLogger(__name__)

    # "YYYY.MM.DD-YYYY.MM.DD", or an end value of "长期" (long-term).
    _VALIDITY_RE = re.compile(
        r"\d{4}\.\d{2}\.\d{2}-(?:\d{4}\.\d{2}\.\d{2}|长期)"
    )

    def expire_date(self):
        """Set ``expire_date`` from a ``YYYY.MM.DD-YYYY.MM.DD`` period.

        Spaces are ignored while matching. When several lines match, the
        last one wins.
        """
        for txt, conf in zip(self.result, self.confs):
            match = self._VALIDITY_RE.search(txt.replace(" ", ""))
            if match:
                self._logger.debug("expire_date match: %s", match.group(0))
                self.res["expire_date"] = RecItem(match.group(0), conf)

    @property
    def confidence(self):
        """Mean of all line confidences, or ``0.0`` when there are none."""
        if len(self.confs) == 0:
            return 0.
        return float(np.mean(self.confs))

    def parse(self):
        """Run the extractor and return the field mapping."""
        self.expire_date()
        return self.res
|