|
@@ -1,34 +1,54 @@
|
|
|
import re
|
|
|
-import json
|
|
|
import string
|
|
|
-
|
|
|
-
|
|
|
-class IdCardStraight:
|
|
|
+from dataclasses import dataclass
|
|
|
+from collections import defaultdict
|
|
|
+import numpy as np
|
|
|
+import cpca
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class RecItem:
    """One recognized field: the extracted text and its OCR confidence."""

    # Extracted field value; empty string until a parser fills it in.
    text: str = ''
    # Confidence attached to ``text``; 0.0 until a parser fills it in.
    confidence: float = 0.

    def to_dict(self):
        """Return the item as a plain ``{"text": ..., "confidence": ...}`` dict.

        The confidence is coerced to a built-in ``float``: parsers in this
        file may store NumPy scalars (e.g. the result of ``np.mean``), and
        some NumPy scalar types are not accepted by ``json.dumps``.
        """
        return {"text": self.text, "confidence": float(self.confidence)}
|
|
|
+
|
|
|
+
|
|
|
class Parser:
    """Base class holding recognized text lines, their confidences and the
    table of result fields that concrete parsers fill in."""

    def __init__(self, txts, confs):
        """Store the inputs and create one empty ``RecItem`` per known field.

        ``txts`` and ``confs`` must have the same length; this is checked
        with an ``assert``.
        """
        self.result, self.confs = txts, confs
        assert len(self.result) == len(self.confs), 'result and confs do not match'

        # Unknown keys read later still yield an empty RecItem.
        self.res = defaultdict(RecItem)
        field_names = (
            "Name",
            "IDNumber",
            "Address",
            "Gender",
            "Nationality",
            "Birth",
            "expire_date",
        )
        for field_name in field_names:
            self.res[field_name] = RecItem()

    def parse(self):
        """Return the result table; the base class extracts nothing."""
        return self.res

    @property
    def confidence(self):
        """Overall confidence; the base class always reports 0."""
        return 0.0
|
|
|
+
|
|
|
+class FrontParser(Parser):
|
|
|
"""
|
|
|
"""
|
|
|
|
|
|
- def __init__(self, result):
|
|
|
def __init__(self, txts, confs):
    """Initialise the parser and normalise the recognized text lines.

    ``Parser.__init__`` stores ``confs``, checks that ``txts`` and ``confs``
    have the same length, and creates the empty result fields. Afterwards
    ``self.result`` is replaced by a cleaned copy of ``txts`` in which
    spaces and ASCII punctuation (``string.punctuation``) are removed.
    """
    Parser.__init__(self, txts, confs)
    # Build the deletion table once instead of once per line.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    # One output line per input line, so the length check performed by
    # Parser.__init__ still holds for the cleaned list.
    self.result = [
        line.replace(" ", "").translate(strip_punctuation)
        for line in txts
    ]
|
|
|
+
|
|
|
+ def birth(self):
|
|
|
addString = []
|
|
|
for i in range(len(self.result)):
|
|
|
txt = self.result[i]
|
|
@@ -36,16 +56,11 @@ class IdCardStraight:
|
|
|
# txt = txt.replace("出生", "")
|
|
|
txt = txt.split('生')[-1]
|
|
|
addString.append(txt.strip())
|
|
|
- self.res["year"] = "".join(addString)
|
|
|
-
|
|
|
- # break
|
|
|
- # print(',,,,')
|
|
|
- # print(self.result)
|
|
|
- # txt = txt.replace("出生", "")
|
|
|
- # addString.append(txt)
|
|
|
- # print(txt)
|
|
|
- # self.res["year"] = "".join(addString)
|
|
|
- def birth_no(self):
|
|
|
+ self.res["Birth"].text = "".join(addString)
|
|
|
+ self.res["Birth"].confidence = self.confs[i]
|
|
|
+ break
|
|
|
+
|
|
|
+ def card_no(self):
|
|
|
"""
|
|
|
身份证号码
|
|
|
"""
|
|
@@ -60,35 +75,42 @@ class IdCardStraight:
|
|
|
|
|
|
if len(res) > 0:
|
|
|
if len(res[0]) == 18:
|
|
|
- self.res["IDNumber"] = res[0].replace("号码", "")
|
|
|
- self.res["Gender"] = "男" if int(res[0][16]) % 2 else "女"
|
|
|
+ self.res["IDNumber"].text = res[0].replace("号码", "")
|
|
|
+ self.res["IDNumber"].confidence = self.confs[i]
|
|
|
+ self.res["Gender"].text = "男" if int(res[0][16]) % 2 else "女"
|
|
|
+ self.res["Gender"].confidence = self.confs[i]
|
|
|
break
|
|
|
|
|
|
def full_name(self):
    """
    Extract the holder's name from the recognized text lines.

    Looks for the first line longer than two characters in which "名" is
    followed by one to four CJK characters. Those characters are stored in
    ``self.res["Name"].text`` and the line's confidence in
    ``self.res["Name"].confidence``. The matched line is then overwritten
    with "temp" and the scan stops.
    """
    for i, txt in enumerate(self.result):
        # The original test `("姓名" or "名" in txt)` was always true, so
        # only the length check had any effect; the regex below already
        # requires "名" to be present.
        if len(txt) <= 2:
            continue
        # Capture the characters after the first "名" directly. Splitting on
        # "名" returned an empty string when the name itself contained "名".
        match = re.search(r"名([\u4e00-\u9fa5]{1,4})", txt)
        if match is None:
            continue
        self.res["Name"].text = match.group(1)
        self.res["Name"].confidence = self.confs[i]
        self.result[i] = "temp"  # keep the name line from interfering with address parsing
        break
|
|
|
|
|
|
- def sex(self):
|
|
|
def gender(self):
    """
    Fill in the gender from the text lines (e.g. a line like "性别女民族汉").

    Does nothing when ``self.res["Gender"].text`` is already non-empty.
    Otherwise the first line containing "男" or "女" decides the value
    ("男" is checked first within a line) and supplies the confidence.
    """
    if self.res["Gender"].text:
        return

    for idx, line in enumerate(self.result):
        if "男" in line:
            label = "男"
        elif "女" in line:
            label = "女"
        else:
            continue
        self.res["Gender"].text = label
        self.res["Gender"].confidence = self.confs[idx]
        return
|
|
|
|
|
|
def national(self):
|
|
|
# 性别女民族汉
|
|
@@ -97,7 +119,8 @@ class IdCardStraight:
|
|
|
res = re.findall(".*民族[\u4e00-\u9fa5]+", txt)
|
|
|
|
|
|
if len(res) > 0:
|
|
|
- self.res["Nationality"] = res[0].split("族")[-1]
|
|
|
+ self.res["Nationality"].text = res[0].split("族")[-1]
|
|
|
+ self.res["Nationality"].confidence = self.confs[i]
|
|
|
break
|
|
|
|
|
|
def address(self):
|
|
@@ -105,6 +128,7 @@ class IdCardStraight:
|
|
|
身份证地址
|
|
|
"""
|
|
|
addString = []
|
|
|
+ conf = []
|
|
|
for i in range(len(self.result)):
|
|
|
txt = self.result[i]
|
|
|
txt = txt.replace("号码", "")
|
|
@@ -132,13 +156,31 @@ class IdCardStraight:
|
|
|
addString.insert(0, txt.split("址")[-1])
|
|
|
else:
|
|
|
addString.append(txt)
|
|
|
-
|
|
|
+ conf.append(self.confs[i])
|
|
|
self.result[i] = "temp"
|
|
|
# print(addString)
|
|
|
if len(addString) > 0:
|
|
|
- self.res["Address"] = "".join(addString)
|
|
|
- else:
|
|
|
- self.res["Address"] = ""
|
|
|
+ self.res["Address"].text = "".join(addString)
|
|
|
+ self.res["Address"].confidence = np.mean(conf)
|
|
|
+ print(f'addr: {self.res["Address"]}')
|
|
|
+
|
|
|
def split_addr(self):
    """Split the parsed address into province / city / region / detail.

    Does nothing when ``self.res["Address"].text`` is empty. Otherwise the
    address is passed to ``cpca.transform`` and the first four columns of
    the first row are stored as ``address_province``, ``address_city``,
    ``address_region`` and ``address_detail``. Each reuses the confidence
    of the full address.
    """
    address = self.res["Address"]
    if not address.text:
        return

    # The debug prints were removed: they wrote the holder's address to stdout.
    frame = cpca.transform([address.text])

    # NOTE(review): relies on cpca's column order being province, city,
    # region, detail (same positional indexing as before) -- confirm against
    # the installed cpca version.
    targets = (
        "address_province",
        "address_city",
        "address_region",
        "address_detail",
    )
    for column, key in enumerate(targets):
        value = frame.iloc[0, column]
        # Presumably cpca leaves unresolved components as None/NaN (TODO
        # confirm); keep RecItem.text a string in that case.
        text = value if isinstance(value, str) else ''
        self.res[key] = RecItem(text, address.confidence)
|
|
|
+
|
|
|
|
|
|
def predict_name(self):
|
|
|
"""
|
|
@@ -163,12 +205,40 @@ class IdCardStraight:
|
|
|
self.res["Name"] = result[0]
|
|
|
break
|
|
|
|
|
|
- def run(self):
|
|
|
@property
def confidence(self):
    """Mean confidence over all recognized lines.

    Returns 0.0 when there are no lines, instead of the NaN (plus
    RuntimeWarning) that ``np.mean`` produces for an empty input.
    """
    if len(self.confs) == 0:
        return 0.0
    return float(np.mean(self.confs))
|
|
|
+
|
|
|
def parse(self):
    """Run every extraction step in order and return the result table.

    Order: name, nationality, ID number, address, address split, birth,
    gender. ``predict_name`` is currently not part of the pipeline.
    """
    pipeline = (
        "full_name",
        "national",
        "card_no",
        "address",
        "split_addr",
        # "predict_name" is disabled
        "birth",
        "gender",
    )
    for step_name in pipeline:
        getattr(self, step_name)()
    return self.res
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class BackParser(Parser):
    """Parser for the back side of the card: extracts the validity period."""

    # "YYYY.MM.DD-YYYY.MM.DD", or "YYYY.MM.DD-长期" as printed on cards whose
    # validity is long-term. Raw string: the former non-raw pattern relied on
    # invalid escape sequences such as '\d'.
    _VALIDITY_RE = re.compile(r'\d{4}\.\d{2}\.\d{2}-(?:\d{4}\.\d{2}\.\d{2}|长期)')

    def __init__(self, txts, confs):
        """Store the text lines and confidences via ``Parser.__init__``."""
        Parser.__init__(self, txts, confs)

    def expire_date(self):
        """Store the validity period found in the text lines.

        Every line is searched. When several lines match, the last matching
        line wins (as before); within a line the first match is used. The
        line's confidence is stored with the value.
        """
        # The debug prints of every line and match were removed.
        for txt, conf in zip(self.result, self.confs):
            found = self._VALIDITY_RE.findall(txt)
            if found:
                self.res["expire_date"] = RecItem(found[0], conf)

    @property
    def confidence(self):
        """Mean confidence over all lines; 0.0 when there are none
        (``np.mean`` of an empty input would be NaN)."""
        if len(self.confs) == 0:
            return 0.0
        return float(np.mean(self.confs))

    def parse(self):
        """Extract the validity period and return the result table."""
        self.expire_date()
        return self.res
|