|
@@ -33,17 +33,15 @@ class Parser(object):
|
|
|
for key in self.keys:
|
|
|
self.res[key] = RecItem()
|
|
|
|
|
|
- for i in range(len(self.result)):
|
|
|
- temp = [self.result[i][0].txt, self.result[i][0].conf]
|
|
|
- for j in range(len(self.result[i])):
|
|
|
- self.result[i][j].txt = self.result[i][j].txt \
|
|
|
- .replace("|", "").replace(":", "").replace(":", "").replace(",", "").replace(",", "") \
|
|
|
- .replace("【", "").replace("】", "").replace("「", "").replace("[", "").replace("]", "") \
|
|
|
- .replace(" ", "")
|
|
|
- for k in range(1, len(self.result[i])):
|
|
|
- temp[0] = temp[0] + self.result[i][k].txt
|
|
|
- temp[1] = np.mean([temp[1], self.result[i][k].conf])
|
|
|
- self.result[i].append(temp)
|
|
|
+ for item in self.result:
|
|
|
+ temp = [item[0].txt, item[0].conf]
|
|
|
+ for j in range(len(item)):
|
|
|
+ item[j].txt = item[j].txt.replace("|", "").replace(":", "").replace(":", "").replace(",", "").replace(",", "").replace("【", "").replace("】", "").replace("「", "").replace("[", "").replace("]", "").replace(" ", "")
|
|
|
+
|
|
|
+ for k in range(1, len(item)):
|
|
|
+ temp[0] = temp[0] + item[k].txt
|
|
|
+ temp[1] = np.mean([temp[1], item[k].conf])
|
|
|
+ item.append(temp)
|
|
|
|
|
|
def parse(self):
|
|
|
return self.res
|
|
@@ -83,7 +81,7 @@ class FrontRegBookParser(Parser):
|
|
|
or "街" in txt
|
|
|
|
|
|
):
|
|
|
- address_txt = txt
|
|
|
+ address_txt = txt.split("民族")[0]
|
|
|
break
|
|
|
|
|
|
if address_txt is not None:
|
|
@@ -100,9 +98,9 @@ class FrontRegBookParser(Parser):
|
|
|
df.replace([None], [''])
|
|
|
|
|
|
province = df.iloc[0, 0]
|
|
|
- city = df.iloc[0, 1]
|
|
|
- region = df.iloc[0, 2]
|
|
|
- detail = df.iloc[0, 3]
|
|
|
+ city = df.iloc[0, 1] or ""
|
|
|
+ region = df.iloc[0, 2] or ""
|
|
|
+ detail = df.iloc[0, 3] or ""
|
|
|
print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
|
|
|
self.res["address_province"] = RecItem(province, conf)
|
|
|
self.res["address_city"] = RecItem(city, conf)
|
|
@@ -116,6 +114,10 @@ class FrontRegBookParser(Parser):
|
|
|
self.res["address_region"] = RecItem(region, conf)
|
|
|
self.res["address_detail"] = RecItem(detail, conf)
|
|
|
|
|
|
+ city_dic = {"宜城市":"宣城市"}
|
|
|
+ if city in city_dic:
|
|
|
+ city = city_dic[city]
|
|
|
+
|
|
|
self.res['address'].text = province + city + region + detail
|
|
|
|
|
|
# 存入
|
|
@@ -134,12 +136,17 @@ class PeopleRegBookParser(Parser):
|
|
|
"""
|
|
|
name_val = ''
|
|
|
conf = 0.
|
|
|
+ is_name = False
|
|
|
for i in range(len(self.result)):
|
|
|
res = self.result[i]
|
|
|
txt = res[-1][0]
|
|
|
conf = res[-1][1]
|
|
|
- if "姓名" in txt:
|
|
|
- name_val = txt.split("姓名")[-1].split("户主")[0]
|
|
|
+ for s in range(len(txt)):
|
|
|
+ if txt[s] == "名" and s < 2 and "姓名" in txt:
|
|
|
+ is_name = True
|
|
|
+ if is_name:
|
|
|
+ name_val = txt.split("姓名")[-1].split("户主")[0].split("中主")[0]
|
|
|
+ break
|
|
|
|
|
|
if len(name_val) < 5:
|
|
|
self.res["name"] = RecItem(name_val, conf)
|
|
@@ -238,7 +245,7 @@ class PeopleRegBookParser(Parser):
|
|
|
txt = res[-1][0]
|
|
|
birth_place_conf = res[-1][1]
|
|
|
if "出生地" in txt:
|
|
|
- birth_place_txt = txt.split('民族')[0]
|
|
|
+ birth_place_txt = txt.split('民族')[0].split('民')[0]
|
|
|
break
|
|
|
|
|
|
if birth_place_txt:
|
|
@@ -256,7 +263,7 @@ class PeopleRegBookParser(Parser):
|
|
|
res = self.result[i]
|
|
|
txt = res[-1][0]
|
|
|
native_place_conf = res[-1][1]
|
|
|
- if '籍贯' in txt:
|
|
|
+ if '贯' in txt and '出' in txt:
|
|
|
native_place_txt = txt.split('出生')[0]
|
|
|
break
|
|
|
|