parser.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. import json
  2. import math
  3. import re
  4. import string
  5. from dataclasses import dataclass
  6. from collections import defaultdict
  7. from typing import List
  8. from core.line_parser import OcrResult
  9. import numpy as np
  10. import cpca
  11. import os
  12. f = open('./core/areas.json', 'r')
  13. content = f.read()
  14. areas = json.loads(content)
  15. @dataclass
  16. class RecItem:
  17. text: str = ''
  18. confidence: float = 0.
  19. def to_dict(self):
  20. return {"text": self.text, "confidence": np.nan_to_num(self.confidence)}
  21. # 父类
  22. class Parser(object):
  23. def __init__(self, ocr_results: List[List[OcrResult]]):
  24. self.result = ocr_results
  25. self.res = defaultdict(RecItem)
  26. self.keys = ['type', "address", 'address_province', 'address_city', 'address_region', 'address_detail',
  27. 'name', 'id', 'gender',
  28. # 出生地
  29. 'birthplace', 'birthplace_province', 'birthplace_city', 'birthplace_region',
  30. # 籍贯
  31. 'native_place', 'native_place_province', 'native_place_city', 'native_place_region',
  32. 'blood_type', 'religion']
  33. for key in self.keys:
  34. self.res[key] = RecItem()
  35. for item in self.result:
  36. temp = [item[0].txt, item[0].conf]
  37. for j in range(len(item)):
  38. item[j].txt = item[j].txt.replace("|", "").replace(":", "").replace(":", "").replace(",", "").replace(
  39. ",", "").replace("【", "").replace("】", "").replace("「", "").replace("[", "").replace("]",
  40. "").replace(
  41. " ", "")
  42. for k in range(1, len(item)):
  43. temp[0] = temp[0] + item[k].txt
  44. temp[1] = np.mean([temp[1], item[k].conf])
  45. item.append(temp)
  46. def parse(self):
  47. return self.res
  48. # 1 户口本首页
  49. class FrontRegBookParser(Parser):
  50. def type_(self):
  51. """
  52. 户别
  53. """
  54. for i in range(len(self.result)):
  55. res = self.result[i]
  56. txt = res[-1][0]
  57. conf = res[-1][1]
  58. type_list = ["家庭户", "集体户", "居民户", "农业户"]
  59. for t in type_list:
  60. if t in txt:
  61. for _, temp_res in enumerate(res):
  62. if t in temp_res.txt:
  63. temp_type = temp_res.txt
  64. self.res["type"] = RecItem(temp_type, conf)
  65. return
  66. break
  67. # if "户别" in txt and "户主" in txt:
  68. # temp_type = txt.split("户别")[-1].split("户主")[0]
  69. # elif "户别" in txt:
  70. # for t in type_list:
  71. # if t in txt:
  72. # temp = txt.split(t)[0]
  73. # temp_type = t if temp == "户别" else txt.split("户别")[-1].split(t)[0] + t
  74. # break
  75. # elif "户主" in txt:
  76. # temp_type = txt.split("户主")[0]
  77. #
  78. # if temp_type:
  79. # self.res["type"] = RecItem(temp_type, conf)
  80. # break
  81. def address(self):
  82. """
  83. 首页住址
  84. """
  85. address_txt = ''
  86. address_conf = 0.
  87. for i in range(len(self.result)):
  88. res = self.result[i]
  89. txt = res[-1][0]
  90. address_conf = res[-1][1]
  91. if (
  92. "住址" in txt
  93. or "住" in txt
  94. or "址" in txt
  95. or "省" in txt
  96. or "市" in txt
  97. or "县" in txt
  98. or "街" in txt
  99. ):
  100. address_txt = txt.split("民族")[0]
  101. break
  102. if address_txt is not None:
  103. self.res["address"] = RecItem(address_txt, address_conf)
  104. self.split_addr()
  105. return
  106. raise Exception('无法识别')
  107. # 校准 区
  108. def cal_region(self, province, city, region, temp_region, areas):
  109. for _, json_province in enumerate(areas):
  110. if province in json_province['name']:
  111. for _, json_city in enumerate(json_province['children']):
  112. if city in json_city['name']:
  113. maxnum = 0
  114. for _, json_region in enumerate(json_city['children']):
  115. # 字符串的校准
  116. # 1. 如果长度相等
  117. num = 0
  118. if len(temp_region) == len(json_region['name']):
  119. for i in range(len(temp_region)):
  120. if temp_region[i] == json_region['name'][i]:
  121. num += 1
  122. # 2. 长度不等,temp_region至少 >=2,但是一般小于真实的地址
  123. elif len(temp_region) < len(json_region['name']):
  124. for i in range(len(temp_region)):
  125. for j in range(len(json_region['name'])):
  126. if temp_region[i] == json_region['name'][j]:
  127. # 找到了就退出,因为一般只会有一个字相同
  128. num += 1
  129. break
  130. if maxnum <= num:
  131. maxnum = num
  132. region = json_region['name']
  133. break
  134. break
  135. return region
  136. # 校准 市
  137. def cal_city(self, province, city, temp_city, areas):
  138. for _, json_province in enumerate(areas):
  139. if province in json_province['name']:
  140. maxnum = 0
  141. for _, json_city in enumerate(json_province['children']):
  142. num = 0
  143. if len(temp_city) == len(json_city['name']):
  144. for i in range(len(temp_city)):
  145. if temp_city[i] == json_city['name'][i]:
  146. num += 1
  147. elif len(temp_city) < len(json_city['name']):
  148. for i in range(len(temp_city)):
  149. for j in range(len(json_city['name'])):
  150. if temp_city[i] == json_city['name'][j]:
  151. num += 1
  152. break
  153. if maxnum <= num:
  154. maxnum = num
  155. city = json_city['name']
  156. break
  157. return city
  158. # 校准 区 ----> 没有市的情况下
  159. def cal_region_non_city(self, province, region, temp_region, areas):
  160. for _, json_province in enumerate(areas):
  161. if province in json_province['name']:
  162. for _, json_city in enumerate(json_province['children']):
  163. maxnum = 0
  164. for _, json_region in enumerate(json_city['children']):
  165. num = 0
  166. if len(temp_region) == len(json_region['name']):
  167. for i in range(len(temp_region)):
  168. if temp_region[i] == json_region['name'][i]:
  169. num += 1
  170. elif len(temp_region) < len(json_region['name']):
  171. for i in range(len(temp_region)):
  172. for j in range(len(json_region['name'])):
  173. if temp_region[i] == json_region['name'][j]:
  174. # 找到了就退出,因为一般只会有一个字相同
  175. num += 1
  176. break
  177. if maxnum <= num:
  178. maxnum = num
  179. region = json_region['name']
  180. break
  181. return region
  182. def split_addr(self):
  183. print(self.res['address'].text, '=======')
  184. pre_addr = self.res['address'].text
  185. # 一般"户主姓名" 比 "住址" 检测框大,所以会跑到和住址一行,并且在"住址"字段后面
  186. if "户主姓名" in pre_addr:
  187. pre_addr = pre_addr.split("户主姓名")[-1]
  188. conf = self.res["address"].confidence
  189. df = cpca.transform([pre_addr])
  190. print(df)
  191. df.replace([None], [''])
  192. province = df.iloc[0][0]
  193. city = df.iloc[0][1] or ""
  194. region = df.iloc[0][2] or ""
  195. street = df.iloc[0][3]
  196. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {street}')
  197. print("+++++++++++++++++++++")
  198. # 第一步,判断 df中的省市区是否存在,存在说明后面的street只要把原来的省市区丢掉即可
  199. if province and city and region:
  200. if "区" in street:
  201. street = street.split("市")[-1].split("区")[-1]
  202. if "县" in street:
  203. street = street.split("市")[-1].split("县")[-1]
  204. # if 其他的存在,比如:旗
  205. print("省市区都存在,只需要切割street中‘区’后面的内容")
  206. print(province, city, region, street)
  207. print("============================")
  208. # 第二步,判断 df中“区”是否存在,如果存在,那么 “省市区” 肯定都存在
  209. elif not region:
  210. # 判断 df中“市”是否存在,如果存在,那么 ”省市“ 肯定都存在
  211. if city:
  212. # 地址校准
  213. if '区' in street or '县' in street:
  214. if "区" in street:
  215. temp_region = street.split("区")[0] + "区"
  216. street = street.split("区")[-1]
  217. if "县" in street:
  218. temp_region = street.split("县")[0] + "县"
  219. street = street.split("县")[-1]
  220. region = self.cal_region(province, city, region, temp_region, areas)
  221. print("cpca没有检测到‘区’,但是检测到省,市,并且street中含有‘区’,需要校准区")
  222. print(province, city, region, street)
  223. print("============================")
  224. else:
  225. # (暂时)说明street没有必要切割了
  226. print("cpca没有检测到‘区’,但是检测到省,市,并且street中没有有‘区’,就不需要校准区")
  227. # 这里比较复杂
  228. elif not city:
  229. # 还是先校准“区”,如果区存在,就直接把区拿到,然后再进行一次cpca,
  230. # 如果“区”不存在,就直接校准“市”,street直接切割
  231. if '区' in street or '县' in street:
  232. if "区" in street:
  233. temp_region = street.split('市')[-1].split('区')[0] + "区"
  234. street = street.split('区')[-1]
  235. if "县" in street:
  236. temp_region = street.split('市')[-1].split('县')[0] + "县"
  237. street = street.split('县')[-1]
  238. region = self.cal_region_non_city(province, region, temp_region, areas)
  239. addr = province + city + region + street
  240. df = cpca.transform([addr])
  241. province = df.iloc[0][0]
  242. city = df.iloc[0][1] or ""
  243. region = df.iloc[0][2] or ""
  244. street = df.iloc[0][3]
  245. print("cpca只检测到了省,但是street中有区,直接分割出区,再做cpca即可")
  246. print(province, city, region, street)
  247. print("============================")
  248. elif '市' not in street:
  249. # 市,区都没有,那就直接返回
  250. print("cpca只检测到了省,并且street中没有市,也没有区,直接跳过")
  251. print(province, city, region, street)
  252. print("============================")
  253. elif '市' in street:
  254. temp_city = street.split('市')[0] + '市'
  255. street = street.split('市')[-1]
  256. city = self.cal_city(province, city, temp_city, areas)
  257. print("cpca只检测到了省,并且street中有市,没有区,就只要校准city")
  258. print(province, city, region, street)
  259. print("============================")
  260. self.res["address_province"] = RecItem(province, conf)
  261. self.res["address_city"] = RecItem(city, conf)
  262. if street and "旗" in street:
  263. temp_region = []
  264. temp_region.insert(0, street.split("旗")[0] + "旗")
  265. self.res["address_region"] = RecItem(temp_region[0], conf)
  266. self.res["address_detail"] = RecItem(street.split("旗")[-1], conf)
  267. else:
  268. self.res["address_region"] = RecItem(region, conf)
  269. self.res["address_detail"] = RecItem(street, conf)
  270. city_dic = {"宜城市": "宣城市"}
  271. if city in city_dic:
  272. city = city_dic[city]
  273. self.res['address'].text = province + city + region + street
  274. # 存入
  275. def parse(self):
  276. self.type_()
  277. self.address()
  278. return {key: self.res[key].to_dict() for key in self.keys}
  279. # 0 常驻人口页
  280. class PeopleRegBookParser(Parser):
  281. def full_name(self):
  282. """
  283. 姓名
  284. """
  285. name_val = ''
  286. conf = 0.
  287. is_name = False
  288. for i in range(len(self.result)):
  289. res = self.result[i]
  290. txt = res[-1][0]
  291. conf = res[-1][1]
  292. for s in range(len(txt)):
  293. if txt[s] == "名" and s < 2 and "姓名" in txt:
  294. is_name = True
  295. if is_name:
  296. name_val = txt.split("姓名")[-1].split("户主")[0].split("中主")[0]
  297. break
  298. if len(name_val) < 5:
  299. self.res["name"] = RecItem(name_val, conf)
  300. else:
  301. point_unicode = ["\u2E31", "\u2218", "\u2219", "\u22C5", "\u25E6", "\u2981",
  302. "\u00B7", "\u0387", "\u05BC", "\u16EB", "\u2022", "\u2027",
  303. "\u2E30", "\uFF0E", "\u30FB", "\uFF65", "\u10101"]
  304. for n in range(len(point_unicode)):
  305. point = re.findall(point_unicode[n], name_val)
  306. if len(point) != 0:
  307. name_list = name_val.split(point[0])
  308. self.res['name'] = RecItem(name_list[0] + '\u00B7' + name_list[1], conf)
  309. return
  310. def ethnicity(self):
  311. """
  312. 性别女 <- id
  313. 民族汉
  314. """
  315. national_val = ''
  316. conf = 0.
  317. for i in range(len(self.result)):
  318. res = self.result[i]
  319. txt = res[-1][0]
  320. conf = res[-1][1]
  321. if "民族" in txt:
  322. national_val = txt.split("族")[-1]
  323. self.res["ethnicity"] = RecItem(national_val, conf)
  324. def card_no(self):
  325. """
  326. 身份证号码
  327. """
  328. code_val = 0
  329. conf = 0.
  330. for i in range(len(self.result)):
  331. res = self.result[i]
  332. txt = res[-1][0]
  333. conf = res[-1][1]
  334. if "X" in txt or "x" in txt:
  335. code_val = re.findall("\d*[X|x]", txt)
  336. else:
  337. code_val = re.findall("\d{16,18}", txt)
  338. if len(code_val) > 0:
  339. if len(code_val[0]) == 18:
  340. self.res["id"].text = code_val[0]
  341. self.res["id"].confidence = conf
  342. self.res["gender"].text = "男" if int(code_val[0][16]) % 2 else "女"
  343. self.res["gender"].confidence = conf
  344. print('---------------------')
  345. print(code_val[0])
  346. print('---------------------')
  347. return
  348. raise Exception('身份证号识别出错')
  349. def blood_type(self):
  350. """
  351. 血型
  352. """
  353. blood_val = ''
  354. conf = 0.
  355. for i in range(len(self.result)):
  356. res = self.result[i]
  357. txt = res[-1][0]
  358. conf = res[-1][1]
  359. if "血型" in txt:
  360. blood_val = txt.split("血型")[-1]
  361. self.res["blood_type"] = RecItem(blood_val, conf)
  362. def religion(self):
  363. """
  364. 宗教信仰
  365. """
  366. religion_val = ''
  367. conf = 0.
  368. for i in range(len(self.result)):
  369. res = self.result[i]
  370. txt = res[-1][0]
  371. conf = res[-1][1]
  372. if "宗教信仰" in txt:
  373. religion_val = txt.split("宗教信仰")[-1]
  374. self.res["religion"] = RecItem(religion_val, conf)
  375. def birthplace(self):
  376. """
  377. 出生地
  378. """
  379. birth_place_txt = ''
  380. birth_place_conf = 0.
  381. for i in range(len(self.result)):
  382. res = self.result[i]
  383. txt = res[-1][0]
  384. birth_place_conf = res[-1][1]
  385. if "出生地" in txt:
  386. birth_place_txt = txt.split('民族')[0].split('民')[0]
  387. break
  388. if birth_place_txt:
  389. self.res["birthplace"] = RecItem(birth_place_txt, birth_place_conf)
  390. self.split_addr("birth")
  391. def native_place(self):
  392. """
  393. 籍贯
  394. """
  395. native_place_txt = ''
  396. native_place_conf = 0.
  397. for i in range(len(self.result)):
  398. res = self.result[i]
  399. txt = res[-1][0]
  400. native_place_conf = res[-1][1]
  401. if '贯' in txt and '出' in txt:
  402. native_place_txt = txt.split('出生')[0]
  403. break
  404. if native_place_txt:
  405. self.res["native_place"] = RecItem(native_place_txt, native_place_conf)
  406. self.split_addr("native")
  407. def split_addr(self, place: str):
  408. if place == "birth":
  409. place = "birthplace"
  410. elif place == "native":
  411. place = "native_place"
  412. print(self.res[place].text, '=======')
  413. conf = self.res[place].confidence
  414. df = cpca.transform([self.res[place].text])
  415. df = df.replace([None], [''])
  416. # print(df)
  417. province = df.iloc[0, 0]
  418. city = df.iloc[0, 1]
  419. region = df.iloc[0, 2]
  420. detail = df.iloc[0, 3]
  421. print(f'pronvince: {province}, city: {city}, region: {region}, detail: {detail}')
  422. self.res[place + "_province"] = RecItem(province, conf)
  423. self.res[place + "_city"] = RecItem(city, conf)
  424. if detail and "旗" in detail:
  425. temp_region = []
  426. temp_region.insert(0, detail.split("旗")[0] + "旗")
  427. self.res[place + "_region"] = RecItem(temp_region[0], conf)
  428. self.res[place + "_detail"] = RecItem(detail.split("旗")[-1], conf)
  429. # elif detail and "旗" in detail:
  430. else:
  431. self.res[place + "_region"] = RecItem(region, conf)
  432. self.res[place + "_detail"] = RecItem(detail, conf)
  433. self.res[place].text = province + city + region + detail
  434. def parse(self):
  435. self.full_name()
  436. self.ethnicity()
  437. self.card_no()
  438. # self.address()
  439. self.blood_type()
  440. self.religion()
  441. self.birthplace()
  442. self.native_place()
  443. # self.type_()
  444. # # todo
  445. # self.res['native_place'], self.res['birthplace'] = self.res['address'], self.res['address']
  446. # self.res['native_place_province'], self.res['birthplace_province'] = self.res['address_province'], self.res[
  447. # 'address_province']
  448. # self.res['native_place_city'], self.res['birthplace_city'] = self.res['address_city'], self.res['address_city']
  449. # self.res['native_place_region'], self.res['birthplace_region'] = self.res['address_region'], self.res[
  450. # 'address_region']
  451. return {k: self.res[k].to_dict() for k in self.keys}