parser.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. import re
  2. from collections import defaultdict
  3. from dataclasses import dataclass
  4. import random
  5. from typing import List
  6. import cpca
  7. import cv2
  8. import numpy as np
  9. import string
  10. from paddleocr import PaddleOCR
  11. from zhon.hanzi import punctuation
  12. import cn2an
  13. from blfe_core.business_parse import BussinessParse0, BussinessParse1
  14. from blfe_core.line_parser import OcrResult
  15. from blfe_core.square_parser import parser_xy
  16. from stamp.d_stamp import send_request
  17. def fix_text(text):
  18. err_dict = {'伍任': '伍仟','(': '(', ')': ')', '型型': '类型', '壹任': '壹仟', '查佰': '壹佰'}
  19. for k, v in err_dict.items():
  20. text = text.replace(k, v)
  21. return text
  22. def clear_punctuation(txt):
  23. t = txt[:3]
  24. for c in string.punctuation:
  25. t = t.replace(c, '')
  26. for c in punctuation:
  27. t = t.replace(c, '')
  28. txt = t + txt[3:]
  29. return txt
  30. @dataclass
  31. class RecItem:
  32. text: str = ''
  33. confidence: float = 0.
  34. def to_dict(self):
  35. return {"text": self.text.strip(), "confidence": np.nan_to_num(self.confidence)}
  36. class Parser(object):
  37. def __init__(self, ocr_results: List[List[OcrResult]], raw_results: List, ppocr):
  38. self.result = ocr_results
  39. self.res = defaultdict(RecItem)
  40. self.raw_results = raw_results
  41. self.ppocr = ppocr
  42. self.keys = ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
  43. "business_scope", 'expire_date', 'address', 'stamp']
  44. for key in self.keys:
  45. self.res[key] = RecItem()
  46. # ch_an_al = re.compile('[\u4e00-\u9fa5+\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]')
  47. for item in self.result:
  48. tail = ['', 1.0]
  49. for k in range(len(item)):
  50. tail[0] = tail[0] + item[k].txt
  51. tail[1] = tail[1] + item[k].conf
  52. tail[1] = (tail[1] - 1.0) / len(item)
  53. item.append(tail)
  54. for i in range(len(self.result)):
  55. res = self.result[i]
  56. txt = res[-1][0]
  57. if "登记机关" in txt:
  58. self.result = self.result[:i + 1]
  59. break
  60. raw_OR_list = [OcrResult(np.array(res_raw[0]), res_raw[1][0].replace(' ', ''), res_raw[1][1]) for res_raw in
  61. self.raw_results]
  62. self.raw_results = raw_OR_list
  63. def parse(self):
  64. return self.res
  65. class BusinessLicenseParser0(Parser):
  66. def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List, ppocr):
  67. Parser.__init__(self, ocr_results, raw_results, ppocr)
  68. self.image = image
  69. def social_code(self):
  70. """
  71. 社会信用代码
  72. """
  73. # 得在"营业执照"以下
  74. result = []
  75. for i in range(len(self.result)):
  76. res = self.result[i]
  77. txt = res[-1][0]
  78. if "统一社" in txt or "会信用" in txt or "用代码" in txt:
  79. result = self.result[i:]
  80. break
  81. for i in range(len(result)):
  82. res = result[i]
  83. txt = res[-1][0]
  84. conf = res[-1][1]
  85. code = ''.join(re.findall(u'[\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]{15,18}', txt))
  86. if len(code):
  87. self.res['social_code'] = RecItem(code, conf)
  88. return
  89. def company_name(self):
  90. """
  91. 公司名称
  92. """
  93. for i in range(len(self.result)):
  94. res = self.result[i]
  95. txt = res[-1][0]
  96. conf = res[-1][1]
  97. if '称尔' in txt: txt = txt.replace('称尔', '称')
  98. if '名' in txt[:4] and '称' in txt[:4]:
  99. txt = '名称' + txt.split('称')[-1]
  100. if '名称' in txt:
  101. company_name = txt.split('名称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
  102. self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
  103. return
  104. if '称' in txt and txt[0] == '称' and len(txt) > 5:
  105. company_name = txt.split('称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
  106. self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
  107. return
  108. def legal_person(self):
  109. """
  110. 法人姓名
  111. """
  112. for i in range(len(self.result)):
  113. res = self.result[i]
  114. txt = res[-1][0].replace('市场监督', '')
  115. conf = res[-1][1]
  116. if '法定代表人' in txt or '代表人' in txt:
  117. legal_person = txt.split('代表人')[-1].split('营业')[0]
  118. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  119. return
  120. if '经营者' in txt:
  121. legal_person = txt.split('经营者')[-1].split('经营')[0]
  122. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  123. return
  124. if '负责人' in txt:
  125. legal_person = txt.split('负责人')[-1].split('责人')[0]
  126. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  127. return
  128. def registered_capital(self):
  129. """
  130. 注册资本
  131. """
  132. for i in range(len(self.result)):
  133. res = self.result[i]
  134. txt = res[-1][0]
  135. conf = res[-1][1]
  136. txt = fix_text(txt)
  137. if '注册资本' in txt:
  138. if '人民币' in txt[:4]:
  139. registered_capital = txt.split('人民币')[-1].split('万元')[0]
  140. txt = f'人民币{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
  141. elif '美元' in txt[:4]:
  142. registered_capital = txt.split('美元')[-1].split('万元')[0]
  143. txt = f'美元{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
  144. elif '人民币' in txt[-4:]:
  145. registered_capital = txt.split('资本')[-1].split('人民币')[0].split('万元')[0]
  146. txt = f'{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}人民币'
  147. else:
  148. registered_capital = txt.split('资本')[-1].split('币')[-1].split('万')[0]
  149. txt = self.cn_to_an(fix_text(clear_punctuation(registered_capital)))
  150. self.res['registered_capital'] = RecItem(txt, conf)
  151. return
  152. def type(self): # sourcery skip: hoist-similar-statement-from-if
  153. """
  154. 类型
  155. """
  156. for i in range(len(self.result)):
  157. res = self.result[i]
  158. txt = res[-1][0]
  159. conf = res[-1][1]
  160. txt = fix_text(clear_punctuation(txt))
  161. if '类型' in txt:
  162. txt = txt.split('类型')[-1].split('成立')[0].split('注册')[0]
  163. if '公司' in txt:
  164. t_s, s_e = txt.split('公司')[0], txt.split('公司')[-1].replace('(', '').replace(')', '').replace('(',
  165. '').replace(
  166. ')', '')
  167. # 分公司
  168. if '分公司' in txt:
  169. t_s = f'{t_s}公司分'
  170. txt = f'{t_s}公司({s_e})' if s_e else f'{t_s}公司'
  171. if txt[0] == '型': txt = txt[1:]
  172. self.res['type'] = RecItem(txt, conf)
  173. return
  174. def start_date(self):
  175. """
  176. 成立日期 ⚠️ 注册日期
  177. """
  178. for i in range(len(self.result)):
  179. res = self.result[i]
  180. txt = res[-1][0]
  181. conf = res[-1][1]
  182. txt = fix_text(txt)
  183. if '日期' in txt:
  184. txt = txt.split('日期')[-1]
  185. date = self.to_date(txt)
  186. self.res['start_date'] = RecItem(date, conf)
  187. def expire_date(self): # sourcery skip: hoist-similar-statement-from-if
  188. """
  189. 有效期
  190. """
  191. for i in range(len(self.result)):
  192. res = self.result[i]
  193. txt = res[-1][0]
  194. conf = res[-1][1]
  195. if '期限' in txt:
  196. if '至' in txt:
  197. txt = ''.join(txt.split('期限')[1:]).replace('*', '')
  198. date_from = txt.split('至')[0]
  199. date_to = txt.split('至')[-1]
  200. date = f'{self.to_date(date_from)} 至 {self.to_date(date_to)}'
  201. self.res['expire_date'] = RecItem(date, conf)
  202. return
  203. if '长期' in txt:
  204. self.res['expire_date'] = RecItem('长期', conf)
  205. return
  206. else:
  207. self.res['expire_date'] = RecItem('', conf)
  208. return
  209. def business_scope(self):
  210. """
  211. 经营范围
  212. """
  213. ocr = PaddleOCR(use_gpu=True)
  214. bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(self.ocr).detection(self.image, self.raw_results)
  215. if bool(bs_txt):
  216. self.res['business_scope'] = RecItem(bs_txt, bs_conf)
  217. add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
  218. if add_or_0:
  219. add_or = add_or_0
  220. else:
  221. add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
  222. if add_or_1:
  223. add_or = add_or_1
  224. else:
  225. return
  226. txt = add_or.txt
  227. if '所' in txt[:3] or '厂' in txt[:3]:
  228. txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
  229. if len(ad_txt) >= len(txt):
  230. self.res['address'] = RecItem(ad_txt, ad_conf)
  231. else:
  232. self.res['address'] = RecItem(txt, add_or.conf)
  233. return
  234. def address(self): # sourcery skip: use-named-expression
  235. """
  236. 住所
  237. """
  238. add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
  239. if add_or_0:
  240. add_or = add_or_0
  241. else:
  242. add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
  243. if add_or_1:
  244. add_or = add_or_1
  245. else:
  246. return
  247. txt = add_or.txt
  248. if '所' in txt[:3] or '厂' in txt[:3]:
  249. txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
  250. self.res['address'] = RecItem(txt, add_or.conf)
  251. return
  252. def stamp(self):
  253. """
  254. 印章检测
  255. """
  256. self.res['stamp'] = RecItem(send_request(self.image), 1.)
  257. return
  258. @staticmethod
  259. def cn_to_an(num):
  260. try:
  261. num = int(num)
  262. except ValueError:
  263. num = str(cn2an.cn2an(f'{num}万'))[:-4]
  264. except Exception:
  265. raise Exception('注册资本转化出错')
  266. finally:
  267. return f'{num}万元'
  268. @staticmethod
  269. def to_date(txt):
  270. if '长期' in txt: return '长期'
  271. if '永久' in txt: return '永久'
  272. if '不约定' in txt: return '不约定期限'
  273. date_in = re.findall(r"\d+", txt)
  274. if len(date_in) == 3:
  275. return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2]}日'
  276. else:
  277. return ''
  278. # ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
  279. # "business_scope", 'expire_date', 'address', 'stamp']
  280. def parse(self):
  281. self.social_code()
  282. self.company_name()
  283. self.legal_person()
  284. self.registered_capital()
  285. self.type()
  286. self.start_date()
  287. self.expire_date()
  288. self.business_scope()
  289. # self.address()
  290. self.stamp()
  291. return {key: self.res[key].to_dict() for key in self.keys}
  292. class BusinessLicenseParser1(Parser):
  293. def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List, ppocr):
  294. Parser.__init__(self, ocr_results, raw_results, ppocr)
  295. self.image = image
  296. self.ocr = PaddleOCR(use_gpu=True)
  297. def social_code(self):
  298. """
  299. 社会信用代码
  300. """
  301. # 得在"营业执照"以下
  302. result = []
  303. for i in range(len(self.result)):
  304. res = self.result[i]
  305. txt = res[-1][0]
  306. if "统一社" in txt or "会信用" in txt or "用代码" in txt:
  307. result = self.result[i:]
  308. break
  309. for i in range(len(result)):
  310. res = result[i]
  311. txt = res[-1][0]
  312. conf = res[-1][1]
  313. code = ''.join(re.findall(u'[\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]{15,18}', txt))
  314. if len(code):
  315. self.res['social_code'] = RecItem(code, conf)
  316. return
  317. def company_name(self):
  318. """
  319. 公司名称
  320. """
  321. for i in range(len(self.result)):
  322. res = self.result[i]
  323. txt = res[-1][0]
  324. conf = res[-1][1]
  325. if '称尔' in txt: txt = txt.replace('称尔', '称')
  326. if '名' in txt[:4] and '称' in txt[:4]:
  327. txt = '名称' + txt.split('称')[-1]
  328. if '名称' in txt:
  329. company_name = txt.split('名称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
  330. self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
  331. return
  332. if '称' in txt and txt[0] == '称' and len(txt) > 5:
  333. company_name = txt.split('称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
  334. self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
  335. return
  336. def legal_person(self):
  337. """
  338. 法人姓名
  339. """
  340. for i in range(len(self.result)):
  341. res = self.result[i]
  342. txt = res[-1][0].replace('市场监督', '')
  343. conf = res[-1][1]
  344. if '法定代表人' in txt or '代表人' in txt:
  345. legal_person = txt.split('代表人')[-1].split('营业')[0]
  346. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  347. return
  348. if '经营者' in txt:
  349. legal_person = txt.split('经营者')[-1].split('经营')[0]
  350. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  351. return
  352. if '负责人' in txt:
  353. legal_person = txt.split('负责人')[-1].split('责人')[0]
  354. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  355. return
  356. def registered_capital(self):
  357. """
  358. 注册资本
  359. """
  360. for i in range(len(self.result)):
  361. res = self.result[i]
  362. txt = res[-1][0]
  363. conf = res[-1][1]
  364. txt = fix_text(txt)
  365. if '注册资本' in txt:
  366. if '人民币' in txt[:4]:
  367. registered_capital = txt.split('人民币')[-1].split('万元')[0]
  368. txt = f'人民币{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
  369. elif '美元' in txt[:4]:
  370. registered_capital = txt.split('美元')[-1].split('万元')[0]
  371. txt = f'美元{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
  372. elif '人民币' in txt[-4:]:
  373. registered_capital = txt.split('资本')[-1].split('人民币')[0].split('万元')[0]
  374. txt = f'{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}人民币'
  375. else:
  376. registered_capital = txt.split('资本')[-1].split('币')[-1].split('万')[0]
  377. txt = self.cn_to_an(fix_text(clear_punctuation(registered_capital)))
  378. self.res['registered_capital'] = RecItem(txt, conf)
  379. return
  380. def type(self): # sourcery skip: hoist-similar-statement-from-if
  381. """
  382. 类型
  383. """
  384. for i in range(len(self.result)):
  385. res = self.result[i]
  386. txt = res[-1][0]
  387. conf = res[-1][1]
  388. txt = fix_text(clear_punctuation(txt))
  389. if '类型' in txt:
  390. txt = txt.split('类型')[-1].split('成立')[0].split('注册')[0]
  391. if '公司' in txt:
  392. t_s, s_e = txt.split('公司')[0], txt.split('公司')[-1].replace('(', '').replace(')', '').replace('(',
  393. '').replace(
  394. ')', '')
  395. # 分公司
  396. if '分公司' in txt:
  397. t_s = f'{t_s}公司分'
  398. txt = f'{t_s}公司({s_e})' if s_e else f'{t_s}公司'
  399. if txt[0] == '型': txt = txt[1:]
  400. self.res['type'] = RecItem(txt, conf)
  401. return
  402. def start_date(self):
  403. """
  404. 成立日期 ⚠️ 注册日期
  405. """
  406. for i in range(len(self.result)):
  407. res = self.result[i]
  408. txt = res[-1][0]
  409. conf = res[-1][1]
  410. txt = fix_text(txt)
  411. if '日期' in txt:
  412. txt = txt.split('日期')[-1]
  413. date = self.to_date(txt)
  414. self.res['start_date'] = RecItem(date, conf)
  415. def expire_date(self): # sourcery skip: hoist-similar-statement-from-if
  416. """
  417. 有效期
  418. """
  419. for i in range(len(self.result)):
  420. res = self.result[i]
  421. txt = res[-1][0]
  422. conf = res[-1][1]
  423. if '期限' in txt:
  424. if '至' in txt:
  425. txt = ''.join(txt.split('期限')[1:]).replace('*', '')
  426. date_from = txt.split('至')[0]
  427. date_to = txt.split('至')[-1]
  428. date = f'{self.to_date(date_from)} 至 {self.to_date(date_to)}'
  429. self.res['expire_date'] = RecItem(date, conf)
  430. return
  431. if '长期' in txt:
  432. self.res['expire_date'] = RecItem('长期', conf)
  433. return
  434. else:
  435. self.res['expire_date'] = RecItem('', conf)
  436. return
  437. def business_scope(self):
  438. """
  439. 经营范围
  440. """
  441. print('-------------经营范围处理开始--------------')
  442. bs_txt, bs_conf = BussinessParse1(self.ocr).bs_detection(self.image, self.raw_results)
  443. if bool(bs_txt):
  444. self.res['business_scope'] = RecItem(bs_txt, bs_conf)
  445. # sb_or: OcrResult = parser_xy(self.result, self.raw_results, '经营范围')
  446. # if bool(sb_or):
  447. # self.res['business_scope'] = RecItem(sb_or.txt, sb_or.conf)
  448. # else:
  449. # self.res['business_scope'] = RecItem('经营范围', random.random())
  450. print('-------------经营范围处理结束--------------')
  451. return
  452. def address(self): # sourcery skip: use-named-expression
  453. """
  454. 住所
  455. """
  456. # 切割方案
  457. ad_txt, ad_conf = BussinessParse1(self.ocr).ad_detection(self.image, self.raw_results)
  458. # 关键字方案
  459. add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
  460. if add_or_0:
  461. add_or = add_or_0
  462. else:
  463. add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
  464. if add_or_1:
  465. add_or = add_or_1
  466. else:
  467. return
  468. txt = add_or.txt
  469. if '所' in txt[:3] or '厂' in txt[:3]:
  470. txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
  471. if len(ad_txt) >= len(txt):
  472. self.res['address'] = RecItem(ad_txt, ad_conf)
  473. else:
  474. self.res['address'] = RecItem(txt, add_or.conf)
  475. return
  476. def stamp(self):
  477. """
  478. 印章检测
  479. """
  480. self.res['stamp'] = RecItem(send_request(self.image), 1.)
  481. return
  482. @staticmethod
  483. def cn_to_an(num):
  484. try:
  485. num = int(num)
  486. except ValueError:
  487. num = str(cn2an.cn2an(f'{num}万'))[:-4]
  488. except Exception:
  489. raise Exception('注册资本转化出错')
  490. finally:
  491. return f'{num}万元'
  492. @staticmethod
  493. def to_date(txt):
  494. if '长期' in txt: return '长期'
  495. if '永久' in txt: return '永久'
  496. if '不约定' in txt: return '不约定期限'
  497. date_in = re.findall(r"\d+", txt)
  498. if len(date_in) == 3:
  499. return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2]}日'
  500. else:
  501. return ''
  502. # ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
  503. # "business_scope", 'expire_date', 'address', 'stamp']
  504. def parse(self):
  505. self.social_code()
  506. self.company_name()
  507. self.legal_person()
  508. self.registered_capital()
  509. self.type()
  510. self.start_date()
  511. self.expire_date()
  512. self.business_scope()
  513. self.address()
  514. self.stamp()
  515. return {key: self.res[key].to_dict() for key in self.keys}