parser.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. import re
  2. from collections import defaultdict
  3. from dataclasses import dataclass
  4. import random
  5. from typing import List
  6. import cpca
  7. import cv2
  8. import numpy as np
  9. import string
  10. from zhon.hanzi import punctuation
  11. import cn2an
  12. from core.line_parser import OcrResult
  13. from core.square_parser import parser_xy
  14. from stamp.d_stamp import send_request
  15. def fix_text(text):
  16. err_dict = {'伍任': '伍仟', '看翟永奇': '翟永奇', '马依伴中国玻璃网': '马依俤', '20144年': '2014年', '江苏永东方网络': '江苏隽永东方网络',
  17. '(': '(', ')': ')', '型型': '类型', '壹任': '壹仟', '查佰': '壹佰'}
  18. for k, v in err_dict.items():
  19. text = text.replace(k, v)
  20. return text
  21. def clear_punctuation(txt):
  22. t = txt[:3]
  23. for c in string.punctuation:
  24. t = t.replace(c, '')
  25. for c in punctuation:
  26. t = t.replace(c, '')
  27. txt = t + txt[3:]
  28. return txt
  29. @dataclass
  30. class RecItem:
  31. text: str = ''
  32. confidence: float = 0.
  33. def to_dict(self):
  34. return {"text": self.text.strip(), "confidence": np.nan_to_num(self.confidence)}
  35. class Parser(object):
  36. def __init__(self, ocr_results: List[List[OcrResult]], raw_results: List):
  37. self.result = ocr_results
  38. self.res = defaultdict(RecItem)
  39. self.raw_results = raw_results
  40. self.keys = ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
  41. "business_scope", 'expire_date', 'address', 'stamp']
  42. for key in self.keys:
  43. self.res[key] = RecItem()
  44. # ch_an_al = re.compile('[\u4e00-\u9fa5+\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]')
  45. for item in self.result:
  46. tail = ['', 1.0]
  47. for k in range(len(item)):
  48. tail[0] = tail[0] + item[k].txt
  49. tail[1] = tail[1] + item[k].conf
  50. tail[1] = (tail[1] - 1.0) / len(item)
  51. item.append(tail)
  52. for i in range(len(self.result)):
  53. res = self.result[i]
  54. txt = res[-1][0]
  55. if "登记机关" in txt:
  56. self.result = self.result[:i + 1]
  57. break
  58. raw_OR_list = [OcrResult(np.array(res_raw[0]), res_raw[1][0].replace(' ', ''), res_raw[1][1]) for res_raw in
  59. self.raw_results]
  60. self.raw_results = raw_OR_list
  61. def parse(self):
  62. return self.res
  63. class BusinessLicenseParser(Parser):
  64. def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
  65. Parser.__init__(self, ocr_results, raw_results)
  66. self.image = image
  67. def social_code(self):
  68. """
  69. 社会信用代码
  70. """
  71. # 得在"营业执照"以下
  72. result = []
  73. for i in range(len(self.result)):
  74. res = self.result[i]
  75. txt = res[-1][0]
  76. if "统一社" in txt or "会信用" in txt or "用代码" in txt:
  77. result = self.result[i:]
  78. break
  79. for i in range(len(result)):
  80. res = result[i]
  81. txt = res[-1][0]
  82. conf = res[-1][1]
  83. code = ''.join(re.findall(u'[\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]{15,18}', txt))
  84. if len(code):
  85. self.res['social_code'] = RecItem(code, conf)
  86. return
  87. def company_name(self):
  88. """
  89. 公司名称
  90. """
  91. for i in range(len(self.result)):
  92. res = self.result[i]
  93. txt = res[-1][0]
  94. conf = res[-1][1]
  95. if '称尔' in txt: txt = txt.replace('称尔', '称')
  96. if '名' in txt[:4] and '称' in txt[:4]:
  97. txt = '名称' + txt.split('称')[-1]
  98. if '名称' in txt:
  99. company_name = txt.split('名称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
  100. self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
  101. return
  102. if '称' in txt and txt[0] == '称' and len(txt) > 5:
  103. company_name = txt.split('称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
  104. self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
  105. return
  106. def legal_person(self):
  107. """
  108. 法人姓名
  109. """
  110. for i in range(len(self.result)):
  111. res = self.result[i]
  112. txt = res[-1][0].replace('市场监督', '')
  113. conf = res[-1][1]
  114. if '法定代表人' in txt or '代表人' in txt:
  115. legal_person = txt.split('代表人')[-1].split('营业')[0]
  116. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  117. return
  118. if '经营者' in txt:
  119. legal_person = txt.split('经营者')[-1].split('经营')[0]
  120. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  121. return
  122. if '负责人' in txt:
  123. legal_person = txt.split('负责人')[-1].split('责人')[0]
  124. self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
  125. return
  126. def registered_capital(self):
  127. """
  128. 注册资本
  129. """
  130. for i in range(len(self.result)):
  131. res = self.result[i]
  132. txt = res[-1][0]
  133. conf = res[-1][1]
  134. txt = fix_text(txt)
  135. if '注册资本' in txt:
  136. if '人民币' in txt[:4]:
  137. registered_capital = txt.split('人民币')[-1].split('万元')[0]
  138. txt = f'人民币{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
  139. elif '美元' in txt[:4]:
  140. registered_capital = txt.split('美元')[-1].split('万元')[0]
  141. txt = f'美元{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
  142. elif '人民币' in txt[-4:]:
  143. registered_capital = txt.split('资本')[-1].split('人民币')[0].split('万元')[0]
  144. txt = f'{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}人民币'
  145. else:
  146. registered_capital = txt.split('资本')[-1].split('币')[-1].split('万')[0]
  147. txt = self.cn_to_an(fix_text(clear_punctuation(registered_capital)))
  148. self.res['registered_capital'] = RecItem(txt, conf)
  149. return
  150. def type(self): # sourcery skip: hoist-similar-statement-from-if
  151. """
  152. 类型
  153. """
  154. for i in range(len(self.result)):
  155. res = self.result[i]
  156. txt = res[-1][0]
  157. conf = res[-1][1]
  158. txt = fix_text(clear_punctuation(txt))
  159. if '类型' in txt:
  160. txt = txt.split('类型')[-1].split('成立')[0].split('注册')[0]
  161. if '公司' in txt:
  162. t_s, s_e = txt.split('公司')[0], txt.split('公司')[-1].replace('(', '').replace(')', '').replace('(',
  163. '').replace(
  164. ')', '')
  165. # 分公司
  166. if '分公司' in txt:
  167. t_s = f'{t_s}公司分'
  168. txt = f'{t_s}公司({s_e})' if s_e else f'{t_s}公司'
  169. if txt[0] == '型': txt = txt[1:]
  170. self.res['type'] = RecItem(txt, conf)
  171. return
  172. def start_date(self):
  173. """
  174. 成立日期 ⚠️ 注册日期
  175. """
  176. for i in range(len(self.result)):
  177. res = self.result[i]
  178. txt = res[-1][0]
  179. conf = res[-1][1]
  180. txt = fix_text(txt)
  181. if '日期' in txt:
  182. txt = txt.split('日期')[-1]
  183. date = self.to_date(txt)
  184. self.res['start_date'] = RecItem(date, conf)
  185. def expire_date(self): # sourcery skip: hoist-similar-statement-from-if
  186. """
  187. 有效期
  188. """
  189. for i in range(len(self.result)):
  190. res = self.result[i]
  191. txt = res[-1][0]
  192. conf = res[-1][1]
  193. if '期限' in txt:
  194. if '至' in txt:
  195. txt = ''.join(txt.split('期限')[1:]).replace('*', '')
  196. date_from = txt.split('至')[0]
  197. date_to = txt.split('至')[-1]
  198. date = f'{self.to_date(date_from)} 至 {self.to_date(date_to)}'
  199. self.res['expire_date'] = RecItem(date, conf)
  200. return
  201. if '长期' in txt:
  202. self.res['expire_date'] = RecItem('长期', conf)
  203. return
  204. else:
  205. self.res['expire_date'] = RecItem('', conf)
  206. return
  207. def business_scope(self):
  208. """
  209. 经营范围
  210. """
  211. sb_or: OcrResult = parser_xy(self.result, self.raw_results, '经营范围')
  212. if bool(sb_or):
  213. self.res['business_scope'] = RecItem(sb_or.txt, sb_or.conf)
  214. else:
  215. self.res['business_scope'] = RecItem('经营范围', random.random())
  216. return
  217. def address(self): # sourcery skip: use-named-expression
  218. """
  219. 住所
  220. """
  221. add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
  222. if add_or_0:
  223. add_or = add_or_0
  224. else:
  225. add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
  226. if add_or_1:
  227. add_or = add_or_1
  228. else:
  229. return
  230. txt = add_or.txt
  231. if '所' in txt[:3] or '厂' in txt[:3]:
  232. txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
  233. self.res['address'] = RecItem(txt, add_or.conf)
  234. return
  235. def stamp(self):
  236. """
  237. 印章检测
  238. """
  239. self.res['stamp'] = RecItem(send_request(self.image), 1.)
  240. return
  241. @staticmethod
  242. def cn_to_an(num):
  243. try:
  244. num = int(num)
  245. except ValueError:
  246. num = str(cn2an.cn2an(f'{num}万'))[:-4]
  247. except Exception:
  248. raise Exception('注册资本转化出错')
  249. finally:
  250. return f'{num}万元'
  251. @staticmethod
  252. def to_date(txt):
  253. if '长期' in txt: return '长期'
  254. if '永久' in txt: return '永久'
  255. if '不约定' in txt: return '不约定期限'
  256. date_in = re.findall(r"\d+", txt)
  257. if len(date_in) == 3:
  258. return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2]}日'
  259. else:
  260. return ''
  261. # ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
  262. # "business_scope", 'expire_date', 'address', 'stamp']
  263. def parse(self):
  264. self.social_code()
  265. self.company_name()
  266. self.legal_person()
  267. self.registered_capital()
  268. self.type()
  269. self.start_date()
  270. self.expire_date()
  271. self.business_scope()
  272. self.address()
  273. self.stamp()
  274. return {key: self.res[key].to_dict() for key in self.keys}