# business_parse.py
  1. import re
  2. from dataclasses import dataclass
  3. import cv2
  4. from paddleocr import PaddleOCR
  5. @dataclass
  6. class BussinessParse0(object):
  7. """
  8. 经营范围
  9. """
  10. ocr: PaddleOCR
  11. def detection(self, image, raw_results):
  12. h, w, _ = image.shape
  13. left_list = []
  14. right_list = []
  15. for i in raw_results:
  16. if bool(re.match('法定代表', i.txt)) or bool(re.match('经营者', i.txt)):
  17. [x0, _] = i.lt
  18. [_, y1] = i.rb
  19. left_list.append([x0, y1])
  20. elif bool(re.match('名', i.txt)) or bool(re.match('称', i.txt)):
  21. [x0, _] = i.lt
  22. [_, y1] = i.rb
  23. left_list.append([x0, y1])
  24. elif bool(re.match('类', i.txt)) or bool(re.match('型', i.txt)):
  25. [x0, _] = i.lt
  26. [_, y1] = i.rb
  27. left_list.append([x0, y1])
  28. elif bool(re.match('注册', i.txt)):
  29. [x0, _] = i.lt
  30. [_, y1] = i.rb
  31. right_list.append([x0, y1])
  32. elif bool(re.search('日期', i.txt)):
  33. [x0, _] = i.lt
  34. [_, y1] = i.rb
  35. right_list.append([x0, y1])
  36. elif bool(re.match('营业期限', i.txt)):
  37. [x0, _] = i.lt
  38. [_, y1] = i.rb
  39. right_list.append([x0, y1])
  40. t1 = sorted(left_list, key=lambda x: x[1], reverse=True)[0][1]
  41. t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
  42. l1 = sorted(left_list, key=lambda x: x[0])[0][0]
  43. r1 = sorted(right_list, key=lambda x: x[0])[0][0]
  44. left_img = image[int(t1):h, int(l1):int(r1)]
  45. right_img = image[int(t2):h, int(r1):w]
  46. left_result = self.ocr.ocr(left_img)
  47. right_result = self.ocr.ocr(right_img)
  48. left_conf_list = []
  49. right_conf_list = []
  50. left_conf = 0.0
  51. right_conf = 0.0
  52. left_txt = ''
  53. right_txt = ''
  54. for idx, res in enumerate(left_result):
  55. if len(left_result) - 1 != idx and bool(re.match('经营范围', res[1][0])):
  56. t = res[0][0][1]
  57. d = res[0][2][1]
  58. if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < (abs(d - t) * 1.8):
  59. left_txt += left_result[idx - 1][1][0]
  60. left_txt += res[1][0]
  61. left_conf_list.append(res[1][1])
  62. left_position = left_result[idx + 1][0][0][0]
  63. left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
  64. for x in left_result[idx + 1:]:
  65. if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
  66. left_down_position = (x[0][2][1] + x[0][3][1]) // 2
  67. left_txt += x[1][0][1:] if left_txt[-1] == x[1][0][0] else x[1][0]
  68. left_conf_list.append(x[1][1])
  69. left_txt = left_txt.replace('经营范围', '')
  70. if len(left_conf_list):
  71. left_conf = sum(left_conf_list) / len(left_conf_list)
  72. for idx, res in enumerate(right_result):
  73. if len(right_result) - 1 != idx:
  74. if bool(re.match('所', res[1][0])):
  75. right_txt = ''
  76. t = res[0][0][1]
  77. d = res[0][2][1]
  78. if len(res[1][0]) == 1:
  79. right_position = right_result[idx + 1][0][0][0]
  80. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  81. else:
  82. right_txt += res[1][0]
  83. right_conf_list.append(res[1][1])
  84. right_position = right_result[idx][0][0][0]
  85. right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
  86. for x in right_result[idx + 1:]:
  87. if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
  88. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  89. right_txt += x[1][0]
  90. right_conf_list.append(x[1][1])
  91. elif bool(re.match('住', res[1][0])):
  92. right_txt = ''
  93. t = res[0][0][1]
  94. d = res[0][2][1]
  95. if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
  96. standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
  97. right_position = right_result[idx + 1][0][0][0]
  98. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  99. else:
  100. standard = abs(res[0][1][0] - res[0][0][0]) // 5
  101. right_txt += res[1][0]
  102. right_conf_list.append(res[1][1])
  103. right_position = res[0][0][0]
  104. right_down_position = (res[0][2][1] + res[0][3][1]) // 2
  105. for x in right_result[idx + 1:]:
  106. if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
  107. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  108. right_txt += x[1][0]
  109. right_conf_list.append(x[1][1])
  110. elif bool(re.match('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
  111. right_txt = ''
  112. t = res[0][0][1]
  113. d = res[0][2][1]
  114. if len(res[1][0]) == 4:
  115. standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
  116. right_position = right_result[idx + 1][0][0][0]
  117. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  118. else:
  119. standard = abs(res[0][1][0] - res[0][0][0]) // 2
  120. right_txt += res[1][0]
  121. right_conf_list.append(res[1][1])
  122. right_position = res[0][0][0]
  123. right_down_position = (res[0][2][1] + res[0][3][1]) // 2
  124. for x in right_result[idx + 1:]:
  125. if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
  126. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  127. right_txt += x[1][0]
  128. right_conf_list.append(x[1][1])
  129. right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
  130. right_txt = re.sub('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
  131. if bool(re.match('所', right_txt)) or bool(re.match('住', right_txt)):
  132. right_txt = right_txt.replace('所', '')
  133. right_txt = right_txt.replace('住', '')
  134. if len(right_conf_list):
  135. right_conf = sum(right_conf_list) / len(right_conf_list)
  136. return left_txt, left_conf, right_txt, right_conf
  137. @dataclass
  138. class BussinessParse1(object):
  139. """
  140. 经营范围
  141. """
  142. ocr: PaddleOCR
  143. def bs_detection(self, image, raw_results):
  144. h, w, _ = image.shape
  145. down_list = []
  146. down_list2 = []
  147. raw_txt = ''
  148. down_txt = ''
  149. raw_conf_list = []
  150. down_conf_list = []
  151. down_conf = 0.0
  152. simple_key = False
  153. for i in raw_results:
  154. if bool(re.search('日期', i.txt)):
  155. [x0, _] = i.lt
  156. [_, y1] = i.rb
  157. down_list.append([x0, y1])
  158. elif bool(re.match('营业期限', i.txt)):
  159. simple_key = True
  160. [x0, _] = i.lt
  161. [_, y1] = i.rb
  162. down_list.append([x0, y1])
  163. elif bool(re.match('登记', i.txt)):
  164. [_, y0] = i.lt
  165. down_list2.append(y0)
  166. elif bool(re.match('经营范围', i.txt)):
  167. [x0, y0] = i.lt
  168. [x1, _] = i.rb
  169. for j in raw_results:
  170. [x, _] = j.lt
  171. [_, y] = j.rb
  172. if abs(x - x1) <= abs(x1 - x0) and y >= y0 and '登记' not in j.txt:
  173. raw_txt += j.txt
  174. raw_conf_list.append(j.conf)
  175. if len(down_list) and len(down_list2):
  176. t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
  177. l2 = sorted(down_list, key=lambda x: x[0])[0][0]
  178. d2 = int(down_list2[0]) if len(down_list2) else h
  179. down_img = image[int(t2):d2, int(l2):w]
  180. h1, w1, _ = down_img.shape
  181. down_result = self.ocr.ocr(down_img)
  182. for res in down_result:
  183. if simple_key:
  184. l1 = res[0][0][0]
  185. if l1 < 7 * w1 // 24:
  186. down_txt += res[1][0]
  187. down_conf_list.append(res[1][1])
  188. elif bool(re.match('经营范围', res[1][0])):
  189. t = res[0][0][1]
  190. for i in down_result:
  191. if i[0][2][1] < t and i[0][0][0] < 7 * w1 // 24:
  192. down_txt += res[1][0]
  193. down_conf_list.append(res[1][1])
  194. down_txt = down_txt.replace('经营范围', '')
  195. raw_txt = raw_txt.replace('经营范围', '')
  196. if len(down_conf_list):
  197. down_conf = sum(down_conf_list) / len(down_conf_list)
  198. if len(raw_txt) > len(down_txt):
  199. down_txt = raw_txt
  200. down_conf = sum(raw_conf_list) / len(raw_conf_list)
  201. return down_txt, down_conf
  202. def ad_detection(self, image, raw_results):
  203. h, w, _ = image.shape
  204. top_list1 = []
  205. top_list2 = []
  206. top_conf_list = []
  207. top_conf = 0.0
  208. top_txt = ''
  209. last_key = ''
  210. type_key = False
  211. for i in raw_results:
  212. if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
  213. [_, y0] = i.lt
  214. top_list2.append(y0)
  215. elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
  216. [x0, _] = i.lt
  217. [_, y1] = i.rb
  218. top_list1.append([x0, y1])
  219. elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
  220. type_key = True
  221. [x0, _] = i.lt
  222. [_, y1] = i.rb
  223. top_list1.append([x0, y1])
  224. elif bool(re.match(r'注册资本', i.txt)):
  225. [_, y0] = i.lt
  226. top_list2.append(y0)
  227. elif bool(re.search(r'日期', i.txt)):
  228. [_, y0] = i.lt
  229. top_list2.append(y0)
  230. elif bool(re.match(r'营业期限', i.txt)):
  231. [_, y0] = i.lt
  232. top_list2.append(y0)
  233. t1 = sorted(top_list1, key=lambda x: x[1], reverse=True)[0][1] * 0.99
  234. l1 = sorted(top_list1, key=lambda x: x[0])[0][0]
  235. d1 = sorted(top_list2)[0]
  236. top_img = image[int(t1): int(d1), int(l1): w]
  237. top_result = self.ocr.ocr(top_img)
  238. # 住所信息
  239. for idx, res in enumerate(top_result):
  240. # print(res)
  241. if bool(re.match(r'所', res[1][0])):
  242. top_txt = ''
  243. t = res[0][0][1]
  244. d = res[0][2][1]
  245. if len(last_key):
  246. top_txt += last_key
  247. print('top_txt', top_txt)
  248. if len(res[1][0]) == 1 and len(top_result) - 1 != idx:
  249. top_position = top_result[idx + 1][0][0][0]
  250. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  251. else:
  252. top_txt += res[1][0]
  253. top_conf_list.append(res[1][1])
  254. top_position = top_result[idx][0][0][0]
  255. top_down_position = (top_result[idx][0][2][1] + top_result[idx][0][3][1]) // 2
  256. if len(top_result) - 1 != idx:
  257. for x in top_result[idx + 1:]:
  258. if abs(x[0][0][0] - top_position) < 250 and abs(top_down_position - x[0][0][1]) <= abs(
  259. d - t) * 1.2:
  260. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  261. top_txt += x[1][0]
  262. top_conf_list.append(x[1][1])
  263. # print('top_txt', top_txt)
  264. elif bool(re.match(r'住', res[1][0])):
  265. top_txt = ''
  266. t = res[0][0][1]
  267. d = res[0][2][1]
  268. if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  269. d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
  270. top_result[idx - 1][1][0] and idx != 0:
  271. last_key = top_result[idx - 1][1][0]
  272. if (len(res[1][0]) <= 2 or len(res[1][0]) == 4) and len(top_result) - 1 != idx:
  273. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  274. standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0])
  275. top_position = top_result[idx + 1][0][0][0]
  276. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  277. else:
  278. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  279. standard = abs(res[0][1][0] - res[0][0][0]) // 5
  280. # 长文本直接添加至结果输出
  281. top_txt += res[1][0]
  282. top_conf_list.append(res[1][1])
  283. top_position = res[0][0][0]
  284. top_down_position = (res[0][2][1] + res[0][3][1]) // 2
  285. if len(top_result) - 1 != idx:
  286. for x in top_result[idx + 1:]:
  287. if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
  288. d - t) * 1.2:
  289. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  290. top_txt += x[1][0]
  291. top_conf_list.append(x[1][1])
  292. # print(top_txt)
  293. elif bool(re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', res[1][0])):
  294. top_txt = ''
  295. t = res[0][0][1]
  296. d = res[0][2][1]
  297. if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  298. d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
  299. top_result[idx - 1][1][0] and idx != 0:
  300. top_txt += top_result[idx - 1][1][0]
  301. if len(res[1][0]) == 4 and len(top_result) - 1 != idx:
  302. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  303. standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0]) * 2
  304. top_position = top_result[idx + 1][0][0][0]
  305. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  306. else:
  307. # 此情况为长文本,则采用框的左右坐标的1/2为标准
  308. standard = abs(res[0][1][0] - res[0][0][0]) // 2
  309. # 长文本直接添加至结果输出
  310. top_txt += res[1][0]
  311. top_conf_list.append(res[1][1])
  312. top_position = res[0][0][0]
  313. top_down_position = (res[0][2][1] + res[0][3][1]) // 2
  314. if len(top_result) - 1 != idx:
  315. for x in top_result[idx + 1:]:
  316. if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
  317. d - t) * 1.2:
  318. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  319. top_txt += x[1][0]
  320. top_conf_list.append(x[1][1])
  321. top_conf_list.append(x[1][1])
  322. if len(top_txt) == 0 and type_key:
  323. for res in top_result:
  324. top_txt += res[1][0]
  325. top_txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', top_txt)
  326. if bool(re.match(r'所', top_txt)) or bool(re.match(r'住', top_txt)):
  327. top_txt = top_txt.replace('所', '')
  328. top_txt = top_txt.replace('住', '')
  329. if len(top_conf_list):
  330. top_conf = sum(top_conf_list) / len(top_conf_list)
  331. # cv2.imshow('11', top_img)
  332. # cv2.waitKey(0)
  333. return top_txt, top_conf