# business_parse.py
import re
from dataclasses import dataclass
from typing import Any, Iterable, List, Optional, Sequence, Tuple

import cv2
from paddleocr import PaddleOCR
  5. @dataclass
  6. class BussinessParse0(object):
  7. """
  8. 经营范围
  9. """
  10. ocr: PaddleOCR
  11. def detection(self, image, raw_results):
  12. h, w, _ = image.shape
  13. left_list = []
  14. right_list = []
  15. for i in raw_results:
  16. if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
  17. [x0, _] = i.lt
  18. [_, y1] = i.rb
  19. left_list.append([x0, y1])
  20. elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
  21. [x0, _] = i.lt
  22. [_, y1] = i.rb
  23. left_list.append([x0, y1])
  24. elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
  25. [x0, _] = i.lt
  26. [_, y1] = i.rb
  27. left_list.append([x0, y1])
  28. elif bool(re.match(r'注册', i.txt)):
  29. [x0, _] = i.lt
  30. [_, y1] = i.rb
  31. right_list.append([x0, y1])
  32. elif bool(re.search(r'日期', i.txt)):
  33. [x0, _] = i.lt
  34. [_, y1] = i.rb
  35. right_list.append([x0, y1])
  36. elif bool(re.match(r'营业期限', i.txt)):
  37. [x0, _] = i.lt
  38. [_, y1] = i.rb
  39. right_list.append([x0, y1])
  40. t1 = sorted(left_list, key=lambda x: x[1], reverse=True)[0][1]
  41. t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
  42. l1 = sorted(left_list, key=lambda x: x[0])[0][0]
  43. r1 = sorted(right_list, key=lambda x: x[0])[0][0]
  44. left_img = image[int(t1): h, int(l1): int(r1)]
  45. right_img = image[int(t2): h, int(r1): w]
  46. left_result = self.ocr.ocr(left_img)
  47. right_result = self.ocr.ocr(right_img)
  48. left_conf_list = []
  49. right_conf_list = []
  50. left_conf = 0.0
  51. right_conf = 0.0
  52. left_txt = ''
  53. right_txt = ''
  54. for idx, res in enumerate(left_result):
  55. if len(left_result) - 1 != idx:
  56. if bool(re.match(r'经营范围', res[1][0])):
  57. t = res[0][0][1]
  58. d = res[0][2][1]
  59. # 判断上一条信息是否为经营范围内容
  60. if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
  61. d - t) * 1.8:
  62. left_txt += left_result[idx - 1][1][0]
  63. left_txt += res[1][0]
  64. left_conf_list.append(res[1][1])
  65. left_position = left_result[idx + 1][0][0][0]
  66. left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
  67. for x in left_result[idx + 1:]:
  68. if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
  69. left_down_position = (x[0][2][1] + x[0][3][1]) // 2
  70. if left_txt[-1] == x[1][0][0]:
  71. left_txt += x[1][0][1:]
  72. else:
  73. left_txt += x[1][0]
  74. left_conf_list.append(x[1][1])
  75. # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
  76. left_txt = left_txt.replace('经营范围', '')
  77. if len(left_conf_list):
  78. left_conf = sum(left_conf_list) / len(left_conf_list)
  79. # 住所信息
  80. for idx, res in enumerate(right_result):
  81. if len(right_result) - 1 != idx:
  82. if bool(re.match(r'所', res[1][0])):
  83. right_txt = ''
  84. t = res[0][0][1]
  85. d = res[0][2][1]
  86. if len(res[1][0]) == 1:
  87. right_position = right_result[idx + 1][0][0][0]
  88. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  89. else:
  90. right_txt += res[1][0]
  91. right_conf_list.append(res[1][1])
  92. right_position = right_result[idx][0][0][0]
  93. right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
  94. for x in right_result[idx + 1:]:
  95. if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
  96. d - t) * 1.2 and '登记机关' not in x[1][0]:
  97. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  98. right_txt += x[1][0]
  99. right_conf_list.append(x[1][1])
  100. elif bool(re.match(r'住', res[1][0])):
  101. right_txt = ''
  102. t = res[0][0][1]
  103. d = res[0][2][1]
  104. if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
  105. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  106. standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
  107. right_position = right_result[idx + 1][0][0][0]
  108. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  109. else:
  110. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  111. standard = abs(res[0][1][0] - res[0][0][0]) // 5
  112. # 长文本直接添加至结果输出
  113. right_txt += res[1][0]
  114. right_conf_list.append(res[1][1])
  115. right_position = res[0][0][0]
  116. right_down_position = (res[0][2][1] + res[0][3][1]) // 2
  117. for x in right_result[idx + 1:]:
  118. if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
  119. d - t) * 1.2 and '登记机关' not in x[1][0]:
  120. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  121. right_txt += x[1][0]
  122. right_conf_list.append(x[1][1])
  123. elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
  124. right_txt = ''
  125. t = res[0][0][1]
  126. d = res[0][2][1]
  127. if len(res[1][0]) == 4:
  128. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  129. standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
  130. right_position = right_result[idx + 1][0][0][0]
  131. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  132. else:
  133. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  134. standard = abs(res[0][1][0] - res[0][0][0]) // 2
  135. # 长文本直接添加至结果输出
  136. right_txt += res[1][0]
  137. right_conf_list.append(res[1][1])
  138. right_position = res[0][0][0]
  139. right_down_position = (res[0][2][1] + res[0][3][1]) // 2
  140. for x in right_result[idx + 1:]:
  141. if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
  142. d - t) * 1.2 and '登记机关' not in x[1][0]:
  143. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  144. right_txt += x[1][0]
  145. right_conf_list.append(x[1][1])
  146. right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
  147. right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
  148. if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
  149. right_txt = right_txt.replace('所', '')
  150. right_txt = right_txt.replace('住', '')
  151. if len(right_conf_list):
  152. right_conf = sum(right_conf_list) / len(right_conf_list)
  153. return left_txt, left_conf, right_txt, right_conf
  154. @dataclass
  155. class BussinessParse1(object):
  156. """
  157. 经营范围
  158. """
  159. ocr: PaddleOCR
  160. def bs_detection(self, image, raw_results):
  161. h, w, _ = image.shape
  162. down_list = []
  163. down_list2 = []
  164. for i in raw_results:
  165. if bool(re.match(r'注册资本', i.txt)):
  166. [x0, _] = i.lt
  167. [_, y1] = i.rb
  168. down_list.append([x0, y1])
  169. elif bool(re.search(r'日期', i.txt)):
  170. [x0, _] = i.lt
  171. [_, y1] = i.rb
  172. down_list.append([x0, y1])
  173. elif bool(re.match(r'营业期限', i.txt)):
  174. [x0, _] = i.lt
  175. [_, y1] = i.rb
  176. down_list.append([x0, y1])
  177. elif bool(re.match(r'登记', i.txt)):
  178. [_, y0] = i.lt
  179. down_list2.append(y0)
  180. t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
  181. l2 = sorted(down_list, key=lambda x: x[0])[0][0]
  182. d2 = int(down_list2[0]) if len(down_list2) else h
  183. down_img = image[int(t2): d2, int(l2): w]
  184. down_result = self.ocr.ocr(down_img)
  185. down_conf_list = []
  186. down_conf = 0.0
  187. down_txt = ''
  188. for idx, res in enumerate(down_result):
  189. # print(res)
  190. if len(down_result) - 1 != idx:
  191. if bool(re.match(r'经营范围', res[1][0])):
  192. t = res[0][0][1]
  193. d = res[0][2][1]
  194. if len(down_result[idx - 1][1][0]) > 15 and abs(
  195. down_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  196. d - t) * 1.8:
  197. down_txt += down_result[idx - 1][1][0]
  198. down_txt += res[1][0]
  199. down_conf_list.append(res[1][1])
  200. down_position = down_result[idx + 1][0][0][0]
  201. down_down_position = (down_result[idx + 1][0][2][1] + down_result[idx + 1][0][3][1]) // 2
  202. for x in down_result[idx + 1:]:
  203. print(abs(down_down_position - x[0][0][1]))
  204. print(abs(d - t) * 1.2)
  205. if abs(x[0][0][0] - down_position) < 130 and abs(down_down_position - x[0][0][1]) <= abs(
  206. d - t) * 1.8:
  207. down_down_position = (x[0][2][1] + x[0][3][1]) // 2
  208. if down_txt[-1] == x[1][0][0]:
  209. down_txt += x[1][0][1:]
  210. else:
  211. down_txt += x[1][0]
  212. down_conf_list.append(x[1][1])
  213. # print(down_txt)
  214. down_txt = down_txt.replace('经营范围', '')
  215. if len(down_conf_list):
  216. down_conf = sum(down_conf_list) / len(down_conf_list)
  217. # cv2.imshow('11', down_img)
  218. # cv2.waitKey(0)
  219. return down_txt, down_conf
  220. def ad_detection(self, image, raw_results):
  221. h, w, _ = image.shape
  222. top_list1 = []
  223. top_list2 = []
  224. type_key = False
  225. for i in raw_results:
  226. if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
  227. [_, y0] = i.lt
  228. top_list2.append(y0)
  229. elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
  230. [x0, _] = i.lt
  231. [_, y1] = i.rb
  232. top_list1.append([x0, y1])
  233. elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
  234. type_key = True
  235. [x0, _] = i.lt
  236. [_, y1] = i.rb
  237. top_list1.append([x0, y1])
  238. elif bool(re.match(r'注册资本', i.txt)):
  239. [_, y0] = i.lt
  240. top_list2.append(y0)
  241. elif bool(re.search(r'日期', i.txt)):
  242. [_, y0] = i.lt
  243. top_list2.append(y0)
  244. elif bool(re.match(r'营业期限', i.txt)):
  245. [_, y0] = i.lt
  246. top_list2.append(y0)
  247. t1 = sorted(top_list1, key=lambda x: x[1], reverse=True)[0][1] * 0.99
  248. l1 = sorted(top_list1, key=lambda x: x[0])[0][0]
  249. d1 = sorted(top_list2)[0]
  250. top_img = image[int(t1): int(d1), int(l1): w]
  251. top_result = self.ocr.ocr(top_img)
  252. top_conf_list = []
  253. top_conf = 0.0
  254. top_txt = ''
  255. last_key = ''
  256. # 住所信息
  257. for idx, res in enumerate(top_result):
  258. # print(res)
  259. if bool(re.match(r'所', res[1][0])):
  260. top_txt = ''
  261. t = res[0][0][1]
  262. d = res[0][2][1]
  263. if len(last_key):
  264. top_txt += last_key
  265. print('top_txt', top_txt)
  266. if len(res[1][0]) == 1 and len(top_result) - 1 != idx:
  267. top_position = top_result[idx + 1][0][0][0]
  268. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  269. else:
  270. top_txt += res[1][0]
  271. top_conf_list.append(res[1][1])
  272. top_position = top_result[idx][0][0][0]
  273. top_down_position = (top_result[idx][0][2][1] + top_result[idx][0][3][1]) // 2
  274. if len(top_result) - 1 != idx:
  275. for x in top_result[idx + 1:]:
  276. if abs(x[0][0][0] - top_position) < 250 and abs(top_down_position - x[0][0][1]) <= abs(
  277. d - t) * 1.2:
  278. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  279. top_txt += x[1][0]
  280. top_conf_list.append(x[1][1])
  281. # print('top_txt', top_txt)
  282. elif bool(re.match(r'住', res[1][0])):
  283. top_txt = ''
  284. t = res[0][0][1]
  285. d = res[0][2][1]
  286. if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  287. d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
  288. top_result[idx - 1][1][0] and idx != 0:
  289. last_key = top_result[idx - 1][1][0]
  290. if (len(res[1][0]) <= 2 or len(res[1][0]) == 4) and len(top_result) - 1 != idx:
  291. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  292. standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0])
  293. top_position = top_result[idx + 1][0][0][0]
  294. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  295. else:
  296. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  297. standard = abs(res[0][1][0] - res[0][0][0]) // 5
  298. # 长文本直接添加至结果输出
  299. top_txt += res[1][0]
  300. top_conf_list.append(res[1][1])
  301. top_position = res[0][0][0]
  302. top_down_position = (res[0][2][1] + res[0][3][1]) // 2
  303. if len(top_result) - 1 != idx:
  304. for x in top_result[idx + 1:]:
  305. if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
  306. d - t) * 1.2:
  307. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  308. top_txt += x[1][0]
  309. top_conf_list.append(x[1][1])
  310. # print(top_txt)
  311. elif bool(re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', res[1][0])):
  312. top_txt = ''
  313. t = res[0][0][1]
  314. d = res[0][2][1]
  315. if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  316. d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
  317. top_result[idx - 1][1][0] and idx != 0:
  318. top_txt += top_result[idx - 1][1][0]
  319. if len(res[1][0]) == 4 and len(top_result) - 1 != idx:
  320. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  321. standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0]) * 2
  322. top_position = top_result[idx + 1][0][0][0]
  323. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  324. else:
  325. # 此情况为长文本,则采用框的左右坐标的1/2为标准
  326. standard = abs(res[0][1][0] - res[0][0][0]) // 2
  327. # 长文本直接添加至结果输出
  328. top_txt += res[1][0]
  329. top_conf_list.append(res[1][1])
  330. top_position = res[0][0][0]
  331. top_down_position = (res[0][2][1] + res[0][3][1]) // 2
  332. if len(top_result) - 1 != idx:
  333. for x in top_result[idx + 1:]:
  334. if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
  335. d - t) * 1.2:
  336. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  337. top_txt += x[1][0]
  338. top_conf_list.append(x[1][1])
  339. top_conf_list.append(x[1][1])
  340. if len(top_txt) == 0 and type_key:
  341. for res in top_result:
  342. top_txt += res[1][0]
  343. top_txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', top_txt)
  344. if bool(re.match(r'所', top_txt)) or bool(re.match(r'住', top_txt)):
  345. top_txt = top_txt.replace('所', '')
  346. top_txt = top_txt.replace('住', '')
  347. if len(top_conf_list):
  348. top_conf = sum(top_conf_list) / len(top_conf_list)
  349. # cv2.imshow('11', top_img)
  350. # cv2.waitKey(0)
  351. return top_txt, top_conf