business_parse.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. import re
  2. from dataclasses import dataclass
  3. import cv2
  4. from paddleocr import PaddleOCR
  5. @dataclass
  6. class BussinessParse0(object):
  7. """
  8. 经营范围
  9. """
  10. ocr: PaddleOCR
  11. def detection(self, image, raw_results):
  12. h, w, _ = image.shape
  13. left_list = []
  14. right_list = []
  15. for i in raw_results:
  16. if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
  17. [x0, _] = i.lt
  18. [_, y1] = i.rb
  19. left_list.append([x0, y1])
  20. elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
  21. [x0, _] = i.lt
  22. [_, y1] = i.rb
  23. left_list.append([x0, y1])
  24. elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
  25. [x0, _] = i.lt
  26. [_, y1] = i.rb
  27. left_list.append([x0, y1])
  28. elif bool(re.match(r'注册', i.txt)):
  29. [x0, _] = i.lt
  30. [_, y1] = i.rb
  31. right_list.append([x0, y1])
  32. elif bool(re.search(r'日期', i.txt)):
  33. [x0, _] = i.lt
  34. [_, y1] = i.rb
  35. right_list.append([x0, y1])
  36. elif bool(re.match(r'营业期限', i.txt)):
  37. [x0, _] = i.lt
  38. [_, y1] = i.rb
  39. right_list.append([x0, y1])
  40. t1 = sorted(left_list, key=lambda x: x[1], reverse=True)[0][1]
  41. t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
  42. l1 = sorted(left_list, key=lambda x: x[0])[0][0]
  43. r1 = sorted(right_list, key=lambda x: x[0])[0][0]
  44. left_img = image[int(t1): h, int(l1): int(r1)]
  45. right_img = image[int(t2): h, int(r1): w]
  46. left_result = self.ocr.ocr(left_img)
  47. right_result = self.ocr.ocr(right_img)
  48. left_conf_list = []
  49. right_conf_list = []
  50. left_conf = 0.0
  51. right_conf = 0.0
  52. left_txt = ''
  53. right_txt = ''
  54. for idx, res in enumerate(left_result):
  55. if len(left_result) - 1 != idx:
  56. if bool(re.match(r'经营范围', res[1][0])):
  57. t = res[0][0][1]
  58. d = res[0][2][1]
  59. # 判断上一条信息是否为经营范围内容
  60. if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
  61. d - t) * 1.8:
  62. left_txt += left_result[idx - 1][1][0]
  63. left_txt += res[1][0]
  64. left_conf_list.append(res[1][1])
  65. left_position = left_result[idx + 1][0][0][0]
  66. left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
  67. for x in left_result[idx + 1:]:
  68. if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
  69. left_down_position = (x[0][2][1] + x[0][3][1]) // 2
  70. if left_txt[-1] == x[1][0][0]:
  71. left_txt += x[1][0][1:]
  72. else:
  73. left_txt += x[1][0]
  74. left_conf_list.append(x[1][1])
  75. # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
  76. left_txt = left_txt.replace('经营范围', '')
  77. if len(left_conf_list):
  78. left_conf = sum(left_conf_list) / len(left_conf_list)
  79. # 住所信息
  80. for idx, res in enumerate(right_result):
  81. if len(right_result) - 1 != idx:
  82. if bool(re.match(r'所', res[1][0])):
  83. right_txt = ''
  84. t = res[0][0][1]
  85. d = res[0][2][1]
  86. if len(res[1][0]) == 1:
  87. right_position = right_result[idx + 1][0][0][0]
  88. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  89. else:
  90. right_txt += res[1][0]
  91. right_conf_list.append(res[1][1])
  92. right_position = right_result[idx][0][0][0]
  93. right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
  94. for x in right_result[idx + 1:]:
  95. if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
  96. d - t) * 1.2 and '登记机关' not in x[1][0]:
  97. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  98. right_txt += x[1][0]
  99. right_conf_list.append(x[1][1])
  100. elif bool(re.match(r'住', res[1][0])):
  101. right_txt = ''
  102. t = res[0][0][1]
  103. d = res[0][2][1]
  104. if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
  105. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  106. standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
  107. right_position = right_result[idx + 1][0][0][0]
  108. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  109. else:
  110. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  111. standard = abs(res[0][1][0] - res[0][0][0]) // 5
  112. # 长文本直接添加至结果输出
  113. right_txt += res[1][0]
  114. right_conf_list.append(res[1][1])
  115. right_position = res[0][0][0]
  116. right_down_position = (res[0][2][1] + res[0][3][1]) // 2
  117. for x in right_result[idx + 1:]:
  118. if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
  119. d - t) * 1.2 and '登记机关' not in x[1][0]:
  120. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  121. right_txt += x[1][0]
  122. right_conf_list.append(x[1][1])
  123. elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
  124. right_txt = ''
  125. t = res[0][0][1]
  126. d = res[0][2][1]
  127. if len(res[1][0]) == 4:
  128. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  129. standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
  130. right_position = right_result[idx + 1][0][0][0]
  131. right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
  132. else:
  133. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  134. standard = abs(res[0][1][0] - res[0][0][0]) // 2
  135. # 长文本直接添加至结果输出
  136. right_txt += res[1][0]
  137. right_conf_list.append(res[1][1])
  138. right_position = res[0][0][0]
  139. right_down_position = (res[0][2][1] + res[0][3][1]) // 2
  140. for x in right_result[idx + 1:]:
  141. if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
  142. d - t) * 1.2 and '登记机关' not in x[1][0]:
  143. right_down_position = (x[0][2][1] + x[0][3][1]) // 2
  144. right_txt += x[1][0]
  145. right_conf_list.append(x[1][1])
  146. right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
  147. right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
  148. if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
  149. right_txt = right_txt.replace('所', '')
  150. right_txt = right_txt.replace('住', '')
  151. if len(right_conf_list):
  152. right_conf = sum(right_conf_list) / len(right_conf_list)
  153. return left_txt, left_conf, right_txt, right_conf
  154. @dataclass
  155. class BussinessParse1(object):
  156. """
  157. 经营范围
  158. """
  159. ocr: PaddleOCR
  160. def bs_detection(self, image, raw_results):
  161. h, w, _ = image.shape
  162. down_list = []
  163. down_list2 = []
  164. raw_txt = ''
  165. down_txt = ''
  166. raw_conf_list = []
  167. down_conf_list = []
  168. down_conf = 0.0
  169. simple_key = False
  170. for i in raw_results:
  171. if bool(re.search(r'日期', i.txt)):
  172. [x0, _] = i.lt
  173. [_, y1] = i.rb
  174. down_list.append([x0, y1])
  175. elif bool(re.match(r'营业期限', i.txt)):
  176. simple_key = True
  177. [x0, _] = i.lt
  178. [_, y1] = i.rb
  179. down_list.append([x0, y1])
  180. elif bool(re.match(r'登记', i.txt)):
  181. [_, y0] = i.lt
  182. down_list2.append(y0)
  183. elif bool(re.match(r'经营范围', i.txt)):
  184. [x0, y0] = i.lt
  185. [x1, _] = i.rb
  186. # 第一方案:
  187. for j in raw_results:
  188. [x, _] = j.lt
  189. [_, y] = j.rb
  190. if abs(x - x1) <= abs(x1 - x0) and y >= y0 and '登记' not in j.txt:
  191. raw_txt += j.txt
  192. raw_conf_list.append(j.conf)
  193. if len(down_list) and len(down_list2):
  194. t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
  195. l2 = sorted(down_list, key=lambda x: x[0])[0][0]
  196. d2 = int(down_list2[0]) if len(down_list2) else h
  197. down_img = image[int(t2): d2, int(l2): w]
  198. h1, w1, _ = down_img.shape
  199. down_result = self.ocr.ocr(down_img)
  200. # print('simple_key', simple_key)
  201. # 第二方案(检索到‘营业期限’关键词)
  202. if simple_key:
  203. # print('111')
  204. for res in down_result:
  205. l1 = res[0][0][0]
  206. if l1 < (7 * w1 // 24):
  207. down_txt += res[1][0]
  208. down_conf_list.append(res[1][1])
  209. # print(down_txt)
  210. # 第三套方案
  211. else:
  212. for idx, res in enumerate(down_result):
  213. if bool(re.match(r'经营范围', res[1][0])):
  214. t = res[0][0][1]
  215. for i in down_result:
  216. if i[0][2][1] < t and i[0][0][0] < (7 * w1 // 24):
  217. down_txt += res[1][0]
  218. down_conf_list.append(res[1][1])
  219. down_txt = down_txt.replace('经营范围', '')
  220. raw_txt = raw_txt.replace('经营范围', '')
  221. if len(down_conf_list):
  222. down_conf = sum(down_conf_list) / len(down_conf_list)
  223. if len(raw_txt) > len(down_txt):
  224. down_txt = raw_txt
  225. down_conf = sum(raw_conf_list) / len(raw_conf_list)
  226. # cv2.imshow('11', down_img)
  227. # cv2.waitKey(0)
  228. return down_txt, down_conf
  229. def ad_detection(self, image, raw_results):
  230. h, w, _ = image.shape
  231. top_list1 = []
  232. top_list2 = []
  233. top_conf_list = []
  234. top_conf = 0.0
  235. top_txt = ''
  236. last_key = ''
  237. type_key = False
  238. for i in raw_results:
  239. if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
  240. [_, y0] = i.lt
  241. top_list2.append(y0)
  242. elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
  243. [x0, _] = i.lt
  244. [_, y1] = i.rb
  245. top_list1.append([x0, y1])
  246. elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
  247. type_key = True
  248. [x0, _] = i.lt
  249. [_, y1] = i.rb
  250. top_list1.append([x0, y1])
  251. elif bool(re.match(r'注册资本', i.txt)):
  252. [_, y0] = i.lt
  253. top_list2.append(y0)
  254. elif bool(re.search(r'日期', i.txt)):
  255. [_, y0] = i.lt
  256. top_list2.append(y0)
  257. elif bool(re.match(r'营业期限', i.txt)):
  258. [_, y0] = i.lt
  259. top_list2.append(y0)
  260. t1 = sorted(top_list1, key=lambda x: x[1], reverse=True)[0][1] * 0.99
  261. l1 = sorted(top_list1, key=lambda x: x[0])[0][0]
  262. d1 = sorted(top_list2)[0]
  263. top_img = image[int(t1): int(d1), int(l1): w]
  264. top_result = self.ocr.ocr(top_img)
  265. # 住所信息
  266. for idx, res in enumerate(top_result):
  267. # print(res)
  268. if bool(re.match(r'所', res[1][0])):
  269. top_txt = ''
  270. t = res[0][0][1]
  271. d = res[0][2][1]
  272. if len(last_key):
  273. top_txt += last_key
  274. print('top_txt', top_txt)
  275. if len(res[1][0]) == 1 and len(top_result) - 1 != idx:
  276. top_position = top_result[idx + 1][0][0][0]
  277. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  278. else:
  279. top_txt += res[1][0]
  280. top_conf_list.append(res[1][1])
  281. top_position = top_result[idx][0][0][0]
  282. top_down_position = (top_result[idx][0][2][1] + top_result[idx][0][3][1]) // 2
  283. if len(top_result) - 1 != idx:
  284. for x in top_result[idx + 1:]:
  285. if abs(x[0][0][0] - top_position) < 250 and abs(top_down_position - x[0][0][1]) <= abs(
  286. d - t) * 1.2:
  287. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  288. top_txt += x[1][0]
  289. top_conf_list.append(x[1][1])
  290. # print('top_txt', top_txt)
  291. elif bool(re.match(r'住', res[1][0])):
  292. top_txt = ''
  293. t = res[0][0][1]
  294. d = res[0][2][1]
  295. if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  296. d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
  297. top_result[idx - 1][1][0] and idx != 0:
  298. last_key = top_result[idx - 1][1][0]
  299. if (len(res[1][0]) <= 2 or len(res[1][0]) == 4) and len(top_result) - 1 != idx:
  300. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  301. standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0])
  302. top_position = top_result[idx + 1][0][0][0]
  303. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  304. else:
  305. # 此情况为长文本,则采用框的左右坐标的1/5为标准
  306. standard = abs(res[0][1][0] - res[0][0][0]) // 5
  307. # 长文本直接添加至结果输出
  308. top_txt += res[1][0]
  309. top_conf_list.append(res[1][1])
  310. top_position = res[0][0][0]
  311. top_down_position = (res[0][2][1] + res[0][3][1]) // 2
  312. if len(top_result) - 1 != idx:
  313. for x in top_result[idx + 1:]:
  314. if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
  315. d - t) * 1.2:
  316. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  317. top_txt += x[1][0]
  318. top_conf_list.append(x[1][1])
  319. # print(top_txt)
  320. elif bool(re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', res[1][0])):
  321. top_txt = ''
  322. t = res[0][0][1]
  323. d = res[0][2][1]
  324. if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
  325. d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
  326. top_result[idx - 1][1][0] and idx != 0:
  327. top_txt += top_result[idx - 1][1][0]
  328. if len(res[1][0]) == 4 and len(top_result) - 1 != idx:
  329. # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
  330. standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0]) * 2
  331. top_position = top_result[idx + 1][0][0][0]
  332. top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
  333. else:
  334. # 此情况为长文本,则采用框的左右坐标的1/2为标准
  335. standard = abs(res[0][1][0] - res[0][0][0]) // 2
  336. # 长文本直接添加至结果输出
  337. top_txt += res[1][0]
  338. top_conf_list.append(res[1][1])
  339. top_position = res[0][0][0]
  340. top_down_position = (res[0][2][1] + res[0][3][1]) // 2
  341. if len(top_result) - 1 != idx:
  342. for x in top_result[idx + 1:]:
  343. if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
  344. d - t) * 1.2:
  345. top_down_position = (x[0][2][1] + x[0][3][1]) // 2
  346. top_txt += x[1][0]
  347. top_conf_list.append(x[1][1])
  348. top_conf_list.append(x[1][1])
  349. if len(top_txt) == 0 and type_key:
  350. for res in top_result:
  351. top_txt += res[1][0]
  352. top_txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', top_txt)
  353. if bool(re.match(r'所', top_txt)) or bool(re.match(r'住', top_txt)):
  354. top_txt = top_txt.replace('所', '')
  355. top_txt = top_txt.replace('住', '')
  356. if len(top_conf_list):
  357. top_conf = sum(top_conf_list) / len(top_conf_list)
  358. # cv2.imshow('11', top_img)
  359. # cv2.waitKey(0)
  360. return top_txt, top_conf