|
@@ -0,0 +1,372 @@
|
|
|
+import re
|
|
|
+from dataclasses import dataclass
|
|
|
+import cv2
|
|
|
+from paddleocr import PaddleOCR
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class BussinessParse0(object):
    """Extract the business scope and the address from a licence image.

    The image is split into a left and a right crop located below the
    header keys found in ``raw_results``.  The left crop is searched for the
    ``经营范围`` (business scope) label, the right crop for the ``住所`` /
    ``经营场所`` (address) label, and the text lines that follow each label
    are stitched together.

    Attributes:
        ocr: OCR engine whose ``ocr(image)`` call yields a flat list of
            ``[box, (text, score)]`` entries, ``box`` being four ``[x, y]``
            corners ordered top-left, top-right, bottom-right, bottom-left.
            NOTE(review): newer PaddleOCR releases nest results per image;
            this class assumes the flat format - confirm against the
            installed version.
    """
    ocr: PaddleOCR

    def detection(self, image, raw_results):
        """Return ``(scope_text, scope_conf, address_text, address_conf)``.

        Args:
            image: ``H x W x C`` array supporting 2-D slicing and ``.size``.
            raw_results: iterable of detections exposing ``txt`` (text),
                ``lt`` (``[x, y]`` of the top-left corner) and ``rb``
                (``[x, y]`` of the bottom-right corner).

        Returns:
            A 4-tuple of text and mean confidence for the business scope and
            for the address.  Empty strings with ``0.0`` confidence are
            returned when the header keys needed to locate the crops are
            missing (previously this raised ``IndexError``).
        """
        h, w, _ = image.shape
        left_anchors = []
        right_anchors = []
        for item in raw_results:
            txt = item.txt
            # Keys of the left column (legal representative / operator /
            # name / type) are tested first, exactly like the original chain.
            if re.match(r'法定代表|经营者|[名称类型]', txt):
                left_anchors.append((item.lt[0], item.rb[1]))
            # Keys of the right column (registered ... / ... date / term).
            elif re.match(r'注册|营业期限', txt) or re.search(r'日期', txt):
                right_anchors.append((item.lt[0], item.rb[1]))

        if not left_anchors or not right_anchors:
            # Without anchors there is no way to locate the crops.
            return '', 0.0, '', 0.0

        # Each crop starts below the lowest key of its column; the leftmost
        # right-column key marks the boundary between the two columns.
        left_top = int(max(y for _x, y in left_anchors))
        right_top = int(max(y for _x, y in right_anchors))
        left_edge = int(min(x for x, _y in left_anchors))
        split = int(min(x for x, _y in right_anchors))

        left_lines = self._read(image[left_top:h, left_edge:split])
        right_lines = self._read(image[right_top:h, split:w])

        scope_txt, scope_conf = self._parse_scope(left_lines)
        addr_txt, addr_conf = self._parse_address(right_lines)
        return scope_txt, scope_conf, addr_txt, addr_conf

    def _read(self, crop):
        """Run OCR on ``crop``; an empty crop or a ``None`` result gives ``[]``."""
        if crop.size == 0:
            return []
        return self.ocr.ocr(crop) or []

    def _bottom_mid(self, box):
        """Return the floored mean ``y`` of the two bottom corners of ``box``."""
        return (box[2][1] + box[3][1]) // 2

    def _mean(self, scores):
        """Return the arithmetic mean of ``scores`` or ``0.0`` when empty."""
        return sum(scores) / len(scores) if scores else 0.0

    def _parse_scope(self, lines):
        """Assemble the business-scope text from the OCR lines of the left crop."""
        txt = ''
        scores = []
        last = len(lines) - 1
        for idx, res in enumerate(lines):
            # A label on the very last line has nothing following it.
            if idx == last or not re.match(r'经营范围', res[1][0]):
                continue
            top = res[0][0][1]
            height = abs(res[0][2][1] - top)

            # A long line right above the label is treated as the start of
            # the scope text.  Only a real predecessor counts: index -1 must
            # not wrap around to the last line.
            if idx > 0:
                prev = lines[idx - 1]
                if len(prev[1][0]) > 15 and abs(prev[0][2][1] - top) < height * 1.8:
                    txt += prev[1][0]

            txt += res[1][0]
            scores.append(res[1][1])

            nxt = lines[idx + 1][0]
            col_x = nxt[0][0]
            cursor = self._bottom_mid(nxt)
            for entry in lines[idx + 1:]:
                box, text, score = entry[0], entry[1][0], entry[1][1]
                # Keep lines in the same column that start within one label
                # height of the previously accepted line.
                if abs(box[0][0] - col_x) < 130 and abs(cursor - box[0][1]) < height:
                    cursor = self._bottom_mid(box)
                    # Drop a first character that repeats the current tail.
                    if text and txt[-1] == text[0]:
                        text = text[1:]
                    txt += text
                    scores.append(score)

        txt = txt.replace('经营范围', '')
        return txt, self._mean(scores)

    def _follow(self, lines, start, anchor, x_tol, y_tol):
        """Collect the address lines after index ``start`` aligned with ``anchor``.

        Returns the concatenated text and the list of matching scores.
        Lines containing ``登记机关`` (registration authority) are skipped.
        """
        col_x = anchor[0][0]
        cursor = self._bottom_mid(anchor)
        txt = ''
        scores = []
        for entry in lines[start + 1:]:
            box, text, score = entry[0], entry[1][0], entry[1][1]
            if (abs(box[0][0] - col_x) < x_tol
                    and abs(cursor - box[0][1]) < y_tol
                    and '登记机关' not in text):
                cursor = self._bottom_mid(box)
                txt += text
                scores.append(score)
        return txt, scores

    def _parse_address(self, lines):
        """Assemble the address text from the OCR lines of the right crop."""
        txt = ''
        scores = []
        last = len(lines) - 1
        for idx, res in enumerate(lines):
            if idx == last:
                continue
            box, label, score = res[0], res[1][0], res[1][1]
            width = abs(box[1][0] - box[0][0])
            gap = abs(box[1][0] - lines[idx + 1][0][0][0])

            if re.match(r'所', label):
                standalone = len(label) == 1
                x_tol = 250
            elif re.match(r'住', label):
                # A stand-alone label box uses the gap to the next box as the
                # column tolerance; label plus text uses 1/5 of the box width.
                standalone = len(label) <= 2 or len(label) == 4
                x_tol = gap if standalone else width // 5
            elif re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', label):
                # Same idea with a looser tolerance: twice the gap, or half
                # of the box width.
                standalone = len(label) == 4
                x_tol = gap * 2 if standalone else width // 2
            else:
                continue

            # A new label restarts the address; the scores restart with it so
            # the confidence describes the returned text only.
            txt = ''
            scores = []
            if standalone:
                anchor = lines[idx + 1][0]
            else:
                # Long text is added to the output directly.
                txt += label
                scores.append(score)
                anchor = box

            y_tol = abs(box[2][1] - box[0][1]) * 1.2
            more_txt, more_scores = self._follow(lines, idx, anchor, x_tol, y_tol)
            txt += more_txt
            scores.extend(more_scores)

        txt = txt.replace('广北京市朝阳区', '北京市朝阳区')
        txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', txt)
        # Strip only the leading label; 住/所 inside the address are kept.
        txt = re.sub(r'^住?所?', '', txt)
        return txt, self._mean(scores)
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class BussinessParse1(object):
    """Extract the business scope and the address from a licence image.

    ``bs_detection`` reads the business scope (``经营范围``) from a crop below
    the capital / date / term keys; ``ad_detection`` reads the address
    (``住所`` / ``营业场所``) from a crop between the name / type keys and
    the keys that follow them.

    Attributes:
        ocr: OCR engine whose ``ocr(image)`` call yields a flat list of
            ``[box, (text, score)]`` entries, ``box`` being four ``[x, y]``
            corners ordered top-left, top-right, bottom-right, bottom-left.
            NOTE(review): newer PaddleOCR releases nest results per image;
            this class assumes the flat format - confirm against the
            installed version.
    """
    ocr: PaddleOCR

    def bs_detection(self, image, raw_results):
        """Return ``(scope_text, scope_conf)`` for the business scope.

        Args:
            image: ``H x W x C`` array supporting 2-D slicing and ``.size``.
            raw_results: iterable of detections exposing ``txt``, ``lt``
                (``[x, y]`` top-left) and ``rb`` (``[x, y]`` bottom-right).

        Returns:
            The scope text and the mean confidence of the lines used;
            ``('', 0.0)`` when no anchor key is found (previously this
            raised ``IndexError``).
        """
        h, w, _ = image.shape
        anchors = []
        registry_tops = []
        for item in raw_results:
            txt = item.txt
            if re.match(r'注册资本|营业期限', txt) or re.search(r'日期', txt):
                anchors.append((item.lt[0], item.rb[1]))
            elif re.match(r'登记', txt):
                registry_tops.append(item.lt[1])

        if not anchors:
            return '', 0.0

        top = int(max(y for _x, y in anchors))
        left = int(min(x for x, _y in anchors))
        # The crop ends at the first "登记..." key when present, otherwise at
        # the bottom of the image.
        bottom = int(registry_tops[0]) if registry_tops else h
        lines = self._read(image[top:bottom, left:w])

        txt = ''
        scores = []
        last = len(lines) - 1
        for idx, res in enumerate(lines):
            # A label on the very last line has nothing following it.
            if idx == last or not re.match(r'经营范围', res[1][0]):
                continue
            label_top = res[0][0][1]
            y_tol = abs(res[0][2][1] - label_top) * 1.8

            # A long line right above the label is treated as the start of
            # the scope text.  Only a real predecessor counts: index -1 must
            # not wrap around to the last line.
            if idx > 0:
                prev = lines[idx - 1]
                if len(prev[1][0]) > 15 and abs(prev[0][2][1] - label_top) <= y_tol:
                    txt += prev[1][0]

            txt += res[1][0]
            scores.append(res[1][1])

            nxt = lines[idx + 1][0]
            col_x = nxt[0][0]
            cursor = self._bottom_mid(nxt)
            for entry in lines[idx + 1:]:
                box, text, score = entry[0], entry[1][0], entry[1][1]
                if abs(box[0][0] - col_x) < 130 and abs(cursor - box[0][1]) <= y_tol:
                    cursor = self._bottom_mid(box)
                    # Drop a first character that repeats the current tail.
                    if text and txt[-1] == text[0]:
                        text = text[1:]
                    txt += text
                    scores.append(score)

        txt = txt.replace('经营范围', '')
        return txt, self._mean(scores)

    def ad_detection(self, image, raw_results):
        """Return ``(address_text, address_conf)`` for the address.

        Args:
            image: ``H x W x C`` array supporting 2-D slicing and ``.size``.
            raw_results: iterable of detections exposing ``txt``, ``lt``
                (``[x, y]`` top-left) and ``rb`` (``[x, y]`` bottom-right).

        Returns:
            The address text and the mean confidence of the lines used.
            When no address label is found but a type key was seen, all OCR
            lines of the crop are concatenated and the confidence stays
            ``0.0``.  ``('', 0.0)`` is returned when the keys delimiting the
            crop are missing (previously this raised ``IndexError``).
        """
        h, w, _ = image.shape
        upper = []
        lower_tops = []
        type_key = False
        for item in raw_results:
            txt = item.txt
            if re.match(r'法定代表|负责人', txt):
                lower_tops.append(item.lt[1])
            elif re.match(r'[名称]', txt):
                upper.append((item.lt[0], item.rb[1]))
            elif re.match(r'[类型]', txt):
                type_key = True
                upper.append((item.lt[0], item.rb[1]))
            elif re.match(r'注册资本|营业期限', txt) or re.search(r'日期', txt):
                lower_tops.append(item.lt[1])

        if not upper or not lower_tops:
            return '', 0.0

        # Start slightly above the bottom of the lowest name/type key and
        # stop at the highest of the keys that follow.
        top = int(max(y for _x, y in upper) * 0.99)
        left = int(min(x for x, _y in upper))
        bottom = int(min(lower_tops))
        lines = self._read(image[top:bottom, left:w])

        txt = ''
        scores = []
        last_key = ''
        last = len(lines) - 1
        for idx, res in enumerate(lines):
            box, label, score = res[0], res[1][0], res[1][1]
            has_next = idx != last
            width = abs(box[1][0] - box[0][0])

            if re.match(r'所', label):
                # Re-use the wrapped first line remembered by a "住" label.
                prefix = last_key
                standalone = len(label) == 1 and has_next
                x_tol = 250
            elif re.match(r'住', label):
                carried = self._wrapped_prefix(lines, idx)
                if carried:
                    last_key = carried
                prefix = ''
                # A stand-alone label box uses the gap to the next box as the
                # column tolerance; label plus text uses 1/5 of the box width.
                standalone = (len(label) <= 2 or len(label) == 4) and has_next
                x_tol = self._gap(lines, idx) if standalone else width // 5
            elif re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', label):
                prefix = self._wrapped_prefix(lines, idx)
                # Looser tolerance: twice the gap, or half of the box width.
                standalone = len(label) == 4 and has_next
                x_tol = self._gap(lines, idx) * 2 if standalone else width // 2
            else:
                continue

            # A new label restarts the address; the scores restart with it so
            # the confidence describes the returned text only.
            txt = prefix
            scores = []
            if standalone:
                anchor = lines[idx + 1][0]
            else:
                # Long text is added to the output directly.
                txt += label
                scores.append(score)
                anchor = box

            if has_next:
                col_x = anchor[0][0]
                cursor = self._bottom_mid(anchor)
                y_tol = abs(box[2][1] - box[0][1]) * 1.2
                for entry in lines[idx + 1:]:
                    e_box, e_text, e_score = entry[0], entry[1][0], entry[1][1]
                    if abs(e_box[0][0] - col_x) < x_tol and abs(cursor - e_box[0][1]) <= y_tol:
                        cursor = self._bottom_mid(e_box)
                        txt += e_text
                        # Each accepted line contributes exactly one score.
                        scores.append(e_score)

        if not txt and type_key:
            # Fallback: no label recognised, use everything in the crop.
            txt = ''.join(entry[1][0] for entry in lines)

        txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', txt)
        # Strip only the leading label; 住/所 inside the address are kept.
        txt = re.sub(r'^住?所?', '', txt)
        return txt, self._mean(scores)

    def _read(self, crop):
        """Run OCR on ``crop``; an empty crop or a ``None`` result gives ``[]``."""
        if crop.size == 0:
            return []
        return self.ocr.ocr(crop) or []

    def _bottom_mid(self, box):
        """Return the floored mean ``y`` of the two bottom corners of ``box``."""
        return (box[2][1] + box[3][1]) // 2

    def _mean(self, scores):
        """Return the arithmetic mean of ``scores`` or ``0.0`` when empty."""
        return sum(scores) / len(scores) if scores else 0.0

    def _gap(self, lines, idx):
        """Return the horizontal gap between line ``idx`` and the next line."""
        return abs(lines[idx][0][1][0] - lines[idx + 1][0][0][0])

    def _wrapped_prefix(self, lines, idx):
        """Return the previous line's text when it looks like wrapped address text.

        The previous line qualifies when it is longer than 15 characters,
        ends within 1.8 label heights of the label top and contains neither
        ``有限`` nor ``型``.  The first line has no predecessor.
        """
        if idx == 0:
            return ''
        box = lines[idx][0]
        prev_box, prev_text = lines[idx - 1][0], lines[idx - 1][1][0]
        label_top = box[0][1]
        y_tol = abs(box[2][1] - label_top) * 1.8
        if (len(prev_text) > 15
                and abs(prev_box[2][1] - label_top) <= y_tol
                and '有限' not in prev_text
                and '型' not in prev_text):
            return prev_text
        return ''
|