123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
import re
from dataclasses import dataclass

import cv2
from paddleocr import PaddleOCR
@dataclass
class BussinessParse0(object):
    """Read the business scope and the address from a two-column licence.

    Field labels found in the page-level OCR lines are used to crop a left
    and a right region below the header rows; both crops are re-read with
    ``ocr`` and parsed separately.

    Attributes:
        ocr: Engine used to re-read the crops. Its ``ocr(image)`` result is
            indexed as a flat list of ``[box, (text, score)]`` lines, with
            ``box`` being four ``[x, y]`` corners starting at the top-left.
            NOTE(review): newer PaddleOCR releases nest results per image;
            confirm against the installed version.
    """

    ocr: PaddleOCR

    # A page-level line starting with one of these anchors the left column.
    _LEFT_LABELS = ('法定代表', '经营者', '名', '称', '类', '型')
    # "经营场所" label, allowing up to three whitespace characters per gap.
    _PLACE_LABEL = re.compile(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所')

    @staticmethod
    def _bottom_mid(box):
        """Return the floored midpoint of the two bottom-corner y values."""
        return (box[2][1] + box[3][1]) // 2

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when empty."""
        return sum(values) / len(values) if values else 0.0

    def _read_region(self, region):
        """Run OCR on ``region``; return ``[]`` for an empty crop or result."""
        if region.size == 0:
            # Anchors can cross (e.g. left edge >= split column); there is
            # nothing to recognise in a zero-area crop.
            return []
        return self.ocr.ocr(region) or []

    def _parse_scope(self, lines):
        """Return ``(text, confidences)`` of the business scope in ``lines``."""
        text = ''
        confs = []
        # The last line is skipped: a label needs at least one line after it.
        for idx, line in enumerate(lines[:-1]):
            label = line[1][0]
            if not label.startswith('经营范围'):
                continue
            top = line[0][0][1]
            line_h = abs(line[0][2][1] - top)
            # A long line sitting just above the label is taken as the first
            # row of the scope text. Only a real predecessor qualifies; index
            # -1 would wrap around to the last line.
            if idx > 0:
                prev = lines[idx - 1]
                if (len(prev[1][0]) > 15
                        and abs(prev[0][2][1] - top) < line_h * 1.8):
                    text += prev[1][0]
            text += label
            confs.append(line[1][1])
            first = lines[idx + 1][0]
            anchor_x = first[0][0]
            anchor_y = self._bottom_mid(first)
            for follower in lines[idx + 1:]:
                fbox, ftext = follower[0], follower[1][0]
                if (abs(fbox[0][0] - anchor_x) < 130
                        and abs(anchor_y - fbox[0][1]) < line_h):
                    anchor_y = self._bottom_mid(fbox)
                    # Drop a first character that repeats the last one
                    # already collected; an empty string is appended as-is.
                    if ftext and text[-1] == ftext[0]:
                        ftext = ftext[1:]
                    text += ftext
                    confs.append(follower[1][1])
        return text.replace('经营范围', ''), confs

    def _parse_address(self, lines):
        """Return ``(text, confidences)`` of the address found in ``lines``."""
        text = ''
        confs = []
        for idx, line in enumerate(lines[:-1]):
            box, label = line[0], line[1][0]
            next_box = lines[idx + 1][0]
            # ``standalone``: the box holds only the label, so the address
            # starts in the next box. ``reach`` is the allowed horizontal
            # offset of a continuation line from the anchor.
            if label.startswith('所'):
                standalone = len(label) == 1
                reach = 250
            elif label.startswith('住'):
                standalone = len(label) <= 2 or len(label) == 4
                if standalone:
                    reach = abs(box[1][0] - next_box[0][0])
                else:
                    reach = abs(box[1][0] - box[0][0]) // 5
            elif self._PLACE_LABEL.match(label):
                standalone = len(label) == 4
                if standalone:
                    reach = abs(box[1][0] - next_box[0][0]) * 2
                else:
                    reach = abs(box[1][0] - box[0][0]) // 2
            else:
                continue

            # A later label restarts the text.
            # NOTE(review): confidences collected for an earlier label are
            # kept, as in the original implementation.
            text = ''
            if standalone:
                anchor = next_box
            else:
                anchor = box
                text = label
                confs.append(line[1][1])
            anchor_x = anchor[0][0]
            anchor_y = self._bottom_mid(anchor)
            max_dy = abs(box[2][1] - box[0][1]) * 1.2
            for follower in lines[idx + 1:]:
                fbox, ftext = follower[0], follower[1][0]
                if (abs(fbox[0][0] - anchor_x) < reach
                        and abs(anchor_y - fbox[0][1]) < max_dy
                        and '登记机关' not in ftext):
                    anchor_y = self._bottom_mid(fbox)
                    text += ftext
                    confs.append(follower[1][1])

        text = text.replace('广北京市朝阳区', '北京市朝阳区')
        text = self._PLACE_LABEL.sub('', text)
        # NOTE(review): this removes every "所"/"住" in the text, not only a
        # leading label. Kept as-is because a label split over two rows can
        # leave "所" in the middle of the collected text.
        if text.startswith(('所', '住')):
            text = text.replace('所', '').replace('住', '')
        return text, confs

    def detection(self, image, raw_results):
        """Extract the business scope and the address from ``image``.

        Args:
            image: Array with a three-element ``shape`` (height, width,
                channels) that supports 2-D slicing.
            raw_results: Page-level OCR lines; each item exposes ``txt``,
                ``lt`` (left-top ``[x, y]``) and ``rb`` (right-bottom
                ``[x, y]``).

        Returns:
            ``(scope_text, scope_conf, address_text, address_conf)``. Texts
            are ``''`` and confidences ``0.0`` when nothing is found,
            including when either column has no anchor label.
        """
        height, width, _ = image.shape

        left_anchors = []
        right_anchors = []
        for item in raw_results:
            txt = item.txt
            if txt.startswith(self._LEFT_LABELS):
                target = left_anchors
            elif txt.startswith(('注册', '营业期限')) or '日期' in txt:
                target = right_anchors
            else:
                continue
            # Keep the label's left x and bottom y.
            target.append((item.lt[0], item.rb[1]))

        if not left_anchors or not right_anchors:
            # Without anchors on both sides the crops cannot be placed.
            return '', 0.0, '', 0.0

        # Each crop starts below the lowest label of its column; the
        # leftmost right-hand label separates the two columns.
        left_top = max(y for _, y in left_anchors)
        right_top = max(y for _, y in right_anchors)
        left_edge = min(x for x, _ in left_anchors)
        split_x = min(x for x, _ in right_anchors)

        left_lines = self._read_region(
            image[int(left_top):height, int(left_edge):int(split_x)])
        right_lines = self._read_region(
            image[int(right_top):height, int(split_x):width])

        left_txt, left_confs = self._parse_scope(left_lines)
        right_txt, right_confs = self._parse_address(right_lines)
        return (left_txt, self._mean(left_confs),
                right_txt, self._mean(right_confs))
@dataclass
class BussinessParse1(object):
    """Read the business scope and the address from a second licence layout.

    ``bs_detection`` returns the business scope and ``ad_detection`` the
    address. Both crop a region bounded by field labels found in the
    page-level OCR lines and re-read that crop with ``ocr``.

    Attributes:
        ocr: Engine used to re-read the crops. Its ``ocr(image)`` result is
            indexed as a flat list of ``[box, (text, score)]`` lines, with
            ``box`` being four ``[x, y]`` corners starting at the top-left.
            NOTE(review): newer PaddleOCR releases nest results per image;
            confirm against the installed version.
    """

    ocr: PaddleOCR

    # "营业场所" label, allowing up to three whitespace characters per gap.
    _PLACE_LABEL = re.compile(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所')

    @staticmethod
    def _bottom_mid(box):
        """Return the floored midpoint of the two bottom-corner y values."""
        return (box[2][1] + box[3][1]) // 2

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when empty."""
        return sum(values) / len(values) if values else 0.0

    def _read_region(self, region):
        """Run OCR on ``region``; return ``[]`` for an empty crop or result."""
        if region.size == 0:
            # Anchors can cross (e.g. top >= bottom); there is nothing to
            # recognise in a zero-area crop.
            return []
        return self.ocr.ocr(region) or []

    @staticmethod
    def _wrapped_prefix(lines, idx, label_h):
        """Return the text of the line above ``lines[idx]`` when it looks
        like address text wrapped above the label, else ``''``."""
        if idx == 0:
            # No predecessor; index -1 would wrap around to the last line.
            return ''
        prev_box, prev_txt = lines[idx - 1][0], lines[idx - 1][1][0]
        if (len(prev_txt) > 15
                and abs(prev_box[2][1] - lines[idx][0][0][1]) <= label_h * 1.8
                and '有限' not in prev_txt
                and '型' not in prev_txt):
            return prev_txt
        return ''

    def _follow_lines(self, lines, anchor_x, anchor_y, reach, max_dy):
        """Collect the lines that continue a block starting at the anchor.

        A line is taken when its left x is within ``reach`` of ``anchor_x``
        and its top y is at most ``max_dy`` from the bottom of the line
        taken before it. Returns ``(joined_text, confidences)``.
        """
        texts = []
        confs = []
        for line in lines:
            box = line[0]
            if (abs(box[0][0] - anchor_x) < reach
                    and abs(anchor_y - box[0][1]) <= max_dy):
                anchor_y = self._bottom_mid(box)
                texts.append(line[1][0])
                confs.append(line[1][1])
        return ''.join(texts), confs

    def bs_detection(self, image, raw_results):
        """Extract the business scope.

        Two candidates are built: one straight from ``raw_results`` (lines
        to the right of / below the "经营范围" label) and one from a crop
        re-read with ``ocr``. The longer text wins.

        Args:
            image: Array with a three-element ``shape`` that supports 2-D
                slicing.
            raw_results: Page-level OCR lines exposing ``txt``, ``lt``,
                ``rb`` and ``conf``.

        Returns:
            ``(scope_text, confidence)``; ``('', 0.0)`` when nothing is found.
        """
        _, width, _ = image.shape
        date_anchors = []      # (left x, bottom y) of date / term labels
        authority_tops = []    # top y of lines starting with "登记"
        raw_txt = ''
        raw_confs = []
        has_term_label = False

        for item in raw_results:
            txt = item.txt
            if '日期' in txt:
                date_anchors.append((item.lt[0], item.rb[1]))
            elif txt.startswith('营业期限'):
                has_term_label = True
                date_anchors.append((item.lt[0], item.rb[1]))
            elif txt.startswith('登记'):
                authority_tops.append(item.lt[1])
            elif txt.startswith('经营范围'):
                x0, y0 = item.lt
                x1 = item.rb[0]
                # Take every line starting within one label-width of the
                # label's right edge and not ending above the label's top.
                for other in raw_results:
                    if (abs(other.lt[0] - x1) <= abs(x1 - x0)
                            and other.rb[1] >= y0
                            and '登记' not in other.txt):
                        raw_txt += other.txt
                        raw_confs.append(other.conf)

        down_txt = ''
        down_confs = []
        if date_anchors and authority_tops:
            top = max(y for _, y in date_anchors)
            left = min(x for x, _ in date_anchors)
            bottom = int(authority_tops[0])
            region = image[int(top):bottom, int(left):width]
            region_w = region.shape[1]
            lines = self._read_region(region)
            for line in lines:
                if has_term_label:
                    # Keep lines starting in the left 7/24 of the crop.
                    if line[0][0][0] < 7 * region_w // 24:
                        down_txt += line[1][0]
                        down_confs.append(line[1][1])
                elif line[1][0].startswith('经营范围'):
                    label_top = line[0][0][1]
                    for other in lines:
                        if (other[0][2][1] < label_top
                                and other[0][0][0] < 7 * region_w // 24):
                            # NOTE(review): this appends the label line, not
                            # ``other``; that looks unintended, but the
                            # original behaviour is kept until confirmed.
                            down_txt += line[1][0]
                            down_confs.append(line[1][1])

        down_txt = down_txt.replace('经营范围', '')
        raw_txt = raw_txt.replace('经营范围', '')
        down_conf = self._mean(down_confs)
        if len(raw_txt) > len(down_txt):
            down_txt = raw_txt
            # raw_confs is non-empty here: raw_txt only grows together
            # with raw_confs.
            down_conf = sum(raw_confs) / len(raw_confs)
        return down_txt, down_conf

    def ad_detection(self, image, raw_results):
        """Extract the address ("住所" / "营业场所").

        The crop runs from just under the lowest name/type label to the
        highest of the representative / capital / date / term labels, and
        from the leftmost name/type label to the right image edge.

        Args:
            image: Array with a three-element ``shape`` that supports 2-D
                slicing.
            raw_results: Page-level OCR lines exposing ``txt``, ``lt`` and
                ``rb``.

        Returns:
            ``(address_text, confidence)``; ``('', 0.0)`` when nothing is
            found, including when the anchor labels are missing. The
            whole-crop fallback returns text with confidence ``0.0``.
        """
        _, width, _ = image.shape
        header_anchors = []    # (left x, bottom y) of name / type labels
        lower_tops = []        # top y of the labels that close the crop
        type_key = False
        for item in raw_results:
            txt = item.txt
            if txt.startswith(('名', '称')):
                header_anchors.append((item.lt[0], item.rb[1]))
            elif txt.startswith(('类', '型')):
                type_key = True
                header_anchors.append((item.lt[0], item.rb[1]))
            elif (txt.startswith(('法定代表', '负责人', '注册资本', '营业期限'))
                    or '日期' in txt):
                lower_tops.append(item.lt[1])

        if not header_anchors or not lower_tops:
            # Without both bounds the crop cannot be placed.
            return '', 0.0

        top = max(y for _, y in header_anchors) * 0.99
        left = min(x for x, _ in header_anchors)
        bottom = min(lower_tops)
        lines = self._read_region(
            image[int(top):int(bottom), int(left):width])

        top_txt = ''
        confs = []
        carried = ''   # wrapped line remembered by a "住" label for a later "所"
        last_idx = len(lines) - 1
        for idx, line in enumerate(lines):
            box, label = line[0], line[1][0]
            label_h = abs(box[2][1] - box[0][1])
            has_next = idx != last_idx
            # ``standalone``: the box holds only the label, so the address
            # starts in the next box. ``reach`` is the allowed horizontal
            # offset of a continuation line from the anchor.
            if label.startswith('所'):
                top_txt = carried
                standalone = len(label) == 1 and has_next
                reach = 250
            elif label.startswith('住'):
                top_txt = ''
                carried = self._wrapped_prefix(lines, idx, label_h) or carried
                standalone = (len(label) <= 2 or len(label) == 4) and has_next
                if standalone:
                    # Distance from the label's right edge to the next box.
                    reach = abs(box[1][0] - lines[idx + 1][0][0][0])
                else:
                    # Long text: one fifth of the box width.
                    reach = abs(box[1][0] - box[0][0]) // 5
            elif self._PLACE_LABEL.match(label):
                top_txt = self._wrapped_prefix(lines, idx, label_h)
                standalone = len(label) == 4 and has_next
                if standalone:
                    reach = abs(box[1][0] - lines[idx + 1][0][0][0]) * 2
                else:
                    # Long text: half of the box width.
                    reach = abs(box[1][0] - box[0][0]) // 2
            else:
                continue

            if standalone:
                anchor = lines[idx + 1][0]
            else:
                # The box already contains address text; keep it.
                anchor = box
                top_txt += label
                confs.append(line[1][1])
            tail, tail_confs = self._follow_lines(
                lines[idx + 1:], anchor[0][0], self._bottom_mid(anchor),
                reach, label_h * 1.2)
            top_txt += tail
            # Each continuation line contributes its confidence once.
            confs.extend(tail_confs)

        if not top_txt and type_key:
            # Fallback: no address label matched, so return the whole crop.
            top_txt = ''.join(line[1][0] for line in lines)
        top_txt = self._PLACE_LABEL.sub('', top_txt)
        # NOTE(review): this removes every "所"/"住" in the text, not only a
        # leading label; kept as in the original implementation.
        if top_txt.startswith(('所', '住')):
            top_txt = top_txt.replace('所', '').replace('住', '')
        return top_txt, self._mean(confs)
|