|
@@ -16,27 +16,27 @@ class BussinessParse0(object):
|
|
|
left_list = []
|
|
|
right_list = []
|
|
|
for i in raw_results:
|
|
|
- if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
|
|
|
+ if bool(re.match('法定代表', i.txt)) or bool(re.match('经营者', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
left_list.append([x0, y1])
|
|
|
- elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
|
|
|
+ elif bool(re.match('名', i.txt)) or bool(re.match('称', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
left_list.append([x0, y1])
|
|
|
- elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
|
|
|
+ elif bool(re.match('类', i.txt)) or bool(re.match('型', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
left_list.append([x0, y1])
|
|
|
- elif bool(re.match(r'注册', i.txt)):
|
|
|
+ elif bool(re.match('注册', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
right_list.append([x0, y1])
|
|
|
- elif bool(re.search(r'日期', i.txt)):
|
|
|
+ elif bool(re.search('日期', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
right_list.append([x0, y1])
|
|
|
- elif bool(re.match(r'营业期限', i.txt)):
|
|
|
+ elif bool(re.match('营业期限', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
right_list.append([x0, y1])
|
|
@@ -44,12 +44,10 @@ class BussinessParse0(object):
|
|
|
t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
|
|
|
l1 = sorted(left_list, key=lambda x: x[0])[0][0]
|
|
|
r1 = sorted(right_list, key=lambda x: x[0])[0][0]
|
|
|
-
|
|
|
- left_img = image[int(t1): h, int(l1): int(r1)]
|
|
|
- right_img = image[int(t2): h, int(r1): w]
|
|
|
+ left_img = image[int(t1):h, int(l1):int(r1)]
|
|
|
+ right_img = image[int(t2):h, int(r1):w]
|
|
|
left_result = self.ocr.ocr(left_img)
|
|
|
right_result = self.ocr.ocr(right_img)
|
|
|
-
|
|
|
left_conf_list = []
|
|
|
right_conf_list = []
|
|
|
left_conf = 0.0
|
|
@@ -57,105 +55,88 @@ class BussinessParse0(object):
|
|
|
left_txt = ''
|
|
|
right_txt = ''
|
|
|
for idx, res in enumerate(left_result):
|
|
|
- if len(left_result) - 1 != idx:
|
|
|
- if bool(re.match(r'经营范围', res[1][0])):
|
|
|
- t = res[0][0][1]
|
|
|
- d = res[0][2][1]
|
|
|
- # 判断上一条信息是否为经营范围内容
|
|
|
- if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
|
|
|
- d - t) * 1.8:
|
|
|
- left_txt += left_result[idx - 1][1][0]
|
|
|
-
|
|
|
- left_txt += res[1][0]
|
|
|
- left_conf_list.append(res[1][1])
|
|
|
+ if len(left_result) - 1 != idx and bool(re.match('经营范围', res[1][0])):
|
|
|
+ t = res[0][0][1]
|
|
|
+ d = res[0][2][1]
|
|
|
+ if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < (abs(d - t) * 1.8):
|
|
|
+ left_txt += left_result[idx - 1][1][0]
|
|
|
+ left_txt += res[1][0]
|
|
|
+ left_conf_list.append(res[1][1])
|
|
|
+ left_position = left_result[idx + 1][0][0][0]
|
|
|
+ left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
|
|
|
|
|
|
- left_position = left_result[idx + 1][0][0][0]
|
|
|
- left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
|
|
|
- for x in left_result[idx + 1:]:
|
|
|
- if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
|
|
|
- left_down_position = (x[0][2][1] + x[0][3][1]) // 2
|
|
|
- if left_txt[-1] == x[1][0][0]:
|
|
|
- left_txt += x[1][0][1:]
|
|
|
- else:
|
|
|
- left_txt += x[1][0]
|
|
|
- left_conf_list.append(x[1][1])
|
|
|
- # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
|
|
|
+ for x in left_result[idx + 1:]:
|
|
|
+ if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
|
|
|
+ left_down_position = (x[0][2][1] + x[0][3][1]) // 2
|
|
|
+ left_txt += x[1][0][1:] if left_txt[-1] == x[1][0][0] else x[1][0]
|
|
|
+ left_conf_list.append(x[1][1])
|
|
|
left_txt = left_txt.replace('经营范围', '')
|
|
|
if len(left_conf_list):
|
|
|
left_conf = sum(left_conf_list) / len(left_conf_list)
|
|
|
- # 住所信息
|
|
|
for idx, res in enumerate(right_result):
|
|
|
if len(right_result) - 1 != idx:
|
|
|
- if bool(re.match(r'所', res[1][0])):
|
|
|
+ if bool(re.match('所', res[1][0])):
|
|
|
right_txt = ''
|
|
|
t = res[0][0][1]
|
|
|
d = res[0][2][1]
|
|
|
if len(res[1][0]) == 1:
|
|
|
right_position = right_result[idx + 1][0][0][0]
|
|
|
right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
|
|
|
+
|
|
|
else:
|
|
|
right_txt += res[1][0]
|
|
|
right_conf_list.append(res[1][1])
|
|
|
right_position = right_result[idx][0][0][0]
|
|
|
right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
|
|
|
+
|
|
|
for x in right_result[idx + 1:]:
|
|
|
- if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
|
|
|
- d - t) * 1.2 and '登记机关' not in x[1][0]:
|
|
|
+ if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
|
|
|
right_down_position = (x[0][2][1] + x[0][3][1]) // 2
|
|
|
right_txt += x[1][0]
|
|
|
right_conf_list.append(x[1][1])
|
|
|
- elif bool(re.match(r'住', res[1][0])):
|
|
|
+ elif bool(re.match('住', res[1][0])):
|
|
|
right_txt = ''
|
|
|
t = res[0][0][1]
|
|
|
d = res[0][2][1]
|
|
|
-
|
|
|
if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
|
|
|
- # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
|
|
|
standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
|
|
|
right_position = right_result[idx + 1][0][0][0]
|
|
|
right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
|
|
|
+
|
|
|
else:
|
|
|
- # 此情况为长文本,则采用框的左右坐标的1/5为标准
|
|
|
standard = abs(res[0][1][0] - res[0][0][0]) // 5
|
|
|
- # 长文本直接添加至结果输出
|
|
|
right_txt += res[1][0]
|
|
|
right_conf_list.append(res[1][1])
|
|
|
right_position = res[0][0][0]
|
|
|
right_down_position = (res[0][2][1] + res[0][3][1]) // 2
|
|
|
-
|
|
|
for x in right_result[idx + 1:]:
|
|
|
- if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
|
|
|
- d - t) * 1.2 and '登记机关' not in x[1][0]:
|
|
|
+ if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
|
|
|
right_down_position = (x[0][2][1] + x[0][3][1]) // 2
|
|
|
right_txt += x[1][0]
|
|
|
right_conf_list.append(x[1][1])
|
|
|
- elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
|
|
|
+ elif bool(re.match('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
|
|
|
right_txt = ''
|
|
|
t = res[0][0][1]
|
|
|
d = res[0][2][1]
|
|
|
if len(res[1][0]) == 4:
|
|
|
- # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
|
|
|
standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
|
|
|
right_position = right_result[idx + 1][0][0][0]
|
|
|
right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
|
|
|
+
|
|
|
else:
|
|
|
- # 此情况为长文本,则采用框的左右坐标的1/5为标准
|
|
|
standard = abs(res[0][1][0] - res[0][0][0]) // 2
|
|
|
- # 长文本直接添加至结果输出
|
|
|
right_txt += res[1][0]
|
|
|
right_conf_list.append(res[1][1])
|
|
|
right_position = res[0][0][0]
|
|
|
right_down_position = (res[0][2][1] + res[0][3][1]) // 2
|
|
|
-
|
|
|
for x in right_result[idx + 1:]:
|
|
|
- if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
|
|
|
- d - t) * 1.2 and '登记机关' not in x[1][0]:
|
|
|
+ if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
|
|
|
right_down_position = (x[0][2][1] + x[0][3][1]) // 2
|
|
|
right_txt += x[1][0]
|
|
|
right_conf_list.append(x[1][1])
|
|
|
right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
|
|
|
- right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
|
|
|
- if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
|
|
|
+ right_txt = re.sub('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
|
|
|
+ if bool(re.match('所', right_txt)) or bool(re.match('住', right_txt)):
|
|
|
right_txt = right_txt.replace('所', '')
|
|
|
right_txt = right_txt.replace('住', '')
|
|
|
if len(right_conf_list):
|
|
@@ -181,22 +162,21 @@ class BussinessParse1(object):
|
|
|
down_conf = 0.0
|
|
|
simple_key = False
|
|
|
for i in raw_results:
|
|
|
- if bool(re.search(r'日期', i.txt)):
|
|
|
+ if bool(re.search('日期', i.txt)):
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
down_list.append([x0, y1])
|
|
|
- elif bool(re.match(r'营业期限', i.txt)):
|
|
|
+ elif bool(re.match('营业期限', i.txt)):
|
|
|
simple_key = True
|
|
|
[x0, _] = i.lt
|
|
|
[_, y1] = i.rb
|
|
|
down_list.append([x0, y1])
|
|
|
- elif bool(re.match(r'登记', i.txt)):
|
|
|
+ elif bool(re.match('登记', i.txt)):
|
|
|
[_, y0] = i.lt
|
|
|
down_list2.append(y0)
|
|
|
- elif bool(re.match(r'经营范围', i.txt)):
|
|
|
+ elif bool(re.match('经营范围', i.txt)):
|
|
|
[x0, y0] = i.lt
|
|
|
[x1, _] = i.rb
|
|
|
- # 第一方案:
|
|
|
for j in raw_results:
|
|
|
[x, _] = j.lt
|
|
|
[_, y] = j.rb
|
|
@@ -207,28 +187,21 @@ class BussinessParse1(object):
|
|
|
t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
|
|
|
l2 = sorted(down_list, key=lambda x: x[0])[0][0]
|
|
|
d2 = int(down_list2[0]) if len(down_list2) else h
|
|
|
- down_img = image[int(t2): d2, int(l2): w]
|
|
|
+ down_img = image[int(t2):d2, int(l2):w]
|
|
|
h1, w1, _ = down_img.shape
|
|
|
down_result = self.ocr.ocr(down_img)
|
|
|
- # print('simple_key', simple_key)
|
|
|
- # 第二方案(检索到‘营业期限’关键词)
|
|
|
- if simple_key:
|
|
|
- # print('111')
|
|
|
- for res in down_result:
|
|
|
+ for res in down_result:
|
|
|
+ if simple_key:
|
|
|
l1 = res[0][0][0]
|
|
|
- if l1 < (7 * w1 // 24):
|
|
|
+ if l1 < 7 * w1 // 24:
|
|
|
down_txt += res[1][0]
|
|
|
down_conf_list.append(res[1][1])
|
|
|
- # print(down_txt)
|
|
|
- # 第三套方案
|
|
|
- else:
|
|
|
- for idx, res in enumerate(down_result):
|
|
|
- if bool(re.match(r'经营范围', res[1][0])):
|
|
|
- t = res[0][0][1]
|
|
|
- for i in down_result:
|
|
|
- if i[0][2][1] < t and i[0][0][0] < (7 * w1 // 24):
|
|
|
- down_txt += res[1][0]
|
|
|
- down_conf_list.append(res[1][1])
|
|
|
+ elif bool(re.match('经营范围', res[1][0])):
|
|
|
+ t = res[0][0][1]
|
|
|
+ for i in down_result:
|
|
|
+ if i[0][2][1] < t and i[0][0][0] < 7 * w1 // 24:
|
|
|
+ down_txt += res[1][0]
|
|
|
+ down_conf_list.append(res[1][1])
|
|
|
down_txt = down_txt.replace('经营范围', '')
|
|
|
raw_txt = raw_txt.replace('经营范围', '')
|
|
|
if len(down_conf_list):
|
|
@@ -236,8 +209,6 @@ class BussinessParse1(object):
|
|
|
if len(raw_txt) > len(down_txt):
|
|
|
down_txt = raw_txt
|
|
|
down_conf = sum(raw_conf_list) / len(raw_conf_list)
|
|
|
- # cv2.imshow('11', down_img)
|
|
|
- # cv2.waitKey(0)
|
|
|
return down_txt, down_conf
|
|
|
|
|
|
def ad_detection(self, image, raw_results):
|