|
@@ -174,63 +174,68 @@ class BussinessParse1(object):
|
|
h, w, _ = image.shape
|
|
h, w, _ = image.shape
|
|
down_list = []
|
|
down_list = []
|
|
down_list2 = []
|
|
down_list2 = []
|
|
|
|
+ raw_txt = ''
|
|
|
|
+ down_txt = ''
|
|
|
|
+ raw_conf_list = []
|
|
|
|
+ down_conf_list = []
|
|
|
|
+ down_conf = 0.0
|
|
|
|
+ simple_key = False
|
|
for i in raw_results:
|
|
for i in raw_results:
|
|
- if bool(re.match(r'注册资本', i.txt)):
|
|
|
|
- [x0, _] = i.lt
|
|
|
|
- [_, y1] = i.rb
|
|
|
|
- down_list.append([x0, y1])
|
|
|
|
- elif bool(re.search(r'日期', i.txt)):
|
|
|
|
|
|
+ if bool(re.search(r'日期', i.txt)):
|
|
[x0, _] = i.lt
|
|
[x0, _] = i.lt
|
|
[_, y1] = i.rb
|
|
[_, y1] = i.rb
|
|
down_list.append([x0, y1])
|
|
down_list.append([x0, y1])
|
|
elif bool(re.match(r'营业期限', i.txt)):
|
|
elif bool(re.match(r'营业期限', i.txt)):
|
|
|
|
+ simple_key = True
|
|
[x0, _] = i.lt
|
|
[x0, _] = i.lt
|
|
[_, y1] = i.rb
|
|
[_, y1] = i.rb
|
|
down_list.append([x0, y1])
|
|
down_list.append([x0, y1])
|
|
elif bool(re.match(r'登记', i.txt)):
|
|
elif bool(re.match(r'登记', i.txt)):
|
|
[_, y0] = i.lt
|
|
[_, y0] = i.lt
|
|
down_list2.append(y0)
|
|
down_list2.append(y0)
|
|
-
|
|
|
|
- t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
|
|
|
|
- l2 = sorted(down_list, key=lambda x: x[0])[0][0]
|
|
|
|
- d2 = int(down_list2[0]) if len(down_list2) else h
|
|
|
|
- down_img = image[int(t2): d2, int(l2): w]
|
|
|
|
-
|
|
|
|
- down_result = self.ocr.ocr(down_img)
|
|
|
|
-
|
|
|
|
- down_conf_list = []
|
|
|
|
- down_conf = 0.0
|
|
|
|
- down_txt = ''
|
|
|
|
- for idx, res in enumerate(down_result):
|
|
|
|
- # print(res)
|
|
|
|
- if len(down_result) - 1 != idx:
|
|
|
|
- if bool(re.match(r'经营范围', res[1][0])):
|
|
|
|
- t = res[0][0][1]
|
|
|
|
- d = res[0][2][1]
|
|
|
|
- if len(down_result[idx - 1][1][0]) > 15 and abs(
|
|
|
|
- down_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
|
|
|
|
- d - t) * 1.8:
|
|
|
|
- down_txt += down_result[idx - 1][1][0]
|
|
|
|
- down_txt += res[1][0]
|
|
|
|
- down_conf_list.append(res[1][1])
|
|
|
|
- down_position = down_result[idx + 1][0][0][0]
|
|
|
|
- down_down_position = (down_result[idx + 1][0][2][1] + down_result[idx + 1][0][3][1]) // 2
|
|
|
|
- for x in down_result[idx + 1:]:
|
|
|
|
- print(abs(down_down_position - x[0][0][1]))
|
|
|
|
- print(abs(d - t) * 1.2)
|
|
|
|
- if abs(x[0][0][0] - down_position) < 130 and abs(down_down_position - x[0][0][1]) <= abs(
|
|
|
|
- d - t) * 1.8:
|
|
|
|
- down_down_position = (x[0][2][1] + x[0][3][1]) // 2
|
|
|
|
- if down_txt[-1] == x[1][0][0]:
|
|
|
|
- down_txt += x[1][0][1:]
|
|
|
|
- else:
|
|
|
|
- down_txt += x[1][0]
|
|
|
|
- down_conf_list.append(x[1][1])
|
|
|
|
- # print(down_txt)
|
|
|
|
|
|
+ elif bool(re.match(r'经营范围', i.txt)):
|
|
|
|
+ [x0, y0] = i.lt
|
|
|
|
+ [x1, _] = i.rb
|
|
|
|
+ # 第一方案:
|
|
|
|
+ for j in raw_results:
|
|
|
|
+ [x, _] = j.lt
|
|
|
|
+ [_, y] = j.rb
|
|
|
|
+ if abs(x - x1) <= abs(x1 - x0) and y >= y0 and '登记' not in j.txt:
|
|
|
|
+ raw_txt += j.txt
|
|
|
|
+ raw_conf_list.append(j.conf)
|
|
|
|
+ if len(down_list) and len(down_list2):
|
|
|
|
+ t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
|
|
|
|
+ l2 = sorted(down_list, key=lambda x: x[0])[0][0]
|
|
|
|
+ d2 = int(down_list2[0]) if len(down_list2) else h
|
|
|
|
+ down_img = image[int(t2): d2, int(l2): w]
|
|
|
|
+ h1, w1, _ = down_img.shape
|
|
|
|
+ down_result = self.ocr.ocr(down_img)
|
|
|
|
+ # print('simple_key', simple_key)
|
|
|
|
+ # 第二方案(检索到‘营业期限’关键词)
|
|
|
|
+ if simple_key:
|
|
|
|
+ # print('111')
|
|
|
|
+ for res in down_result:
|
|
|
|
+ l1 = res[0][0][0]
|
|
|
|
+ if l1 < (7 * w1 // 24):
|
|
|
|
+ down_txt += res[1][0]
|
|
|
|
+ down_conf_list.append(res[1][1])
|
|
|
|
+ # print(down_txt)
|
|
|
|
+ # 第三套方案
|
|
|
|
+ else:
|
|
|
|
+ for idx, res in enumerate(down_result):
|
|
|
|
+ if bool(re.match(r'经营范围', res[1][0])):
|
|
|
|
+ t = res[0][0][1]
|
|
|
|
+ for i in down_result:
|
|
|
|
+ if i[0][2][1] < t and i[0][0][0] < (7 * w1 // 24):
|
|
|
|
+ down_txt += res[1][0]
|
|
|
|
+ down_conf_list.append(res[1][1])
|
|
down_txt = down_txt.replace('经营范围', '')
|
|
down_txt = down_txt.replace('经营范围', '')
|
|
|
|
+ raw_txt = raw_txt.replace('经营范围', '')
|
|
if len(down_conf_list):
|
|
if len(down_conf_list):
|
|
down_conf = sum(down_conf_list) / len(down_conf_list)
|
|
down_conf = sum(down_conf_list) / len(down_conf_list)
|
|
-
|
|
|
|
|
|
+ if len(raw_txt) > len(down_txt):
|
|
|
|
+ down_txt = raw_txt
|
|
|
|
+ down_conf = sum(raw_conf_list) / len(raw_conf_list)
|
|
# cv2.imshow('11', down_img)
|
|
# cv2.imshow('11', down_img)
|
|
# cv2.waitKey(0)
|
|
# cv2.waitKey(0)
|
|
return down_txt, down_conf
|
|
return down_txt, down_conf
|
|
@@ -239,6 +244,10 @@ class BussinessParse1(object):
|
|
h, w, _ = image.shape
|
|
h, w, _ = image.shape
|
|
top_list1 = []
|
|
top_list1 = []
|
|
top_list2 = []
|
|
top_list2 = []
|
|
|
|
+ top_conf_list = []
|
|
|
|
+ top_conf = 0.0
|
|
|
|
+ top_txt = ''
|
|
|
|
+ last_key = ''
|
|
type_key = False
|
|
type_key = False
|
|
for i in raw_results:
|
|
for i in raw_results:
|
|
if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
|
|
if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
|
|
@@ -269,10 +278,6 @@ class BussinessParse1(object):
|
|
top_img = image[int(t1): int(d1), int(l1): w]
|
|
top_img = image[int(t1): int(d1), int(l1): w]
|
|
top_result = self.ocr.ocr(top_img)
|
|
top_result = self.ocr.ocr(top_img)
|
|
|
|
|
|
- top_conf_list = []
|
|
|
|
- top_conf = 0.0
|
|
|
|
- top_txt = ''
|
|
|
|
- last_key = ''
|
|
|
|
# 住所信息
|
|
# 住所信息
|
|
for idx, res in enumerate(top_result):
|
|
for idx, res in enumerate(top_result):
|
|
# print(res)
|
|
# print(res)
|