2 år sedan · 68a40031b0
--- a/blfe_core/business_parse.py
+++ b/blfe_core/business_parse.py
@@ -16,27 +16,27 @@ class BussinessParse0(object):
 
				         left_list = []
			
 
				         right_list = []
			
 
				         for i in raw_results:
			
 
				-            if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
			
 
				+            if bool(re.match('法定代表', i.txt)) or bool(re.match('经营者', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 left_list.append([x0, y1])
			
 
				-            elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
			
 
				+            elif bool(re.match('名', i.txt)) or bool(re.match('称', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 left_list.append([x0, y1])
			
 
				-            elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
			
 
				+            elif bool(re.match('类', i.txt)) or bool(re.match('型', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 left_list.append([x0, y1])
			
 
				-            elif bool(re.match(r'注册', i.txt)):
			
 
				+            elif bool(re.match('注册', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 right_list.append([x0, y1])
			
 
				-            elif bool(re.search(r'日期', i.txt)):
			
 
				+            elif bool(re.search('日期', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 right_list.append([x0, y1])
			
 
				-            elif bool(re.match(r'营业期限', i.txt)):
			
 
				+            elif bool(re.match('营业期限', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 right_list.append([x0, y1])
			
@@ -44,12 +44,10 @@ class BussinessParse0(object):
 
				         t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
			
 
				         l1 = sorted(left_list, key=lambda x: x[0])[0][0]
			
 
				         r1 = sorted(right_list, key=lambda x: x[0])[0][0]
			
 
				-
			
 
				-        left_img = image[int(t1): h, int(l1): int(r1)]
			
 
				-        right_img = image[int(t2): h, int(r1): w]
			
 
				+        left_img = image[int(t1):h, int(l1):int(r1)]
			
 
				+        right_img = image[int(t2):h, int(r1):w]
			
 
				         left_result = self.ocr.ocr(left_img)
			
 
				         right_result = self.ocr.ocr(right_img)
			
 
				-
			
 
				         left_conf_list = []
			
 
				         right_conf_list = []
			
 
				         left_conf = 0.0
			
@@ -57,105 +55,88 @@ class BussinessParse0(object):
 
				         left_txt = ''
			
 
				         right_txt = ''
			
 
				         for idx, res in enumerate(left_result):
			
 
				-            if len(left_result) - 1 != idx:
			
 
				-                if bool(re.match(r'经营范围', res[1][0])):
			
 
				-                    t = res[0][0][1]
			
 
				-                    d = res[0][2][1]
			
 
				-                    # 判断上一条信息是否为经营范围内容
			
 
				-                    if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
			
 
				-                            d - t) * 1.8:
			
 
				-                        left_txt += left_result[idx - 1][1][0]
			
 
				-
			
 
				-                    left_txt += res[1][0]
			
 
				-                    left_conf_list.append(res[1][1])
			
 
				+            if len(left_result) - 1 != idx and bool(re.match('经营范围', res[1][0])):
			
 
				+                t = res[0][0][1]
			
 
				+                d = res[0][2][1]
			
 
				+                if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < (abs(d - t) * 1.8):
			
 
				+                    left_txt += left_result[idx - 1][1][0]
			
 
				+                left_txt += res[1][0]
			
 
				+                left_conf_list.append(res[1][1])
			
 
				+                left_position = left_result[idx + 1][0][0][0]
			
 
				+                left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
			
 
				 
			
 
				-                    left_position = left_result[idx + 1][0][0][0]
			
 
				-                    left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
			
 
				-                    for x in left_result[idx + 1:]:
			
 
				-                        if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
			
 
				-                            left_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				-                            if left_txt[-1] == x[1][0][0]:
			
 
				-                                left_txt += x[1][0][1:]
			
 
				-                            else:
			
 
				-                                left_txt += x[1][0]
			
 
				-                            left_conf_list.append(x[1][1])
			
 
				-                # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
			
 
				+                for x in left_result[idx + 1:]:
			
 
				+                    if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
			
 
				+                        left_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                        left_txt += x[1][0][1:] if left_txt[-1] == x[1][0][0] else x[1][0]
			
 
				+                        left_conf_list.append(x[1][1])
			
 
				             left_txt = left_txt.replace('经营范围', '')
			
 
				             if len(left_conf_list):
			
 
				                 left_conf = sum(left_conf_list) / len(left_conf_list)
			
 
				-        # 住所信息
			
 
				         for idx, res in enumerate(right_result):
			
 
				             if len(right_result) - 1 != idx:
			
 
				-                if bool(re.match(r'所', res[1][0])):
			
 
				+                if bool(re.match('所', res[1][0])):
			
 
				                     right_txt = ''
			
 
				                     t = res[0][0][1]
			
 
				                     d = res[0][2][1]
			
 
				                     if len(res[1][0]) == 1:
			
 
				                         right_position = right_result[idx + 1][0][0][0]
			
 
				                         right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
			
 
				+
			
 
				                     else:
			
 
				                         right_txt += res[1][0]
			
 
				                         right_conf_list.append(res[1][1])
			
 
				                         right_position = right_result[idx][0][0][0]
			
 
				                         right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
			
 
				+
			
 
				                     for x in right_result[idx + 1:]:
			
 
				-                        if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
			
 
				-                                d - t) * 1.2 and '登记机关' not in x[1][0]:
			
 
				+                        if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
			
 
				                             right_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				                             right_txt += x[1][0]
			
 
				                             right_conf_list.append(x[1][1])
			
 
				-                elif bool(re.match(r'住', res[1][0])):
			
 
				+                elif bool(re.match('住', res[1][0])):
			
 
				                     right_txt = ''
			
 
				                     t = res[0][0][1]
			
 
				                     d = res[0][2][1]
			
 
				-
			
 
				                     if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
			
 
				-                        # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
			
 
				                         standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
			
 
				                         right_position = right_result[idx + 1][0][0][0]
			
 
				                         right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
			
 
				+
			
 
				                     else:
			
 
				-                        # 此情况为长文本，则采用框的左右坐标的1/5为标准
			
 
				                         standard = abs(res[0][1][0] - res[0][0][0]) // 5
			
 
				-                        # 长文本直接添加至结果输出
			
 
				                         right_txt += res[1][0]
			
 
				                         right_conf_list.append(res[1][1])
			
 
				                         right_position = res[0][0][0]
			
 
				                         right_down_position = (res[0][2][1] + res[0][3][1]) // 2
			
 
				-
			
 
				                     for x in right_result[idx + 1:]:
			
 
				-                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
			
 
				-                                d - t) * 1.2 and '登记机关' not in x[1][0]:
			
 
				+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
			
 
				                             right_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				                             right_txt += x[1][0]
			
 
				                             right_conf_list.append(x[1][1])
			
 
				-                elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
			
 
				+                elif bool(re.match('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
			
 
				                     right_txt = ''
			
 
				                     t = res[0][0][1]
			
 
				                     d = res[0][2][1]
			
 
				                     if len(res[1][0]) == 4:
			
 
				-                        # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
			
 
				                         standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
			
 
				                         right_position = right_result[idx + 1][0][0][0]
			
 
				                         right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
			
 
				+
			
 
				                     else:
			
 
				-                        # 此情况为长文本，则采用框的左右坐标的1/5为标准
			
 
				                         standard = abs(res[0][1][0] - res[0][0][0]) // 2
			
 
				-                        # 长文本直接添加至结果输出
			
 
				                         right_txt += res[1][0]
			
 
				                         right_conf_list.append(res[1][1])
			
 
				                         right_position = res[0][0][0]
			
 
				                         right_down_position = (res[0][2][1] + res[0][3][1]) // 2
			
 
				-
			
 
				                     for x in right_result[idx + 1:]:
			
 
				-                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
			
 
				-                                d - t) * 1.2 and '登记机关' not in x[1][0]:
			
 
				+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
			
 
				                             right_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				                             right_txt += x[1][0]
			
 
				                             right_conf_list.append(x[1][1])
			
 
				             right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
			
 
				-            right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
			
 
				-            if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
			
 
				+            right_txt = re.sub('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
			
 
				+            if bool(re.match('所', right_txt)) or bool(re.match('住', right_txt)):
			
 
				                 right_txt = right_txt.replace('所', '')
			
 
				                 right_txt = right_txt.replace('住', '')
			
 
				             if len(right_conf_list):
			
@@ -181,22 +162,21 @@ class BussinessParse1(object):
 
				         down_conf = 0.0
			
 
				         simple_key = False
			
 
				         for i in raw_results:
			
 
				-            if bool(re.search(r'日期', i.txt)):
			
 
				+            if bool(re.search('日期', i.txt)):
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 down_list.append([x0, y1])
			
 
				-            elif bool(re.match(r'营业期限', i.txt)):
			
 
				+            elif bool(re.match('营业期限', i.txt)):
			
 
				                 simple_key = True
			
 
				                 [x0, _] = i.lt
			
 
				                 [_, y1] = i.rb
			
 
				                 down_list.append([x0, y1])
			
 
				-            elif bool(re.match(r'登记', i.txt)):
			
 
				+            elif bool(re.match('登记', i.txt)):
			
 
				                 [_, y0] = i.lt
			
 
				                 down_list2.append(y0)
			
 
				-            elif bool(re.match(r'经营范围', i.txt)):
			
 
				+            elif bool(re.match('经营范围', i.txt)):
			
 
				                 [x0, y0] = i.lt
			
 
				                 [x1, _] = i.rb
			
 
				-                # 第一方案：
			
 
				                 for j in raw_results:
			
 
				                     [x, _] = j.lt
			
 
				                     [_, y] = j.rb
			
@@ -207,28 +187,21 @@ class BussinessParse1(object):
 
				             t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
			
 
				             l2 = sorted(down_list, key=lambda x: x[0])[0][0]
			
 
				             d2 = int(down_list2[0]) if len(down_list2) else h
			
 
				-            down_img = image[int(t2): d2, int(l2): w]
			
 
				+            down_img = image[int(t2):d2, int(l2):w]
			
 
				             h1, w1, _ = down_img.shape
			
 
				             down_result = self.ocr.ocr(down_img)
			
 
				-            # print('simple_key', simple_key)
			
 
				-            # 第二方案(检索到‘营业期限’关键词)
			
 
				-            if simple_key:
			
 
				-                # print('111')
			
 
				-                for res in down_result:
			
 
				+            for res in down_result:
			
 
				+                if simple_key:
			
 
				                     l1 = res[0][0][0]
			
 
				-                    if l1 < (7 * w1 // 24):
			
 
				+                    if l1 < 7 * w1 // 24:
			
 
				                         down_txt += res[1][0]
			
 
				                         down_conf_list.append(res[1][1])
			
 
				-                # print(down_txt)
			
 
				-            # 第三套方案
			
 
				-            else:
			
 
				-                for idx, res in enumerate(down_result):
			
 
				-                    if bool(re.match(r'经营范围', res[1][0])):
			
 
				-                        t = res[0][0][1]
			
 
				-                        for i in down_result:
			
 
				-                            if i[0][2][1] < t and i[0][0][0] < (7 * w1 // 24):
			
 
				-                                down_txt += res[1][0]
			
 
				-                                down_conf_list.append(res[1][1])
			
 
				+                elif bool(re.match('经营范围', res[1][0])):
			
 
				+                    t = res[0][0][1]
			
 
				+                    for i in down_result:
			
 
				+                        if i[0][2][1] < t and i[0][0][0] < 7 * w1 // 24:
			
 
				+                            down_txt += res[1][0]
			
 
				+                            down_conf_list.append(res[1][1])
			
 
				         down_txt = down_txt.replace('经营范围', '')
			
 
				         raw_txt = raw_txt.replace('经营范围', '')
			
 
				         if len(down_conf_list):
			
@@ -236,8 +209,6 @@ class BussinessParse1(object):
 
				         if len(raw_txt) > len(down_txt):
			
 
				             down_txt = raw_txt
			
 
				             down_conf = sum(raw_conf_list) / len(raw_conf_list)
			
 
				-        # cv2.imshow('11', down_img)
			
 
				-        # cv2.waitKey(0)
			
 
				         return down_txt, down_conf
			
 
				 
			
 
				     def ad_detection(self, image, raw_results):
			
--- a/blfe_core/parser.py
+++ b/blfe_core/parser.py
@@ -253,8 +253,7 @@ class BusinessLicenseParser0(Parser):
 
				         """
			
 
				         经营范围
			
 
				         """
			
 
				-        ocr = PaddleOCR(use_gpu=True)
			
 
				-        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(self.ocr).detection(self.image, self.raw_results)
			
 
				+        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(self.ppocr).detection(self.image, self.raw_results)
			
 
				         if bool(bs_txt):
			
 
				             self.res['business_scope'] = RecItem(bs_txt, bs_conf)
			
 
				 
			
@@ -349,7 +348,6 @@ class BusinessLicenseParser1(Parser):
 
				     def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List, ppocr):
			
 
				         Parser.__init__(self, ocr_results, raw_results, ppocr)
			
 
				         self.image = image
			
 
				-        self.ocr = PaddleOCR(use_gpu=True)
			
 
				 
			
 
				     def social_code(self):
			
 
				         """
			
@@ -520,7 +518,7 @@ class BusinessLicenseParser1(Parser):
 
				         """
			
 
				         print('-------------经营范围处理开始--------------')
			
 
				 
			
 
				-        bs_txt, bs_conf = BussinessParse1(self.ocr).bs_detection(self.image, self.raw_results)
			
 
				+        bs_txt, bs_conf = BussinessParse1(self.ppocr).bs_detection(self.image, self.raw_results)
			
 
				 
			
 
				         if bool(bs_txt):
			
 
				             self.res['business_scope'] = RecItem(bs_txt, bs_conf)
			
@@ -540,7 +538,7 @@ class BusinessLicenseParser1(Parser):
 
				         住所
			
 
				         """
			
 
				         # 切割方案
			
 
				-        ad_txt, ad_conf = BussinessParse1(self.ocr).ad_detection(self.image, self.raw_results)
			
 
				+        ad_txt, ad_conf = BussinessParse1(self.ppocr).ad_detection(self.image, self.raw_results)
			
 
				         # 关键字方案
			
 
				         add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
			
 
				         if add_or_0: