zeke-chin преди 2 години
родител
ревизия
68a40031b0
променени са 2 файла, в които са добавени 51 реда и са изтрити 82 реда
  1. 48 77
      blfe_core/business_parse.py
  2. 3 5
      blfe_core/parser.py

+ 48 - 77
blfe_core/business_parse.py

@@ -16,27 +16,27 @@ class BussinessParse0(object):
         left_list = []
         right_list = []
         for i in raw_results:
-            if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
+            if bool(re.match('法定代表', i.txt)) or bool(re.match('经营者', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 left_list.append([x0, y1])
-            elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
+            elif bool(re.match('名', i.txt)) or bool(re.match('称', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 left_list.append([x0, y1])
-            elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
+            elif bool(re.match('类', i.txt)) or bool(re.match('型', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 left_list.append([x0, y1])
-            elif bool(re.match(r'注册', i.txt)):
+            elif bool(re.match('注册', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 right_list.append([x0, y1])
-            elif bool(re.search(r'日期', i.txt)):
+            elif bool(re.search('日期', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 right_list.append([x0, y1])
-            elif bool(re.match(r'营业期限', i.txt)):
+            elif bool(re.match('营业期限', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 right_list.append([x0, y1])
@@ -44,12 +44,10 @@ class BussinessParse0(object):
         t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
         l1 = sorted(left_list, key=lambda x: x[0])[0][0]
         r1 = sorted(right_list, key=lambda x: x[0])[0][0]
-
-        left_img = image[int(t1): h, int(l1): int(r1)]
-        right_img = image[int(t2): h, int(r1): w]
+        left_img = image[int(t1):h, int(l1):int(r1)]
+        right_img = image[int(t2):h, int(r1):w]
         left_result = self.ocr.ocr(left_img)
         right_result = self.ocr.ocr(right_img)
-
         left_conf_list = []
         right_conf_list = []
         left_conf = 0.0
@@ -57,105 +55,88 @@ class BussinessParse0(object):
         left_txt = ''
         right_txt = ''
         for idx, res in enumerate(left_result):
-            if len(left_result) - 1 != idx:
-                if bool(re.match(r'经营范围', res[1][0])):
-                    t = res[0][0][1]
-                    d = res[0][2][1]
-                    # 判断上一条信息是否为经营范围内容
-                    if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
-                            d - t) * 1.8:
-                        left_txt += left_result[idx - 1][1][0]
-
-                    left_txt += res[1][0]
-                    left_conf_list.append(res[1][1])
+            if len(left_result) - 1 != idx and bool(re.match('经营范围', res[1][0])):
+                t = res[0][0][1]
+                d = res[0][2][1]
+                if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < (abs(d - t) * 1.8):
+                    left_txt += left_result[idx - 1][1][0]
+                left_txt += res[1][0]
+                left_conf_list.append(res[1][1])
+                left_position = left_result[idx + 1][0][0][0]
+                left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
 
-                    left_position = left_result[idx + 1][0][0][0]
-                    left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
-                    for x in left_result[idx + 1:]:
-                        if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
-                            left_down_position = (x[0][2][1] + x[0][3][1]) // 2
-                            if left_txt[-1] == x[1][0][0]:
-                                left_txt += x[1][0][1:]
-                            else:
-                                left_txt += x[1][0]
-                            left_conf_list.append(x[1][1])
-                # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
+                for x in left_result[idx + 1:]:
+                    if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
+                        left_down_position = (x[0][2][1] + x[0][3][1]) // 2
+                        left_txt += x[1][0][1:] if left_txt[-1] == x[1][0][0] else x[1][0]
+                        left_conf_list.append(x[1][1])
             left_txt = left_txt.replace('经营范围', '')
             if len(left_conf_list):
                 left_conf = sum(left_conf_list) / len(left_conf_list)
-        # 住所信息
         for idx, res in enumerate(right_result):
             if len(right_result) - 1 != idx:
-                if bool(re.match(r'所', res[1][0])):
+                if bool(re.match('所', res[1][0])):
                     right_txt = ''
                     t = res[0][0][1]
                     d = res[0][2][1]
                     if len(res[1][0]) == 1:
                         right_position = right_result[idx + 1][0][0][0]
                         right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
+
                     else:
                         right_txt += res[1][0]
                         right_conf_list.append(res[1][1])
                         right_position = right_result[idx][0][0][0]
                         right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
+
                     for x in right_result[idx + 1:]:
-                        if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
-                                d - t) * 1.2 and '登记机关' not in x[1][0]:
+                        if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
                             right_down_position = (x[0][2][1] + x[0][3][1]) // 2
                             right_txt += x[1][0]
                             right_conf_list.append(x[1][1])
-                elif bool(re.match(r'住', res[1][0])):
+                elif bool(re.match('住', res[1][0])):
                     right_txt = ''
                     t = res[0][0][1]
                     d = res[0][2][1]
-
                     if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
-                        # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
                         standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
                         right_position = right_result[idx + 1][0][0][0]
                         right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
+
                     else:
-                        # 此情况为长文本,则采用框的左右坐标的1/5为标准
                         standard = abs(res[0][1][0] - res[0][0][0]) // 5
-                        # 长文本直接添加至结果输出
                         right_txt += res[1][0]
                         right_conf_list.append(res[1][1])
                         right_position = res[0][0][0]
                         right_down_position = (res[0][2][1] + res[0][3][1]) // 2
-
                     for x in right_result[idx + 1:]:
-                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
-                                d - t) * 1.2 and '登记机关' not in x[1][0]:
+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
                             right_down_position = (x[0][2][1] + x[0][3][1]) // 2
                             right_txt += x[1][0]
                             right_conf_list.append(x[1][1])
-                elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
+                elif bool(re.match('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
                     right_txt = ''
                     t = res[0][0][1]
                     d = res[0][2][1]
                     if len(res[1][0]) == 4:
-                        # 若‘住所’或‘经营场所’为独立框,则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
                         standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
                         right_position = right_result[idx + 1][0][0][0]
                         right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
+
                     else:
-                        # 此情况为长文本,则采用框的左右坐标的1/5为标准
                         standard = abs(res[0][1][0] - res[0][0][0]) // 2
-                        # 长文本直接添加至结果输出
                         right_txt += res[1][0]
                         right_conf_list.append(res[1][1])
                         right_position = res[0][0][0]
                         right_down_position = (res[0][2][1] + res[0][3][1]) // 2
-
                     for x in right_result[idx + 1:]:
-                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
-                                d - t) * 1.2 and '登记机关' not in x[1][0]:
+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < (abs(d - t) * 1.2) and '登记机关' not in x[1][0]:
                             right_down_position = (x[0][2][1] + x[0][3][1]) // 2
                             right_txt += x[1][0]
                             right_conf_list.append(x[1][1])
             right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
-            right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
-            if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
+            right_txt = re.sub('经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
+            if bool(re.match('所', right_txt)) or bool(re.match('住', right_txt)):
                 right_txt = right_txt.replace('所', '')
                 right_txt = right_txt.replace('住', '')
             if len(right_conf_list):
@@ -181,22 +162,21 @@ class BussinessParse1(object):
         down_conf = 0.0
         simple_key = False
         for i in raw_results:
-            if bool(re.search(r'日期', i.txt)):
+            if bool(re.search('日期', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 down_list.append([x0, y1])
-            elif bool(re.match(r'营业期限', i.txt)):
+            elif bool(re.match('营业期限', i.txt)):
                 simple_key = True
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 down_list.append([x0, y1])
-            elif bool(re.match(r'登记', i.txt)):
+            elif bool(re.match('登记', i.txt)):
                 [_, y0] = i.lt
                 down_list2.append(y0)
-            elif bool(re.match(r'经营范围', i.txt)):
+            elif bool(re.match('经营范围', i.txt)):
                 [x0, y0] = i.lt
                 [x1, _] = i.rb
-                # 第一方案:
                 for j in raw_results:
                     [x, _] = j.lt
                     [_, y] = j.rb
@@ -207,28 +187,21 @@ class BussinessParse1(object):
             t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
             l2 = sorted(down_list, key=lambda x: x[0])[0][0]
             d2 = int(down_list2[0]) if len(down_list2) else h
-            down_img = image[int(t2): d2, int(l2): w]
+            down_img = image[int(t2):d2, int(l2):w]
             h1, w1, _ = down_img.shape
             down_result = self.ocr.ocr(down_img)
-            # print('simple_key', simple_key)
-            # 第二方案(检索到‘营业期限’关键词)
-            if simple_key:
-                # print('111')
-                for res in down_result:
+            for res in down_result:
+                if simple_key:
                     l1 = res[0][0][0]
-                    if l1 < (7 * w1 // 24):
+                    if l1 < 7 * w1 // 24:
                         down_txt += res[1][0]
                         down_conf_list.append(res[1][1])
-                # print(down_txt)
-            # 第三套方案
-            else:
-                for idx, res in enumerate(down_result):
-                    if bool(re.match(r'经营范围', res[1][0])):
-                        t = res[0][0][1]
-                        for i in down_result:
-                            if i[0][2][1] < t and i[0][0][0] < (7 * w1 // 24):
-                                down_txt += res[1][0]
-                                down_conf_list.append(res[1][1])
+                elif bool(re.match('经营范围', res[1][0])):
+                    t = res[0][0][1]
+                    for i in down_result:
+                        if i[0][2][1] < t and i[0][0][0] < 7 * w1 // 24:
+                            down_txt += res[1][0]
+                            down_conf_list.append(res[1][1])
         down_txt = down_txt.replace('经营范围', '')
         raw_txt = raw_txt.replace('经营范围', '')
         if len(down_conf_list):
@@ -236,8 +209,6 @@ class BussinessParse1(object):
         if len(raw_txt) > len(down_txt):
             down_txt = raw_txt
             down_conf = sum(raw_conf_list) / len(raw_conf_list)
-        # cv2.imshow('11', down_img)
-        # cv2.waitKey(0)
         return down_txt, down_conf
 
     def ad_detection(self, image, raw_results):

+ 3 - 5
blfe_core/parser.py

@@ -253,8 +253,7 @@ class BusinessLicenseParser0(Parser):
         """
         经营范围
         """
-        ocr = PaddleOCR(use_gpu=True)
-        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(self.ocr).detection(self.image, self.raw_results)
+        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(self.ppocr).detection(self.image, self.raw_results)
         if bool(bs_txt):
             self.res['business_scope'] = RecItem(bs_txt, bs_conf)
 
@@ -349,7 +348,6 @@ class BusinessLicenseParser1(Parser):
     def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List, ppocr):
         Parser.__init__(self, ocr_results, raw_results, ppocr)
         self.image = image
-        self.ocr = PaddleOCR(use_gpu=True)
 
     def social_code(self):
         """
@@ -520,7 +518,7 @@ class BusinessLicenseParser1(Parser):
         """
         print('-------------经营范围处理开始--------------')
 
-        bs_txt, bs_conf = BussinessParse1(self.ocr).bs_detection(self.image, self.raw_results)
+        bs_txt, bs_conf = BussinessParse1(self.ppocr).bs_detection(self.image, self.raw_results)
 
         if bool(bs_txt):
             self.res['business_scope'] = RecItem(bs_txt, bs_conf)
@@ -540,7 +538,7 @@ class BusinessLicenseParser1(Parser):
         住所
         """
         # 切割方案
-        ad_txt, ad_conf = BussinessParse1(self.ocr).ad_detection(self.image, self.raw_results)
+        ad_txt, ad_conf = BussinessParse1(self.ppocr).ad_detection(self.image, self.raw_results)
         # 关键字方案
         add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
         if add_or_0: