před 2 roky · e16414b390
--- a/core/business_parse.py
+++ b/core/business_parse.py
@@ -0,0 +1,372 @@
 
															+import re
														
 
															+from dataclasses import dataclass
														
 
															+import cv2
														
 
															+from paddleocr import PaddleOCR
														
 
															+
														
 
															+
														
 
															+@dataclass
														
 
															+class BussinessParse0(object):
														
 
															+    """
														
 
															+    经营范围
														
 
															+    """
														
 
															+    ocr: PaddleOCR
														
 
															+
														
 
															+    def detection(self, image, raw_results):
														
 
															+        h, w, _ = image.shape
														
 
															+        left_list = []
														
 
															+        right_list = []
														
 
															+        for i in raw_results:
														
 
															+            if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                left_list.append([x0, y1])
														
 
															+            elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                left_list.append([x0, y1])
														
 
															+            elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                left_list.append([x0, y1])
														
 
															+            elif bool(re.match(r'注册', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                right_list.append([x0, y1])
														
 
															+            elif bool(re.search(r'日期', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                right_list.append([x0, y1])
														
 
															+            elif bool(re.match(r'营业期限', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                right_list.append([x0, y1])
														
 
															+        t1 = sorted(left_list, key=lambda x: x[1], reverse=True)[0][1]
														
 
															+        t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
														
 
															+        l1 = sorted(left_list, key=lambda x: x[0])[0][0]
														
 
															+        r1 = sorted(right_list, key=lambda x: x[0])[0][0]
														
 
															+
														
 
															+        left_img = image[int(t1): h, int(l1): int(r1)]
														
 
															+        right_img = image[int(t2): h, int(r1): w]
														
 
															+        left_result = self.ocr.ocr(left_img)
														
 
															+        right_result = self.ocr.ocr(right_img)
														
 
															+
														
 
															+        left_conf_list = []
														
 
															+        right_conf_list = []
														
 
															+        left_conf = 0.0
														
 
															+        right_conf = 0.0
														
 
															+        left_txt = ''
														
 
															+        right_txt = ''
														
 
															+        for idx, res in enumerate(left_result):
														
 
															+            if len(left_result) - 1 != idx:
														
 
															+                if bool(re.match(r'经营范围', res[1][0])):
														
 
															+                    t = res[0][0][1]
														
 
															+                    d = res[0][2][1]
														
 
															+                    # 判断上一条信息是否为经营范围内容
														
 
															+                    if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
														
 
															+                            d - t) * 1.8:
														
 
															+                        left_txt += left_result[idx - 1][1][0]
														
 
															+
														
 
															+                    left_txt += res[1][0]
														
 
															+                    left_conf_list.append(res[1][1])
														
 
															+
														
 
															+                    left_position = left_result[idx + 1][0][0][0]
														
 
															+                    left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
														
 
															+                    for x in left_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
														
 
															+                            left_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            if left_txt[-1] == x[1][0][0]:
														
 
															+                                left_txt += x[1][0][1:]
														
 
															+                            else:
														
 
															+                                left_txt += x[1][0]
														
 
															+                            left_conf_list.append(x[1][1])
														
 
															+                # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
														
 
															+            left_txt = left_txt.replace('经营范围', '')
														
 
															+            if len(left_conf_list):
														
 
															+                left_conf = sum(left_conf_list) / len(left_conf_list)
														
 
															+        # 住所信息
														
 
															+        for idx, res in enumerate(right_result):
														
 
															+            if len(right_result) - 1 != idx:
														
 
															+                if bool(re.match(r'所', res[1][0])):
														
 
															+                    right_txt = ''
														
 
															+                    t = res[0][0][1]
														
 
															+                    d = res[0][2][1]
														
 
															+                    if len(res[1][0]) == 1:
														
 
															+                        right_position = right_result[idx + 1][0][0][0]
														
 
															+                        right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
														
 
															+                    else:
														
 
															+                        right_txt += res[1][0]
														
 
															+                        right_conf_list.append(res[1][1])
														
 
															+                        right_position = right_result[idx][0][0][0]
														
 
															+                        right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
														
 
															+                    for x in right_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
														
 
															+                                d - t) * 1.2 and '登记机关' not in x[1][0]:
														
 
															+                            right_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            right_txt += x[1][0]
														
 
															+                            right_conf_list.append(x[1][1])
														
 
															+                elif bool(re.match(r'住', res[1][0])):
														
 
															+                    right_txt = ''
														
 
															+                    t = res[0][0][1]
														
 
															+                    d = res[0][2][1]
														
 
															+
														
 
															+                    if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
														
 
															+                        # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
														
 
															+                        standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
														
 
															+                        right_position = right_result[idx + 1][0][0][0]
														
 
															+                        right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
														
 
															+                    else:
														
 
															+                        # 此情况为长文本，则采用框的左右坐标的1/5为标准
														
 
															+                        standard = abs(res[0][1][0] - res[0][0][0]) // 5
														
 
															+                        # 长文本直接添加至结果输出
														
 
															+                        right_txt += res[1][0]
														
 
															+                        right_conf_list.append(res[1][1])
														
 
															+                        right_position = res[0][0][0]
														
 
															+                        right_down_position = (res[0][2][1] + res[0][3][1]) // 2
														
 
															+
														
 
															+                    for x in right_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
														
 
															+                                d - t) * 1.2 and '登记机关' not in x[1][0]:
														
 
															+                            right_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            right_txt += x[1][0]
														
 
															+                            right_conf_list.append(x[1][1])
														
 
															+                elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
														
 
															+                    right_txt = ''
														
 
															+                    t = res[0][0][1]
														
 
															+                    d = res[0][2][1]
														
 
															+                    if len(res[1][0]) == 4:
														
 
															+                        # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
														
 
															+                        standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
														
 
															+                        right_position = right_result[idx + 1][0][0][0]
														
 
															+                        right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
														
 
															+                    else:
														
 
															+                        # 此情况为长文本，则采用框的左右坐标的1/5为标准
														
 
															+                        standard = abs(res[0][1][0] - res[0][0][0]) // 2
														
 
															+                        # 长文本直接添加至结果输出
														
 
															+                        right_txt += res[1][0]
														
 
															+                        right_conf_list.append(res[1][1])
														
 
															+                        right_position = res[0][0][0]
														
 
															+                        right_down_position = (res[0][2][1] + res[0][3][1]) // 2
														
 
															+
														
 
															+                    for x in right_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
														
 
															+                                d - t) * 1.2 and '登记机关' not in x[1][0]:
														
 
															+                            right_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            right_txt += x[1][0]
														
 
															+                            right_conf_list.append(x[1][1])
														
 
															+            right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
														
 
															+            right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
														
 
															+            if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
														
 
															+                right_txt = right_txt.replace('所', '')
														
 
															+                right_txt = right_txt.replace('住', '')
														
 
															+            if len(right_conf_list):
														
 
															+                right_conf = sum(right_conf_list) / len(right_conf_list)
														
 
															+        return left_txt, left_conf, right_txt, right_conf
														
 
															+
														
 
															+
														
 
															+@dataclass
														
 
															+class BussinessParse1(object):
														
 
															+    """
														
 
															+    经营范围
														
 
															+    """
														
 
															+    ocr: PaddleOCR
														
 
															+
														
 
															+    def bs_detection(self, image, raw_results):
														
 
															+        h, w, _ = image.shape
														
 
															+        down_list = []
														
 
															+        down_list2 = []
														
 
															+        for i in raw_results:
														
 
															+            if bool(re.match(r'注册资本', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                down_list.append([x0, y1])
														
 
															+            elif bool(re.search(r'日期', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                down_list.append([x0, y1])
														
 
															+            elif bool(re.match(r'营业期限', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                down_list.append([x0, y1])
														
 
															+            elif bool(re.match(r'登记', i.txt)):
														
 
															+                [_, y0] = i.lt
														
 
															+                down_list2.append(y0)
														
 
															+
														
 
															+        t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
														
 
															+        l2 = sorted(down_list, key=lambda x: x[0])[0][0]
														
 
															+        d2 = int(down_list2[0]) if len(down_list2) else h
														
 
															+        down_img = image[int(t2): d2, int(l2): w]
														
 
															+
														
 
															+        down_result = self.ocr.ocr(down_img)
														
 
															+
														
 
															+        down_conf_list = []
														
 
															+        down_conf = 0.0
														
 
															+        down_txt = ''
														
 
															+        for idx, res in enumerate(down_result):
														
 
															+            # print(res)
														
 
															+            if len(down_result) - 1 != idx:
														
 
															+                if bool(re.match(r'经营范围', res[1][0])):
														
 
															+                    t = res[0][0][1]
														
 
															+                    d = res[0][2][1]
														
 
															+                    if len(down_result[idx - 1][1][0]) > 15 and abs(
														
 
															+                            down_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
														
 
															+                        d - t) * 1.8:
														
 
															+                        down_txt += down_result[idx - 1][1][0]
														
 
															+                    down_txt += res[1][0]
														
 
															+                    down_conf_list.append(res[1][1])
														
 
															+                    down_position = down_result[idx + 1][0][0][0]
														
 
															+                    down_down_position = (down_result[idx + 1][0][2][1] + down_result[idx + 1][0][3][1]) // 2
														
 
															+                    for x in down_result[idx + 1:]:
														
 
															+                        print(abs(down_down_position - x[0][0][1]))
														
 
															+                        print(abs(d - t) * 1.2)
														
 
															+                        if abs(x[0][0][0] - down_position) < 130 and abs(down_down_position - x[0][0][1]) <= abs(
														
 
															+                                d - t) * 1.8:
														
 
															+                            down_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            if down_txt[-1] == x[1][0][0]:
														
 
															+                                down_txt += x[1][0][1:]
														
 
															+                            else:
														
 
															+                                down_txt += x[1][0]
														
 
															+                            down_conf_list.append(x[1][1])
														
 
															+                        # print(down_txt)
														
 
															+        down_txt = down_txt.replace('经营范围', '')
														
 
															+        if len(down_conf_list):
														
 
															+            down_conf = sum(down_conf_list) / len(down_conf_list)
														
 
															+
														
 
															+        # cv2.imshow('11', down_img)
														
 
															+        # cv2.waitKey(0)
														
 
															+        return down_txt, down_conf
														
 
															+
														
 
															+    def ad_detection(self, image, raw_results):
														
 
															+        h, w, _ = image.shape
														
 
															+        top_list1 = []
														
 
															+        top_list2 = []
														
 
															+        type_key = False
														
 
															+        for i in raw_results:
														
 
															+            if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
														
 
															+                [_, y0] = i.lt
														
 
															+                top_list2.append(y0)
														
 
															+            elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                top_list1.append([x0, y1])
														
 
															+            elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
														
 
															+                type_key = True
														
 
															+                [x0, _] = i.lt
														
 
															+                [_, y1] = i.rb
														
 
															+                top_list1.append([x0, y1])
														
 
															+            elif bool(re.match(r'注册资本', i.txt)):
														
 
															+                [_, y0] = i.lt
														
 
															+                top_list2.append(y0)
														
 
															+            elif bool(re.search(r'日期', i.txt)):
														
 
															+                [_, y0] = i.lt
														
 
															+                top_list2.append(y0)
														
 
															+            elif bool(re.match(r'营业期限', i.txt)):
														
 
															+                [_, y0] = i.lt
														
 
															+                top_list2.append(y0)
														
 
															+        t1 = sorted(top_list1, key=lambda x: x[1], reverse=True)[0][1] * 0.99
														
 
															+        l1 = sorted(top_list1, key=lambda x: x[0])[0][0]
														
 
															+        d1 = sorted(top_list2)[0]
														
 
															+
														
 
															+        top_img = image[int(t1): int(d1), int(l1): w]
														
 
															+        top_result = self.ocr.ocr(top_img)
														
 
															+
														
 
															+        top_conf_list = []
														
 
															+        top_conf = 0.0
														
 
															+        top_txt = ''
														
 
															+        last_key = ''
														
 
															+        # 住所信息
														
 
															+        for idx, res in enumerate(top_result):
														
 
															+            # print(res)
														
 
															+            if bool(re.match(r'所', res[1][0])):
														
 
															+                top_txt = ''
														
 
															+                t = res[0][0][1]
														
 
															+                d = res[0][2][1]
														
 
															+                if len(last_key):
														
 
															+                    top_txt += last_key
														
 
															+                    print('top_txt', top_txt)
														
 
															+                if len(res[1][0]) == 1 and len(top_result) - 1 != idx:
														
 
															+                    top_position = top_result[idx + 1][0][0][0]
														
 
															+                    top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
														
 
															+                else:
														
 
															+                    top_txt += res[1][0]
														
 
															+                    top_conf_list.append(res[1][1])
														
 
															+                    top_position = top_result[idx][0][0][0]
														
 
															+                    top_down_position = (top_result[idx][0][2][1] + top_result[idx][0][3][1]) // 2
														
 
															+                if len(top_result) - 1 != idx:
														
 
															+                    for x in top_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - top_position) < 250 and abs(top_down_position - x[0][0][1]) <= abs(
														
 
															+                                d - t) * 1.2:
														
 
															+                            top_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            top_txt += x[1][0]
														
 
															+                            top_conf_list.append(x[1][1])
														
 
															+                # print('top_txt', top_txt)
														
 
															+            elif bool(re.match(r'住', res[1][0])):
														
 
															+                top_txt = ''
														
 
															+                t = res[0][0][1]
														
 
															+                d = res[0][2][1]
														
 
															+                if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
														
 
															+                        d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
														
 
															+                        top_result[idx - 1][1][0] and idx != 0:
														
 
															+                    last_key = top_result[idx - 1][1][0]
														
 
															+                if (len(res[1][0]) <= 2 or len(res[1][0]) == 4) and len(top_result) - 1 != idx:
														
 
															+                    # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
														
 
															+                    standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0])
														
 
															+                    top_position = top_result[idx + 1][0][0][0]
														
 
															+                    top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
														
 
															+                else:
														
 
															+                    # 此情况为长文本，则采用框的左右坐标的1/5为标准
														
 
															+                    standard = abs(res[0][1][0] - res[0][0][0]) // 5
														
 
															+                    # 长文本直接添加至结果输出
														
 
															+                    top_txt += res[1][0]
														
 
															+                    top_conf_list.append(res[1][1])
														
 
															+                    top_position = res[0][0][0]
														
 
															+                    top_down_position = (res[0][2][1] + res[0][3][1]) // 2
														
 
															+                if len(top_result) - 1 != idx:
														
 
															+                    for x in top_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
														
 
															+                                d - t) * 1.2:
														
 
															+                            top_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            top_txt += x[1][0]
														
 
															+                            top_conf_list.append(x[1][1])
														
 
															+                # print(top_txt)
														
 
															+            elif bool(re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', res[1][0])):
														
 
															+                top_txt = ''
														
 
															+                t = res[0][0][1]
														
 
															+                d = res[0][2][1]
														
 
															+                if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
														
 
															+                        d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
														
 
															+                        top_result[idx - 1][1][0] and idx != 0:
														
 
															+                    top_txt += top_result[idx - 1][1][0]
														
 
															+                if len(res[1][0]) == 4 and len(top_result) - 1 != idx:
														
 
															+                    # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
														
 
															+                    standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0]) * 2
														
 
															+                    top_position = top_result[idx + 1][0][0][0]
														
 
															+                    top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
														
 
															+                else:
														
 
															+                    # 此情况为长文本，则采用框的左右坐标的1/2为标准
														
 
															+                    standard = abs(res[0][1][0] - res[0][0][0]) // 2
														
 
															+                    # 长文本直接添加至结果输出
														
 
															+                    top_txt += res[1][0]
														
 
															+                    top_conf_list.append(res[1][1])
														
 
															+                    top_position = res[0][0][0]
														
 
															+                    top_down_position = (res[0][2][1] + res[0][3][1]) // 2
														
 
															+                if len(top_result) - 1 != idx:
														
 
															+                    for x in top_result[idx + 1:]:
														
 
															+                        if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
														
 
															+                                d - t) * 1.2:
														
 
															+                            top_down_position = (x[0][2][1] + x[0][3][1]) // 2
														
 
															+                            top_txt += x[1][0]
														
 
															+                            top_conf_list.append(x[1][1])
														
 
															+                            top_conf_list.append(x[1][1])
														
 
															+        if len(top_txt) == 0 and type_key:
														
 
															+            for res in top_result:
														
 
															+                top_txt += res[1][0]
														
 
															+        top_txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', top_txt)
														
 
															+        if bool(re.match(r'所', top_txt)) or bool(re.match(r'住', top_txt)):
														
 
															+            top_txt = top_txt.replace('所', '')
														
 
															+            top_txt = top_txt.replace('住', '')
														
 
															+        if len(top_conf_list):
														
 
															+            top_conf = sum(top_conf_list) / len(top_conf_list)
														
 
															+
														
 
															+        # cv2.imshow('11', top_img)
														
 
															+        # cv2.waitKey(0)
														
 
															+        return top_txt, top_conf
														
--- a/core/direction.py
+++ b/core/direction.py
@@ -163,11 +163,11 @@ class AngleDetector(object):
 
															     # -> angle       result(ocr生)
														
 
															     @sxtimeit
														
 
															     def detect_angle(self, img):
														
 
															-        image_type, result = self.detect_img(img)
														
 
															+        result = self.ocr.ocr(img, cls=True)
														
 
															         ocr_anchor = BusinessLicenseAnchor('营业执照', [Direction.TOP])
														
 
															         try:
														
 
															             angle = detect_angle(result, ocr_anchor)
														
 
															-            return angle, result, image_type
														
 
															+            return angle, result
														
 
															         except Exception as e:
														
 
															             print(e)
														
@@ -176,8 +176,4 @@ class AngleDetector(object):
 
															             result = self.ocr.ocr(img, cls=True)
														
 
															             angle = detect_angle(result, ocr_anchor)
														
 
															             # 旋转90度之后要重新计算角度
														
 
															-            return (angle - 1 + 4) % 4, result, image_type
														
 
															-
														
 
															-    def detect_img(self, img):
														
 
															-        result = self.ocr.ocr(img, cls=True)
														
 
															-        return 1, result
														
 
															+            return (angle - 1 + 4) % 4, result
														
--- a/core/ocr.py
+++ b/core/ocr.py
@@ -27,17 +27,21 @@ class BusinessLicenseOcr:
 
															     def predict(self, image: np.ndarray) -> ():
														
 
															         # 旋转后img angle result(生ocr)
														
 
															-        image, angle, result, image_type = self._pre_process(image)
														
 
															+        image, angle, result = self._pre_process(image)
														
 
															         print(f'---------- detect angle: {angle} 角度 --------')
														
 
															         if angle != 0:
														
 
															             _, _, result = self._ocr(image)
														
 
															+        # 判断类型
														
 
															+        image_type = self._type(result)
														
 
															+
														
 
															         # 去除 市场监督 水印
														
 
															         for i_k, i_v in enumerate(result):
														
 
															             if '市场监督' in i_v[1][0] and len(i_v[1][0]) < 7:
														
 
															                 del result[i_k]
														
 
															                 break
														
 
															+
														
 
															         return self._post_process(result, angle, image_type, image)
														
 
															     # 预处理(旋转图片)
														
@@ -45,7 +49,7 @@ class BusinessLicenseOcr:
 
															     # -> 正向的img(旋转后) 源img角度 result(ocr生)
														
 
															     def _pre_process(self, image) -> (np.ndarray, int, Any):
														
 
															         # pic角度 result(ocr生)
														
 
															-        angle, result, image_type = self.angle_detector.detect_angle(image)
														
 
															+        angle, result= self.angle_detector.detect_angle(image)
														
 
															         if angle == 1:
														
 
															             image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
														
@@ -53,7 +57,18 @@ class BusinessLicenseOcr:
 
															             image = cv2.rotate(image, cv2.ROTATE_180)
														
 
															         if angle == 3:
														
 
															             image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
														
 
															-        return image, angle, result, image_type
														
 
															+        return image, angle, result
														
 
															+
														
 
															+    def _type(self, result):
														
 
															+        anchor = False
														
 
															+        code = False
														
 
															+        for res in result:
														
 
															+            txt = res[1][0]
														
 
															+            if "营业执照" in txt:
														
 
															+                anchor = res
														
 
															+            if "统一社" in txt or "会信用" in txt or "用代码" in txt:
														
 
															+                code = res
														
 
															+        return 0 if (code and anchor) and (code[0][0][0] < anchor[0][0][0]) else 1
														
 
															     # 获取模型检测结果
														
 
															     def _ocr(self, image):
														
@@ -83,8 +98,10 @@ class BusinessLicenseOcr:
 
															         print('-------------')
														
 
															         conf = line_parser.confidence
														
 
															-        parser = BusinessLicenseParser(line_result, image, result)
														
 
															-        # if int(image_type) == 0:
														
 
															+        if image_type == 0:
														
 
															+            parser = BusinessLicenseParser0(line_result, image, result)
														
 
															+        if image_type == 1:
														
 
															+            parser = BusinessLicenseParser1(line_result, image, result)
														
 
															         ocr_res = parser.parse()
														
@@ -96,7 +113,4 @@ class BusinessLicenseOcr:
 
															         print(res)
														
 
															         return res
														
 
															-    # def _get_type(self, image) -> int:
														
 
															-
														
 
															-
														
 
															-
														
 
															+    # def _get_type(self, image) -> int:
														
--- a/core/parser.py
+++ b/core/parser.py
@@ -7,8 +7,12 @@ import cpca
 
															 import cv2
														
 
															 import numpy as np
														
 
															 import string
														
 
															+
														
 
															+from paddleocr import PaddleOCR
														
 
															 from zhon.hanzi import punctuation
														
 
															 import cn2an
														
 
															+
														
 
															+from core.business_parse import BussinessParse0, BussinessParse1
														
 
															 from core.line_parser import OcrResult
														
 
															 from core.square_parser import parser_xy
														
 
															 from stamp.d_stamp import send_request
														
@@ -76,7 +80,7 @@ class Parser(object):
 
															         return self.res
														
 
															-class BusinessLicenseParser(Parser):
														
 
															+class BusinessLicenseParser0(Parser):
														
 
															     def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
														
 
															         Parser.__init__(self, ocr_results, raw_results)
														
@@ -249,11 +253,29 @@ class BusinessLicenseParser(Parser):
 
															         """
														
 
															         经营范围
														
 
															         """
														
 
															-        sb_or: OcrResult = parser_xy(self.result, self.raw_results, '经营范围')
														
 
															-        if bool(sb_or):
														
 
															-            self.res['business_scope'] = RecItem(sb_or.txt, sb_or.conf)
														
 
															+        ocr = PaddleOCR(use_gpu=True)
														
 
															+        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(ocr).detection(self.image, self.raw_results)
														
 
															+        if bool(bs_txt):
														
 
															+            self.res['business_scope'] = RecItem(bs_txt, bs_conf)
														
 
															+
														
 
															+        add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
														
 
															+        if add_or_0:
														
 
															+            add_or = add_or_0
														
 
															         else:
														
 
															-            self.res['business_scope'] = RecItem('经营范围', random.random())
														
 
															+            add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
														
 
															+            if add_or_1:
														
 
															+                add_or = add_or_1
														
 
															+            else:
														
 
															+                return
														
 
															+        txt = add_or.txt
														
 
															+        if '所' in txt[:3] or '厂' in txt[:3]:
														
 
															+            txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
														
 
															+
														
 
															+        if len(ad_txt) >= len(txt):
														
 
															+            self.res['address'] = RecItem(ad_txt, ad_conf)
														
 
															+        else:
														
 
															+            self.res['address'] = RecItem(txt, add_or.conf)
														
 
															+
														
 
															         return
														
 
															     def address(self):  # sourcery skip: use-named-expression
														
@@ -317,6 +339,271 @@ class BusinessLicenseParser(Parser):
 
															         self.start_date()
														
 
															         self.expire_date()
														
 
															         self.business_scope()
														
 
															-        self.address()
														
 
															+        # self.address()
														
 
															         self.stamp()
														
 
															         return {key: self.res[key].to_dict() for key in self.keys}
														
 
															+
														
 
															+
														
 
															+class BusinessLicenseParser1(Parser):
														
 
															+
														
 
															+    def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
														
 
															+        Parser.__init__(self, ocr_results, raw_results)
														
 
															+        self.image = image
														
 
															+        self.ocr = PaddleOCR(use_gpu=True)
														
 
															+
														
 
															+    def social_code(self):
														
 
															+        """
														
 
															+        社会信用代码
														
 
															+        """
														
 
															+        # 得在"营业执照"以下
														
 
															+        result = []
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            if "统一社" in txt or "会信用" in txt or "用代码" in txt:
														
 
															+                result = self.result[i:]
														
 
															+                break
														
 
															+
														
 
															+        for i in range(len(result)):
														
 
															+            res = result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            code = ''.join(re.findall(u'[\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]{15,18}', txt))
														
 
															+            if len(code):
														
 
															+                self.res['social_code'] = RecItem(code, conf)
														
 
															+                return
														
 
															+
														
 
															+    def company_name(self):
														
 
															+        """
														
 
															+        公司名称
														
 
															+        """
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            if '称尔' in txt: txt = txt.replace('称尔', '称')
														
 
															+            if '名' in txt[:4] and '称' in txt[:4]:
														
 
															+                txt = '名称' + txt.split('称')[-1]
														
 
															+
														
 
															+            if '名称' in txt:
														
 
															+                company_name = txt.split('名称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
														
 
															+                self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
														
 
															+                return
														
 
															+            if '称' in txt and txt[0] == '称' and len(txt) > 5:
														
 
															+                company_name = txt.split('称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
														
 
															+                self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
														
 
															+                return
														
 
															+
														
 
															+    def legal_person(self):
														
 
															+        """
														
 
															+        法人姓名
														
 
															+        """
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0].replace('市场监督', '')
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            if '法定代表人' in txt or '代表人' in txt:
														
 
															+                legal_person = txt.split('代表人')[-1].split('营业')[0]
														
 
															+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
														
 
															+                return
														
 
															+            if '经营者' in txt:
														
 
															+                legal_person = txt.split('经营者')[-1].split('经营')[0]
														
 
															+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
														
 
															+                return
														
 
															+            if '负责人' in txt:
														
 
															+                legal_person = txt.split('负责人')[-1].split('责人')[0]
														
 
															+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
														
 
															+                return
														
 
															+
														
 
															+    def registered_capital(self):
														
 
															+        """
														
 
															+        注册资本
														
 
															+        """
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            txt = fix_text(txt)
														
 
															+
														
 
															+            if '注册资本' in txt:
														
 
															+                if '人民币' in txt[:4]:
														
 
															+                    registered_capital = txt.split('人民币')[-1].split('万元')[0]
														
 
															+                    txt = f'人民币{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
														
 
															+                elif '美元' in txt[:4]:
														
 
															+                    registered_capital = txt.split('美元')[-1].split('万元')[0]
														
 
															+                    txt = f'美元{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
														
 
															+                elif '人民币' in txt[-4:]:
														
 
															+                    registered_capital = txt.split('资本')[-1].split('人民币')[0].split('万元')[0]
														
 
															+                    txt = f'{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}人民币'
														
 
															+                else:
														
 
															+                    registered_capital = txt.split('资本')[-1].split('币')[-1].split('万')[0]
														
 
															+                    txt = self.cn_to_an(fix_text(clear_punctuation(registered_capital)))
														
 
															+
														
 
															+                self.res['registered_capital'] = RecItem(txt, conf)
														
 
															+                return
														
 
															+
														
 
															+    def type(self):  # sourcery skip: hoist-similar-statement-from-if
														
 
															+        """
														
 
															+        类型
														
 
															+        """
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            txt = fix_text(clear_punctuation(txt))
														
 
															+
														
 
															+            if '类型' in txt:
														
 
															+                txt = txt.split('类型')[-1].split('成立')[0].split('注册')[0]
														
 
															+                if '公司' in txt:
														
 
															+                    t_s, s_e = txt.split('公司')[0], txt.split('公司')[-1].replace('(', '').replace(')', '').replace('（',
														
 
															+                                                                                                                 '').replace(
														
 
															+                        '）', '')
														
 
															+
														
 
															+                    # 分公司
														
 
															+                    if '分公司' in txt:
														
 
															+                        t_s = f'{t_s}公司分'
														
 
															+
														
 
															+                    txt = f'{t_s}公司（{s_e}）' if s_e else f'{t_s}公司'
														
 
															+
														
 
															+                if txt[0] == '型': txt = txt[1:]
														
 
															+                self.res['type'] = RecItem(txt, conf)
														
 
															+                return
														
 
															+
														
 
															+    def start_date(self):
														
 
															+        """
														
 
															+        成立日期 ⚠️ 注册日期
														
 
															+        """
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            txt = fix_text(txt)
														
 
															+
														
 
															+            if '日期' in txt:
														
 
															+                txt = txt.split('日期')[-1]
														
 
															+                date = self.to_date(txt)
														
 
															+                self.res['start_date'] = RecItem(date, conf)
														
 
															+
														
 
															+    def expire_date(self):  # sourcery skip: hoist-similar-statement-from-if
														
 
															+        """
														
 
															+        有效期
														
 
															+        """
														
 
															+        for i in range(len(self.result)):
														
 
															+            res = self.result[i]
														
 
															+            txt = res[-1][0]
														
 
															+            conf = res[-1][1]
														
 
															+
														
 
															+            if '期限' in txt:
														
 
															+                if '至' in txt:
														
 
															+                    txt = ''.join(txt.split('期限')[1:]).replace('*', '')
														
 
															+                    date_from = txt.split('至')[0]
														
 
															+                    date_to = txt.split('至')[-1]
														
 
															+                    date = f'{self.to_date(date_from)} 至 {self.to_date(date_to)}'
														
 
															+                    self.res['expire_date'] = RecItem(date, conf)
														
 
															+                    return
														
 
															+                if '长期' in txt:
														
 
															+                    self.res['expire_date'] = RecItem('长期', conf)
														
 
															+                    return
														
 
															+                else:
														
 
															+                    self.res['expire_date'] = RecItem('', conf)
														
 
															+                    return
														
 
															+
														
 
															+    def business_scope(self):
														
 
															+        """
														
 
															+        经营范围
														
 
															+        """
														
 
															+        print('-------------经营范围处理开始--------------')
														
 
															+
														
 
															+        bs_txt, bs_conf = BussinessParse1(self.ocr).bs_detection(self.image, self.raw_results)
														
 
															+
														
 
															+        if bool(bs_txt):
														
 
															+            self.res['business_scope'] = RecItem(bs_txt, bs_conf)
														
 
															+
														
 
															+        # sb_or: OcrResult = parser_xy(self.result, self.raw_results, '经营范围')
														
 
															+        # if bool(sb_or):
														
 
															+        #     self.res['business_scope'] = RecItem(sb_or.txt, sb_or.conf)
														
 
															+        # else:
														
 
															+        #     self.res['business_scope'] = RecItem('经营范围', random.random())
														
 
															+
														
 
															+        print('-------------经营范围处理结束--------------')
														
 
															+
														
 
															+        return
														
 
															+
														
 
															+    def address(self):  # sourcery skip: use-named-expression
														
 
															+        """
														
 
															+        住所
														
 
															+        """
														
 
															+        # 切割方案
														
 
															+        ad_txt, ad_conf = BussinessParse1(self.ocr).ad_detection(self.image, self.raw_results)
														
 
															+        # 关键字方案
														
 
															+        add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
														
 
															+        if add_or_0:
														
 
															+            add_or = add_or_0
														
 
															+        else:
														
 
															+            add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
														
 
															+            if add_or_1:
														
 
															+                add_or = add_or_1
														
 
															+            else:
														
 
															+                return
														
 
															+
														
 
															+        txt = add_or.txt
														
 
															+
														
 
															+        if '所' in txt[:3] or '厂' in txt[:3]:
														
 
															+            txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
														
 
															+
														
 
															+        if len(ad_txt) >= len(txt):
														
 
															+            self.res['address'] = RecItem(ad_txt, ad_conf)
														
 
															+        else:
														
 
															+            self.res['address'] = RecItem(txt, add_or.conf)
														
 
															+
														
 
															+        return
														
 
															+
														
 
															+    def stamp(self):
														
 
															+        """
														
 
															+        印章检测
														
 
															+        """
														
 
															+        self.res['stamp'] = RecItem(send_request(self.image), 1.)
														
 
															+        return
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def cn_to_an(num):
														
 
															+        try:
														
 
															+            num = int(num)
														
 
															+        except ValueError:
														
 
															+            num = str(cn2an.cn2an(f'{num}万'))[:-4]
														
 
															+        except Exception:
														
 
															+            raise Exception('注册资本转化出错')
														
 
															+        finally:
														
 
															+            return f'{num}万元'
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def to_date(txt):
														
 
															+        if '长期' in txt: return '长期'
														
 
															+        if '永久' in txt: return '永久'
														
 
															+        if '不约定' in txt: return '不约定期限'
														
 
															+        date_in = re.findall(r"\d+", txt)
														
 
															+        if len(date_in) == 3:
														
 
															+            return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2]}日'
														
 
															+        else:
														
 
															+            return ''
														
 
															+
														
 
															+    # ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
														
 
															+    #  "business_scope", 'expire_date', 'address', 'stamp']
														
 
															+    def parse(self):
														
 
															+        self.social_code()
														
 
															+        self.company_name()
														
 
															+        self.legal_person()
														
 
															+        self.registered_capital()
														
 
															+        self.type()
														
 
															+        self.start_date()
														
 
															+        self.expire_date()
														
 
															+        self.business_scope()
														
 
															+        self.address()
														
 
															+        self.stamp()
														
 
															+        return {key: self.res[key].to_dict() for key in self.keys}
														
--- a/core/square_parser.py
+++ b/core/square_parser.py
@@ -122,14 +122,12 @@ def get_key_other_or(res_raw_list, key_heard: OcrResult, key_title):
 
															 def parser_xy(res_line, res_raw, key):
														
 
															     # 在 res_line 中找到 key 对应的坐标
														
 
															-    print('res_line', res_line)
														
 
															     key_row = []
														
 
															     for row in res_line:
														
 
															         print(row[-1])
														
 
															         if key in row[-1][0]:
														
 
															             key_row = row
														
 
															             break
														
 
															-    print(key_row)
														
 
															     if not bool(key_row): return
														
 
															     key_heard, key_title = get_key_fist_line(key_row, key)
														
 
															     return get_key_other_or(res_raw, key_heard, key_title)