il y a 2 ans · e16414b390
--- a/core/business_parse.py
+++ b/core/business_parse.py
@@ -0,0 +1,372 @@
 
				+import re
			
 
				+from dataclasses import dataclass
			
 
				+import cv2
			
 
				+from paddleocr import PaddleOCR
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class BussinessParse0(object):
			
 
				+    """
			
 
				+    经营范围
			
 
				+    """
			
 
				+    ocr: PaddleOCR
			
 
				+
			
 
				+    def detection(self, image, raw_results):
			
 
				+        h, w, _ = image.shape
			
 
				+        left_list = []
			
 
				+        right_list = []
			
 
				+        for i in raw_results:
			
 
				+            if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'经营者', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                left_list.append([x0, y1])
			
 
				+            elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                left_list.append([x0, y1])
			
 
				+            elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                left_list.append([x0, y1])
			
 
				+            elif bool(re.match(r'注册', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                right_list.append([x0, y1])
			
 
				+            elif bool(re.search(r'日期', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                right_list.append([x0, y1])
			
 
				+            elif bool(re.match(r'营业期限', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                right_list.append([x0, y1])
			
 
				+        t1 = sorted(left_list, key=lambda x: x[1], reverse=True)[0][1]
			
 
				+        t2 = sorted(right_list, key=lambda x: x[1], reverse=True)[0][1]
			
 
				+        l1 = sorted(left_list, key=lambda x: x[0])[0][0]
			
 
				+        r1 = sorted(right_list, key=lambda x: x[0])[0][0]
			
 
				+
			
 
				+        left_img = image[int(t1): h, int(l1): int(r1)]
			
 
				+        right_img = image[int(t2): h, int(r1): w]
			
 
				+        left_result = self.ocr.ocr(left_img)
			
 
				+        right_result = self.ocr.ocr(right_img)
			
 
				+
			
 
				+        left_conf_list = []
			
 
				+        right_conf_list = []
			
 
				+        left_conf = 0.0
			
 
				+        right_conf = 0.0
			
 
				+        left_txt = ''
			
 
				+        right_txt = ''
			
 
				+        for idx, res in enumerate(left_result):
			
 
				+            if len(left_result) - 1 != idx:
			
 
				+                if bool(re.match(r'经营范围', res[1][0])):
			
 
				+                    t = res[0][0][1]
			
 
				+                    d = res[0][2][1]
			
 
				+                    # 判断上一条信息是否为经营范围内容
			
 
				+                    if len(left_result[idx - 1][1][0]) > 15 and abs(left_result[idx - 1][0][2][1] - res[0][0][1]) < abs(
			
 
				+                            d - t) * 1.8:
			
 
				+                        left_txt += left_result[idx - 1][1][0]
			
 
				+
			
 
				+                    left_txt += res[1][0]
			
 
				+                    left_conf_list.append(res[1][1])
			
 
				+
			
 
				+                    left_position = left_result[idx + 1][0][0][0]
			
 
				+                    left_down_position = (left_result[idx + 1][0][2][1] + left_result[idx + 1][0][3][1]) // 2
			
 
				+                    for x in left_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - left_position) < 130 and abs(left_down_position - x[0][0][1]) < abs(d - t):
			
 
				+                            left_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            if left_txt[-1] == x[1][0][0]:
			
 
				+                                left_txt += x[1][0][1:]
			
 
				+                            else:
			
 
				+                                left_txt += x[1][0]
			
 
				+                            left_conf_list.append(x[1][1])
			
 
				+                # filter_res = filter(lambda x: abs(x[0][0][0] - left_position) < 50, left_result[idx + 1:])
			
 
				+            left_txt = left_txt.replace('经营范围', '')
			
 
				+            if len(left_conf_list):
			
 
				+                left_conf = sum(left_conf_list) / len(left_conf_list)
			
 
				+        # 住所信息
			
 
				+        for idx, res in enumerate(right_result):
			
 
				+            if len(right_result) - 1 != idx:
			
 
				+                if bool(re.match(r'所', res[1][0])):
			
 
				+                    right_txt = ''
			
 
				+                    t = res[0][0][1]
			
 
				+                    d = res[0][2][1]
			
 
				+                    if len(res[1][0]) == 1:
			
 
				+                        right_position = right_result[idx + 1][0][0][0]
			
 
				+                        right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
			
 
				+                    else:
			
 
				+                        right_txt += res[1][0]
			
 
				+                        right_conf_list.append(res[1][1])
			
 
				+                        right_position = right_result[idx][0][0][0]
			
 
				+                        right_down_position = (right_result[idx][0][2][1] + right_result[idx][0][3][1]) // 2
			
 
				+                    for x in right_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - right_position) < 250 and abs(right_down_position - x[0][0][1]) < abs(
			
 
				+                                d - t) * 1.2 and '登记机关' not in x[1][0]:
			
 
				+                            right_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            right_txt += x[1][0]
			
 
				+                            right_conf_list.append(x[1][1])
			
 
				+                elif bool(re.match(r'住', res[1][0])):
			
 
				+                    right_txt = ''
			
 
				+                    t = res[0][0][1]
			
 
				+                    d = res[0][2][1]
			
 
				+
			
 
				+                    if len(res[1][0]) <= 2 or len(res[1][0]) == 4:
			
 
				+                        # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
			
 
				+                        standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0])
			
 
				+                        right_position = right_result[idx + 1][0][0][0]
			
 
				+                        right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
			
 
				+                    else:
			
 
				+                        # 此情况为长文本，则采用框的左右坐标的1/5为标准
			
 
				+                        standard = abs(res[0][1][0] - res[0][0][0]) // 5
			
 
				+                        # 长文本直接添加至结果输出
			
 
				+                        right_txt += res[1][0]
			
 
				+                        right_conf_list.append(res[1][1])
			
 
				+                        right_position = res[0][0][0]
			
 
				+                        right_down_position = (res[0][2][1] + res[0][3][1]) // 2
			
 
				+
			
 
				+                    for x in right_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
			
 
				+                                d - t) * 1.2 and '登记机关' not in x[1][0]:
			
 
				+                            right_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            right_txt += x[1][0]
			
 
				+                            right_conf_list.append(x[1][1])
			
 
				+                elif bool(re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', res[1][0])):
			
 
				+                    right_txt = ''
			
 
				+                    t = res[0][0][1]
			
 
				+                    d = res[0][2][1]
			
 
				+                    if len(res[1][0]) == 4:
			
 
				+                        # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
			
 
				+                        standard = abs(res[0][1][0] - right_result[idx + 1][0][0][0]) * 2
			
 
				+                        right_position = right_result[idx + 1][0][0][0]
			
 
				+                        right_down_position = (right_result[idx + 1][0][2][1] + right_result[idx + 1][0][3][1]) // 2
			
 
				+                    else:
			
 
				+                        # 此情况为长文本，则采用框的左右坐标的1/5为标准
			
 
				+                        standard = abs(res[0][1][0] - res[0][0][0]) // 2
			
 
				+                        # 长文本直接添加至结果输出
			
 
				+                        right_txt += res[1][0]
			
 
				+                        right_conf_list.append(res[1][1])
			
 
				+                        right_position = res[0][0][0]
			
 
				+                        right_down_position = (res[0][2][1] + res[0][3][1]) // 2
			
 
				+
			
 
				+                    for x in right_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - right_position) < standard and abs(right_down_position - x[0][0][1]) < abs(
			
 
				+                                d - t) * 1.2 and '登记机关' not in x[1][0]:
			
 
				+                            right_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            right_txt += x[1][0]
			
 
				+                            right_conf_list.append(x[1][1])
			
 
				+            right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
			
 
				+            right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
			
 
				+            if bool(re.match(r'所', right_txt)) or bool(re.match(r'住', right_txt)):
			
 
				+                right_txt = right_txt.replace('所', '')
			
 
				+                right_txt = right_txt.replace('住', '')
			
 
				+            if len(right_conf_list):
			
 
				+                right_conf = sum(right_conf_list) / len(right_conf_list)
			
 
				+        return left_txt, left_conf, right_txt, right_conf
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class BussinessParse1(object):
			
 
				+    """
			
 
				+    经营范围
			
 
				+    """
			
 
				+    ocr: PaddleOCR
			
 
				+
			
 
				+    def bs_detection(self, image, raw_results):
			
 
				+        h, w, _ = image.shape
			
 
				+        down_list = []
			
 
				+        down_list2 = []
			
 
				+        for i in raw_results:
			
 
				+            if bool(re.match(r'注册资本', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                down_list.append([x0, y1])
			
 
				+            elif bool(re.search(r'日期', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                down_list.append([x0, y1])
			
 
				+            elif bool(re.match(r'营业期限', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                down_list.append([x0, y1])
			
 
				+            elif bool(re.match(r'登记', i.txt)):
			
 
				+                [_, y0] = i.lt
			
 
				+                down_list2.append(y0)
			
 
				+
			
 
				+        t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
			
 
				+        l2 = sorted(down_list, key=lambda x: x[0])[0][0]
			
 
				+        d2 = int(down_list2[0]) if len(down_list2) else h
			
 
				+        down_img = image[int(t2): d2, int(l2): w]
			
 
				+
			
 
				+        down_result = self.ocr.ocr(down_img)
			
 
				+
			
 
				+        down_conf_list = []
			
 
				+        down_conf = 0.0
			
 
				+        down_txt = ''
			
 
				+        for idx, res in enumerate(down_result):
			
 
				+            # print(res)
			
 
				+            if len(down_result) - 1 != idx:
			
 
				+                if bool(re.match(r'经营范围', res[1][0])):
			
 
				+                    t = res[0][0][1]
			
 
				+                    d = res[0][2][1]
			
 
				+                    if len(down_result[idx - 1][1][0]) > 15 and abs(
			
 
				+                            down_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
			
 
				+                        d - t) * 1.8:
			
 
				+                        down_txt += down_result[idx - 1][1][0]
			
 
				+                    down_txt += res[1][0]
			
 
				+                    down_conf_list.append(res[1][1])
			
 
				+                    down_position = down_result[idx + 1][0][0][0]
			
 
				+                    down_down_position = (down_result[idx + 1][0][2][1] + down_result[idx + 1][0][3][1]) // 2
			
 
				+                    for x in down_result[idx + 1:]:
			
 
				+                        print(abs(down_down_position - x[0][0][1]))
			
 
				+                        print(abs(d - t) * 1.2)
			
 
				+                        if abs(x[0][0][0] - down_position) < 130 and abs(down_down_position - x[0][0][1]) <= abs(
			
 
				+                                d - t) * 1.8:
			
 
				+                            down_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            if down_txt[-1] == x[1][0][0]:
			
 
				+                                down_txt += x[1][0][1:]
			
 
				+                            else:
			
 
				+                                down_txt += x[1][0]
			
 
				+                            down_conf_list.append(x[1][1])
			
 
				+                        # print(down_txt)
			
 
				+        down_txt = down_txt.replace('经营范围', '')
			
 
				+        if len(down_conf_list):
			
 
				+            down_conf = sum(down_conf_list) / len(down_conf_list)
			
 
				+
			
 
				+        # cv2.imshow('11', down_img)
			
 
				+        # cv2.waitKey(0)
			
 
				+        return down_txt, down_conf
			
 
				+
			
 
				+    def ad_detection(self, image, raw_results):
			
 
				+        h, w, _ = image.shape
			
 
				+        top_list1 = []
			
 
				+        top_list2 = []
			
 
				+        type_key = False
			
 
				+        for i in raw_results:
			
 
				+            if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
			
 
				+                [_, y0] = i.lt
			
 
				+                top_list2.append(y0)
			
 
				+            elif bool(re.match(r'名', i.txt)) or bool(re.match(r'称', i.txt)):
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                top_list1.append([x0, y1])
			
 
				+            elif bool(re.match(r'类', i.txt)) or bool(re.match(r'型', i.txt)):
			
 
				+                type_key = True
			
 
				+                [x0, _] = i.lt
			
 
				+                [_, y1] = i.rb
			
 
				+                top_list1.append([x0, y1])
			
 
				+            elif bool(re.match(r'注册资本', i.txt)):
			
 
				+                [_, y0] = i.lt
			
 
				+                top_list2.append(y0)
			
 
				+            elif bool(re.search(r'日期', i.txt)):
			
 
				+                [_, y0] = i.lt
			
 
				+                top_list2.append(y0)
			
 
				+            elif bool(re.match(r'营业期限', i.txt)):
			
 
				+                [_, y0] = i.lt
			
 
				+                top_list2.append(y0)
			
 
				+        t1 = sorted(top_list1, key=lambda x: x[1], reverse=True)[0][1] * 0.99
			
 
				+        l1 = sorted(top_list1, key=lambda x: x[0])[0][0]
			
 
				+        d1 = sorted(top_list2)[0]
			
 
				+
			
 
				+        top_img = image[int(t1): int(d1), int(l1): w]
			
 
				+        top_result = self.ocr.ocr(top_img)
			
 
				+
			
 
				+        top_conf_list = []
			
 
				+        top_conf = 0.0
			
 
				+        top_txt = ''
			
 
				+        last_key = ''
			
 
				+        # 住所信息
			
 
				+        for idx, res in enumerate(top_result):
			
 
				+            # print(res)
			
 
				+            if bool(re.match(r'所', res[1][0])):
			
 
				+                top_txt = ''
			
 
				+                t = res[0][0][1]
			
 
				+                d = res[0][2][1]
			
 
				+                if len(last_key):
			
 
				+                    top_txt += last_key
			
 
				+                    print('top_txt', top_txt)
			
 
				+                if len(res[1][0]) == 1 and len(top_result) - 1 != idx:
			
 
				+                    top_position = top_result[idx + 1][0][0][0]
			
 
				+                    top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
			
 
				+                else:
			
 
				+                    top_txt += res[1][0]
			
 
				+                    top_conf_list.append(res[1][1])
			
 
				+                    top_position = top_result[idx][0][0][0]
			
 
				+                    top_down_position = (top_result[idx][0][2][1] + top_result[idx][0][3][1]) // 2
			
 
				+                if len(top_result) - 1 != idx:
			
 
				+                    for x in top_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - top_position) < 250 and abs(top_down_position - x[0][0][1]) <= abs(
			
 
				+                                d - t) * 1.2:
			
 
				+                            top_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            top_txt += x[1][0]
			
 
				+                            top_conf_list.append(x[1][1])
			
 
				+                # print('top_txt', top_txt)
			
 
				+            elif bool(re.match(r'住', res[1][0])):
			
 
				+                top_txt = ''
			
 
				+                t = res[0][0][1]
			
 
				+                d = res[0][2][1]
			
 
				+                if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
			
 
				+                        d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
			
 
				+                        top_result[idx - 1][1][0] and idx != 0:
			
 
				+                    last_key = top_result[idx - 1][1][0]
			
 
				+                if (len(res[1][0]) <= 2 or len(res[1][0]) == 4) and len(top_result) - 1 != idx:
			
 
				+                    # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
			
 
				+                    standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0])
			
 
				+                    top_position = top_result[idx + 1][0][0][0]
			
 
				+                    top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
			
 
				+                else:
			
 
				+                    # 此情况为长文本，则采用框的左右坐标的1/5为标准
			
 
				+                    standard = abs(res[0][1][0] - res[0][0][0]) // 5
			
 
				+                    # 长文本直接添加至结果输出
			
 
				+                    top_txt += res[1][0]
			
 
				+                    top_conf_list.append(res[1][1])
			
 
				+                    top_position = res[0][0][0]
			
 
				+                    top_down_position = (res[0][2][1] + res[0][3][1]) // 2
			
 
				+                if len(top_result) - 1 != idx:
			
 
				+                    for x in top_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
			
 
				+                                d - t) * 1.2:
			
 
				+                            top_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            top_txt += x[1][0]
			
 
				+                            top_conf_list.append(x[1][1])
			
 
				+                # print(top_txt)
			
 
				+            elif bool(re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', res[1][0])):
			
 
				+                top_txt = ''
			
 
				+                t = res[0][0][1]
			
 
				+                d = res[0][2][1]
			
 
				+                if len(top_result[idx - 1][1][0]) > 15 and abs(top_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
			
 
				+                        d - t) * 1.8 and '有限' not in top_result[idx - 1][1][0] and '型' not in \
			
 
				+                        top_result[idx - 1][1][0] and idx != 0:
			
 
				+                    top_txt += top_result[idx - 1][1][0]
			
 
				+                if len(res[1][0]) == 4 and len(top_result) - 1 != idx:
			
 
				+                    # 若‘住所’或‘经营场所’为独立框，则采用独立框的右坐标与下一个检测框的左坐标的差值为标准
			
 
				+                    standard = abs(res[0][1][0] - top_result[idx + 1][0][0][0]) * 2
			
 
				+                    top_position = top_result[idx + 1][0][0][0]
			
 
				+                    top_down_position = (top_result[idx + 1][0][2][1] + top_result[idx + 1][0][3][1]) // 2
			
 
				+                else:
			
 
				+                    # 此情况为长文本，则采用框的左右坐标的1/2为标准
			
 
				+                    standard = abs(res[0][1][0] - res[0][0][0]) // 2
			
 
				+                    # 长文本直接添加至结果输出
			
 
				+                    top_txt += res[1][0]
			
 
				+                    top_conf_list.append(res[1][1])
			
 
				+                    top_position = res[0][0][0]
			
 
				+                    top_down_position = (res[0][2][1] + res[0][3][1]) // 2
			
 
				+                if len(top_result) - 1 != idx:
			
 
				+                    for x in top_result[idx + 1:]:
			
 
				+                        if abs(x[0][0][0] - top_position) < standard and abs(top_down_position - x[0][0][1]) <= abs(
			
 
				+                                d - t) * 1.2:
			
 
				+                            top_down_position = (x[0][2][1] + x[0][3][1]) // 2
			
 
				+                            top_txt += x[1][0]
			
 
				+                            top_conf_list.append(x[1][1])
			
 
				+                            top_conf_list.append(x[1][1])
			
 
				+        if len(top_txt) == 0 and type_key:
			
 
				+            for res in top_result:
			
 
				+                top_txt += res[1][0]
			
 
				+        top_txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', top_txt)
			
 
				+        if bool(re.match(r'所', top_txt)) or bool(re.match(r'住', top_txt)):
			
 
				+            top_txt = top_txt.replace('所', '')
			
 
				+            top_txt = top_txt.replace('住', '')
			
 
				+        if len(top_conf_list):
			
 
				+            top_conf = sum(top_conf_list) / len(top_conf_list)
			
 
				+
			
 
				+        # cv2.imshow('11', top_img)
			
 
				+        # cv2.waitKey(0)
			
 
				+        return top_txt, top_conf
			
--- a/core/direction.py
+++ b/core/direction.py
@@ -163,11 +163,11 @@ class AngleDetector(object):
 
				     # -> angle       result(ocr生)
			
 
				     @sxtimeit
			
 
				     def detect_angle(self, img):
			
 
				-        image_type, result = self.detect_img(img)
			
 
				+        result = self.ocr.ocr(img, cls=True)
			
 
				         ocr_anchor = BusinessLicenseAnchor('营业执照', [Direction.TOP])
			
 
				         try:
			
 
				             angle = detect_angle(result, ocr_anchor)
			
 
				-            return angle, result, image_type
			
 
				+            return angle, result
			
 
				 
			
 
				         except Exception as e:
			
 
				             print(e)
			
@@ -176,8 +176,4 @@ class AngleDetector(object):
 
				             result = self.ocr.ocr(img, cls=True)
			
 
				             angle = detect_angle(result, ocr_anchor)
			
 
				             # 旋转90度之后要重新计算角度
			
 
				-            return (angle - 1 + 4) % 4, result, image_type
			
 
				-
			
 
				-    def detect_img(self, img):
			
 
				-        result = self.ocr.ocr(img, cls=True)
			
 
				-        return 1, result
			
 
				+            return (angle - 1 + 4) % 4, result
			
--- a/core/ocr.py
+++ b/core/ocr.py
@@ -27,17 +27,21 @@ class BusinessLicenseOcr:
 
				     def predict(self, image: np.ndarray) -> ():
			
 
				 
			
 
				         # 旋转后img angle result(生ocr)
			
 
				-        image, angle, result, image_type = self._pre_process(image)
			
 
				+        image, angle, result = self._pre_process(image)
			
 
				         print(f'---------- detect angle: {angle} 角度 --------')
			
 
				         if angle != 0:
			
 
				             _, _, result = self._ocr(image)
			
 
				 
			
 
				+        # 判断类型
			
 
				+        image_type = self._type(result)
			
 
				+
			
 
				         # 去除 市场监督 水印
			
 
				         for i_k, i_v in enumerate(result):
			
 
				             if '市场监督' in i_v[1][0] and len(i_v[1][0]) < 7:
			
 
				                 del result[i_k]
			
 
				                 break
			
 
				 
			
 
				+
			
 
				         return self._post_process(result, angle, image_type, image)
			
 
				 
			
 
				     # 预处理(旋转图片)
			
@@ -45,7 +49,7 @@ class BusinessLicenseOcr:
 
				     # -> 正向的img(旋转后) 源img角度 result(ocr生)
			
 
				     def _pre_process(self, image) -> (np.ndarray, int, Any):
			
 
				         # pic角度 result(ocr生)
			
 
				-        angle, result, image_type = self.angle_detector.detect_angle(image)
			
 
				+        angle, result= self.angle_detector.detect_angle(image)
			
 
				 
			
 
				         if angle == 1:
			
 
				             image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
			
@@ -53,7 +57,18 @@ class BusinessLicenseOcr:
 
				             image = cv2.rotate(image, cv2.ROTATE_180)
			
 
				         if angle == 3:
			
 
				             image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
			
 
				-        return image, angle, result, image_type
			
 
				+        return image, angle, result
			
 
				+
			
 
				+    def _type(self, result):
			
 
				+        anchor = False
			
 
				+        code = False
			
 
				+        for res in result:
			
 
				+            txt = res[1][0]
			
 
				+            if "营业执照" in txt:
			
 
				+                anchor = res
			
 
				+            if "统一社" in txt or "会信用" in txt or "用代码" in txt:
			
 
				+                code = res
			
 
				+        return 0 if (code and anchor) and (code[0][0][0] < anchor[0][0][0]) else 1
			
 
				 
			
 
				     # 获取模型检测结果
			
 
				     def _ocr(self, image):
			
@@ -83,8 +98,10 @@ class BusinessLicenseOcr:
 
				         print('-------------')
			
 
				         conf = line_parser.confidence
			
 
				 
			
 
				-        parser = BusinessLicenseParser(line_result, image, result)
			
 
				-        # if int(image_type) == 0:
			
 
				+        if image_type == 0:
			
 
				+            parser = BusinessLicenseParser0(line_result, image, result)
			
 
				+        if image_type == 1:
			
 
				+            parser = BusinessLicenseParser1(line_result, image, result)
			
 
				 
			
 
				         ocr_res = parser.parse()
			
 
				 
			
@@ -96,7 +113,4 @@ class BusinessLicenseOcr:
 
				         print(res)
			
 
				         return res
			
 
				 
			
 
				-    # def _get_type(self, image) -> int:
			
 
				-
			
 
				-
			
 
				-
			
 
				+    # def _get_type(self, image) -> int:
			
--- a/core/parser.py
+++ b/core/parser.py
@@ -7,8 +7,12 @@ import cpca
 
				 import cv2
			
 
				 import numpy as np
			
 
				 import string
			
 
				+
			
 
				+from paddleocr import PaddleOCR
			
 
				 from zhon.hanzi import punctuation
			
 
				 import cn2an
			
 
				+
			
 
				+from core.business_parse import BussinessParse0, BussinessParse1
			
 
				 from core.line_parser import OcrResult
			
 
				 from core.square_parser import parser_xy
			
 
				 from stamp.d_stamp import send_request
			
@@ -76,7 +80,7 @@ class Parser(object):
 
				         return self.res
			
 
				 
			
 
				 
			
 
				-class BusinessLicenseParser(Parser):
			
 
				+class BusinessLicenseParser0(Parser):
			
 
				 
			
 
				     def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
			
 
				         Parser.__init__(self, ocr_results, raw_results)
			
@@ -249,11 +253,29 @@ class BusinessLicenseParser(Parser):
 
				         """
			
 
				         经营范围
			
 
				         """
			
 
				-        sb_or: OcrResult = parser_xy(self.result, self.raw_results, '经营范围')
			
 
				-        if bool(sb_or):
			
 
				-            self.res['business_scope'] = RecItem(sb_or.txt, sb_or.conf)
			
 
				+        ocr = PaddleOCR(use_gpu=True)
			
 
				+        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(ocr).detection(self.image, self.raw_results)
			
 
				+        if bool(bs_txt):
			
 
				+            self.res['business_scope'] = RecItem(bs_txt, bs_conf)
			
 
				+
			
 
				+        add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
			
 
				+        if add_or_0:
			
 
				+            add_or = add_or_0
			
 
				         else:
			
 
				-            self.res['business_scope'] = RecItem('经营范围', random.random())
			
 
				+            add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
			
 
				+            if add_or_1:
			
 
				+                add_or = add_or_1
			
 
				+            else:
			
 
				+                return
			
 
				+        txt = add_or.txt
			
 
				+        if '所' in txt[:3] or '厂' in txt[:3]:
			
 
				+            txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
			
 
				+
			
 
				+        if len(ad_txt) >= len(txt):
			
 
				+            self.res['address'] = RecItem(ad_txt, ad_conf)
			
 
				+        else:
			
 
				+            self.res['address'] = RecItem(txt, add_or.conf)
			
 
				+
			
 
				         return
			
 
				 
			
 
				     def address(self):  # sourcery skip: use-named-expression
			
@@ -317,6 +339,271 @@ class BusinessLicenseParser(Parser):
 
				         self.start_date()
			
 
				         self.expire_date()
			
 
				         self.business_scope()
			
 
				-        self.address()
			
 
				+        # self.address()
			
 
				         self.stamp()
			
 
				         return {key: self.res[key].to_dict() for key in self.keys}
			
 
				+
			
 
				+
			
 
				+class BusinessLicenseParser1(Parser):
			
 
				+
			
 
    def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
        """Prepare a parser for one business-license image.

        ``ocr_results`` and ``raw_results`` are handed unchanged to the base
        ``Parser`` initializer. ``image`` is stored on the instance, and one
        GPU-enabled PaddleOCR engine is created per parser instance.
        """
        super().__init__(ocr_results, raw_results)
        self.image = image
        # Built once here so the field extractors can share a single engine.
        self.ocr = PaddleOCR(use_gpu=True)
			
 
				+
			
 
				+    def social_code(self):
			
 
				+        """
			
 
				+        社会信用代码
			
 
				+        """
			
 
				+        # 得在"营业执照"以下
			
 
				+        result = []
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            if "统一社" in txt or "会信用" in txt or "用代码" in txt:
			
 
				+                result = self.result[i:]
			
 
				+                break
			
 
				+
			
 
				+        for i in range(len(result)):
			
 
				+            res = result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            code = ''.join(re.findall(u'[\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]{15,18}', txt))
			
 
				+            if len(code):
			
 
				+                self.res['social_code'] = RecItem(code, conf)
			
 
				+                return
			
 
				+
			
 
				+    def company_name(self):
			
 
				+        """
			
 
				+        公司名称
			
 
				+        """
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            if '称尔' in txt: txt = txt.replace('称尔', '称')
			
 
				+            if '名' in txt[:4] and '称' in txt[:4]:
			
 
				+                txt = '名称' + txt.split('称')[-1]
			
 
				+
			
 
				+            if '名称' in txt:
			
 
				+                company_name = txt.split('名称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
			
 
				+                self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
			
 
				+                return
			
 
				+            if '称' in txt and txt[0] == '称' and len(txt) > 5:
			
 
				+                company_name = txt.split('称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
			
 
				+                self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
			
 
				+                return
			
 
				+
			
 
				+    def legal_person(self):
			
 
				+        """
			
 
				+        法人姓名
			
 
				+        """
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0].replace('市场监督', '')
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            if '法定代表人' in txt or '代表人' in txt:
			
 
				+                legal_person = txt.split('代表人')[-1].split('营业')[0]
			
 
				+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
			
 
				+                return
			
 
				+            if '经营者' in txt:
			
 
				+                legal_person = txt.split('经营者')[-1].split('经营')[0]
			
 
				+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
			
 
				+                return
			
 
				+            if '负责人' in txt:
			
 
				+                legal_person = txt.split('负责人')[-1].split('责人')[0]
			
 
				+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
			
 
				+                return
			
 
				+
			
 
				+    def registered_capital(self):
			
 
				+        """
			
 
				+        注册资本
			
 
				+        """
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            txt = fix_text(txt)
			
 
				+
			
 
				+            if '注册资本' in txt:
			
 
				+                if '人民币' in txt[:4]:
			
 
				+                    registered_capital = txt.split('人民币')[-1].split('万元')[0]
			
 
				+                    txt = f'人民币{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
			
 
				+                elif '美元' in txt[:4]:
			
 
				+                    registered_capital = txt.split('美元')[-1].split('万元')[0]
			
 
				+                    txt = f'美元{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
			
 
				+                elif '人民币' in txt[-4:]:
			
 
				+                    registered_capital = txt.split('资本')[-1].split('人民币')[0].split('万元')[0]
			
 
				+                    txt = f'{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}人民币'
			
 
				+                else:
			
 
				+                    registered_capital = txt.split('资本')[-1].split('币')[-1].split('万')[0]
			
 
				+                    txt = self.cn_to_an(fix_text(clear_punctuation(registered_capital)))
			
 
				+
			
 
				+                self.res['registered_capital'] = RecItem(txt, conf)
			
 
				+                return
			
 
				+
			
 
				+    def type(self):  # sourcery skip: hoist-similar-statement-from-if
			
 
				+        """
			
 
				+        类型
			
 
				+        """
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            txt = fix_text(clear_punctuation(txt))
			
 
				+
			
 
				+            if '类型' in txt:
			
 
				+                txt = txt.split('类型')[-1].split('成立')[0].split('注册')[0]
			
 
				+                if '公司' in txt:
			
 
				+                    t_s, s_e = txt.split('公司')[0], txt.split('公司')[-1].replace('(', '').replace(')', '').replace('（',
			
 
				+                                                                                                                 '').replace(
			
 
				+                        '）', '')
			
 
				+
			
 
				+                    # 分公司
			
 
				+                    if '分公司' in txt:
			
 
				+                        t_s = f'{t_s}公司分'
			
 
				+
			
 
				+                    txt = f'{t_s}公司（{s_e}）' if s_e else f'{t_s}公司'
			
 
				+
			
 
				+                if txt[0] == '型': txt = txt[1:]
			
 
				+                self.res['type'] = RecItem(txt, conf)
			
 
				+                return
			
 
				+
			
 
				+    def start_date(self):
			
 
				+        """
			
 
				+        成立日期 ⚠️ 注册日期
			
 
				+        """
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            txt = fix_text(txt)
			
 
				+
			
 
				+            if '日期' in txt:
			
 
				+                txt = txt.split('日期')[-1]
			
 
				+                date = self.to_date(txt)
			
 
				+                self.res['start_date'] = RecItem(date, conf)
			
 
				+
			
 
				+    def expire_date(self):  # sourcery skip: hoist-similar-statement-from-if
			
 
				+        """
			
 
				+        有效期
			
 
				+        """
			
 
				+        for i in range(len(self.result)):
			
 
				+            res = self.result[i]
			
 
				+            txt = res[-1][0]
			
 
				+            conf = res[-1][1]
			
 
				+
			
 
				+            if '期限' in txt:
			
 
				+                if '至' in txt:
			
 
				+                    txt = ''.join(txt.split('期限')[1:]).replace('*', '')
			
 
				+                    date_from = txt.split('至')[0]
			
 
				+                    date_to = txt.split('至')[-1]
			
 
				+                    date = f'{self.to_date(date_from)} 至 {self.to_date(date_to)}'
			
 
				+                    self.res['expire_date'] = RecItem(date, conf)
			
 
				+                    return
			
 
				+                if '长期' in txt:
			
 
				+                    self.res['expire_date'] = RecItem('长期', conf)
			
 
				+                    return
			
 
				+                else:
			
 
				+                    self.res['expire_date'] = RecItem('', conf)
			
 
				+                    return
			
 
				+
			
 
    def business_scope(self):
        """Extract the business scope.

        Delegates to ``BussinessParse1(self.ocr).bs_detection`` with the
        license image and the raw OCR results. A non-empty text is stored with
        its confidence under ``self.res['business_scope']``. Start and end
        markers are printed to stdout around the call.
        """
        print('-------------经营范围处理开始--------------')

        detector = BussinessParse1(self.ocr)
        scope_txt, scope_conf = detector.bs_detection(self.image, self.raw_results)
        if scope_txt:
            self.res['business_scope'] = RecItem(scope_txt, scope_conf)

        print('-------------经营范围处理结束--------------')
			
 
				+
			
 
    def address(self):  # sourcery skip: use-named-expression
        """Extract the registered address.

        Two candidates are produced:

        * a crop-based one from ``BussinessParse1(self.ocr).ad_detection``;
        * a keyword-based one from ``parser_xy`` using the label '住所', or
          '场所' when the first lookup yields nothing.

        When neither keyword lookup yields a result the method returns without
        storing anything, so the crop-based candidate is not used in that
        case. Otherwise the longer candidate (crop-based on ties) is stored
        under ``self.res['address']``.
        """
        # Crop-based candidate.
        cropped_txt, cropped_conf = BussinessParse1(self.ocr).ad_detection(self.image, self.raw_results)

        # Keyword-based candidate: primary label first, then the alternative.
        keyword_hit: OcrResult = (
            parser_xy(self.result, self.raw_results, '住所')
            or parser_xy(self.result, self.raw_results, '场所')
        )
        if not keyword_hit:
            return

        keyword_txt = keyword_hit.txt
        head = keyword_txt[:3]
        # Remove label remnants ('所' / '厂') left at the start of the value.
        if '所' in head or '厂' in head:
            keyword_txt = head.split('所')[-1].replace('厂', '') + keyword_txt[3:]

        if len(cropped_txt) >= len(keyword_txt):
            chosen = RecItem(cropped_txt, cropped_conf)
        else:
            chosen = RecItem(keyword_txt, keyword_hit.conf)
        self.res['address'] = chosen
			
 
				+
			
 
    def stamp(self):
        """Seal detection.

        Stores whatever ``send_request(self.image)`` returns under
        ``self.res['stamp']`` with a fixed confidence of 1.0.
        """
        detection = send_request(self.image)
        self.res['stamp'] = RecItem(detection, 1.)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def cn_to_an(num):
			
 
				+        try:
			
 
				+            num = int(num)
			
 
				+        except ValueError:
			
 
				+            num = str(cn2an.cn2an(f'{num}万'))[:-4]
			
 
				+        except Exception:
			
 
				+            raise Exception('注册资本转化出错')
			
 
				+        finally:
			
 
				+            return f'{num}万元'
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def to_date(txt):
			
 
				+        if '长期' in txt: return '长期'
			
 
				+        if '永久' in txt: return '永久'
			
 
				+        if '不约定' in txt: return '不约定期限'
			
 
				+        date_in = re.findall(r"\d+", txt)
			
 
				+        if len(date_in) == 3:
			
 
				+            return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2]}日'
			
 
				+        else:
			
 
				+            return ''
			
 
				+
			
 
				+    # ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
			
 
				+    #  "business_scope", 'expire_date', 'address', 'stamp']
			
 
    def parse(self):
        """Run every field extractor in order and return the collected results.

        Returns a dict mapping each key in ``self.keys`` to
        ``self.res[key].to_dict()``.
        """
        extractors = (
            self.social_code,
            self.company_name,
            self.legal_person,
            self.registered_capital,
            self.type,
            self.start_date,
            self.expire_date,
            self.business_scope,
            self.address,
            self.stamp,
        )
        for extract in extractors:
            extract()

        output = {}
        for key in self.keys:
            output[key] = self.res[key].to_dict()
        return output
			
--- a/core/square_parser.py
+++ b/core/square_parser.py
@@ -122,14 +122,12 @@ def get_key_other_or(res_raw_list, key_heard: OcrResult, key_title):
 
				 
			
 
				 def parser_xy(res_line, res_raw, key):
			
 
				     # 在 res_line 中找到 key 对应的坐标
			
 
				-    print('res_line', res_line)
			
 
				     key_row = []
			
 
				     for row in res_line:
			
 
				         print(row[-1])
			
 
				         if key in row[-1][0]:
			
 
				             key_row = row
			
 
				             break
			
 
				-    print(key_row)
			
 
				     if not bool(key_row): return
			
 
				     key_heard, key_title = get_key_fist_line(key_row, key)
			
 
				     return get_key_other_or(res_raw, key_heard, key_title)