Kaynağa Gözat

拼接优化

xujiayue 2 yıl önce
ebeveyn
işleme
e16414b390
5 değiştirilmiş dosya ile 691 ekleme ve 24 silme
  1. 372 0
      core/business_parse.py
  2. 3 7
      core/direction.py
  3. 23 9
      core/ocr.py
  4. 293 6
      core/parser.py
  5. 0 2
      core/square_parser.py

+ 372 - 0
core/business_parse.py

@@ -0,0 +1,372 @@
+import re
+from dataclasses import dataclass
+import cv2
+from paddleocr import PaddleOCR
+
+
+@dataclass
+class BussinessParse0(object):
+    """
+    Extract the business scope and the address from a business-license image.
+
+    The region below the header fields is cropped into a left and a right
+    column, each column is OCR'ed again on its own, and the text lines that
+    follow the "经营范围" / "住所" labels are stitched together.
+
+    ``ocr`` must expose ``ocr(image)`` returning a list of
+    ``[box, (text, confidence)]`` entries.  The code reads ``box[0]`` as the
+    top-left corner, ``box[1]`` as the top-right, ``box[2]`` / ``box[3]`` as
+    the bottom corners, each as ``[x, y]`` -- NOTE(review): confirm against
+    the installed PaddleOCR version.
+    """
+    ocr: PaddleOCR
+
+    def detection(self, image, raw_results):
+        """
+        Return ``(scope_text, scope_conf, address_text, address_conf)``.
+
+        :param image: array with a 3-element ``shape`` (height, width, channels)
+            that supports 2-D slicing.
+        :param raw_results: iterable of items exposing ``txt``, ``lt`` (left/top
+            ``[x, y]``) and ``rb`` (right/bottom ``[x, y]``).
+        :return: texts are ``''`` and confidences ``0.0`` when nothing is found,
+            including when the anchor labels needed for cropping are missing.
+        """
+        h, w, _ = image.shape
+
+        # Anchor labels: header fields of the left column and of the right
+        # column.  Each entry keeps the label's left x and bottom y.
+        left_list = []
+        right_list = []
+        for item in raw_results:
+            txt = item.txt
+            if re.match(r'法定代表|经营者|名|称|类|型', txt):
+                left_list.append([item.lt[0], item.rb[1]])
+            elif re.match(r'注册|营业期限', txt) or re.search(r'日期', txt):
+                right_list.append([item.lt[0], item.rb[1]])
+        if not left_list or not right_list:
+            # Without anchors in both columns the crop cannot be computed.
+            return '', 0.0, '', 0.0
+
+        t1 = max(p[1] for p in left_list)   # lowest label bottom, left column
+        t2 = max(p[1] for p in right_list)  # lowest label bottom, right column
+        l1 = min(p[0] for p in left_list)   # left edge of the left column
+        r1 = min(p[0] for p in right_list)  # left edge of the right column
+
+        left_result = self._read(image[int(t1): h, int(l1): int(r1)])
+        right_result = self._read(image[int(t2): h, int(r1): w])
+
+        left_txt, left_conf_list = self._scan_scope(left_result)
+        right_txt, right_conf_list = self._scan_address(right_result)
+
+        left_txt = left_txt.replace('经营范围', '')
+        right_txt = right_txt.replace('广北京市朝阳区', '北京市朝阳区')
+        right_txt = re.sub(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', '', right_txt)
+        # Drop only the leading "住所" / "住" / "所" label; 住 and 所 occurring
+        # inside the address itself must be kept.
+        right_txt = re.sub(r'^(?:住\s*所?|所)\s*', '', right_txt)
+
+        left_conf = sum(left_conf_list) / len(left_conf_list) if left_conf_list else 0.0
+        right_conf = sum(right_conf_list) / len(right_conf_list) if right_conf_list else 0.0
+        return left_txt, left_conf, right_txt, right_conf
+
+    def _read(self, crop):
+        """Run OCR on ``crop``; an empty crop or an empty/None result gives ``[]``."""
+        if crop.size == 0:
+            return []
+        return self.ocr.ocr(crop) or []
+
+    @staticmethod
+    def _scan_scope(results):
+        """Stitch the lines following a "经营范围" label; returns (text, confidences)."""
+        txt = ''
+        confs = []
+        last = len(results) - 1
+        for idx, res in enumerate(results):
+            # A label in the very last box has no content after it.
+            if idx == last or not re.match(r'经营范围', res[1][0]):
+                continue
+            t = res[0][0][1]
+            d = res[0][2][1]
+            line_h = abs(d - t)
+            # A long line right above the label is treated as scope content
+            # (only when such a line exists, i.e. not for the first box).
+            if idx > 0:
+                prev = results[idx - 1]
+                if len(prev[1][0]) > 15 and abs(prev[0][2][1] - t) < line_h * 1.8:
+                    txt += prev[1][0]
+
+            txt += res[1][0]
+            confs.append(res[1][1])
+
+            nxt = results[idx + 1][0]
+            column_x = nxt[0][0]
+            bottom = (nxt[2][1] + nxt[3][1]) // 2
+            for x in results[idx + 1:]:
+                # Same column (left x within 130) and vertically adjacent.
+                if abs(x[0][0][0] - column_x) < 130 and abs(bottom - x[0][0][1]) < line_h:
+                    bottom = (x[0][2][1] + x[0][3][1]) // 2
+                    piece = x[1][0]
+                    # Skip a first character that repeats the previous one.
+                    if piece[:1] == txt[-1]:
+                        piece = piece[1:]
+                    txt += piece
+                    confs.append(x[1][1])
+        return txt, confs
+
+    @staticmethod
+    def _scan_address(results):
+        """Stitch the lines following an address label; returns (text, confidences)."""
+        txt = ''
+        confs = []
+        last = len(results) - 1
+        for idx, res in enumerate(results):
+            if idx == last:
+                continue
+            box = res[0]
+            label = res[1][0]
+            nxt = results[idx + 1][0]
+            gap = abs(box[1][0] - nxt[0][0])       # label right edge -> next box left edge
+            width = abs(box[1][0] - box[0][0])     # width of the label box
+
+            # "standalone" means the box holds only the label, so the content
+            # starts in the next box; otherwise label and content share a box.
+            if re.match(r'所', label):
+                standalone = len(label) == 1
+                max_dx = 250
+            elif re.match(r'住', label):
+                standalone = len(label) <= 2 or len(label) == 4
+                max_dx = gap if standalone else width // 5
+            elif re.match(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所', label):
+                standalone = len(label) == 4
+                max_dx = gap * 2 if standalone else width // 2
+            else:
+                continue
+
+            # A later label match restarts the text; confidences keep accumulating.
+            txt = ''
+            line_h = abs(box[2][1] - box[0][1])
+            if standalone:
+                start_box = nxt
+            else:
+                txt += label
+                confs.append(res[1][1])
+                start_box = box
+            column_x = start_box[0][0]
+            bottom = (start_box[2][1] + start_box[3][1]) // 2
+
+            for x in results[idx + 1:]:
+                if (abs(x[0][0][0] - column_x) < max_dx
+                        and abs(bottom - x[0][0][1]) < line_h * 1.2
+                        and '登记机关' not in x[1][0]):
+                    bottom = (x[0][2][1] + x[0][3][1]) // 2
+                    txt += x[1][0]
+                    confs.append(x[1][1])
+        return txt, confs
+
+
+@dataclass
+class BussinessParse1(object):
+    """
+    Extract the business scope and the address from a business-license image
+    by cropping the relevant band of the image and running OCR on it again.
+
+    ``ocr`` must expose ``ocr(image)`` returning a list of
+    ``[box, (text, confidence)]`` entries.  The code reads ``box[0]`` as the
+    top-left corner, ``box[1]`` as the top-right, ``box[2]`` / ``box[3]`` as
+    the bottom corners, each as ``[x, y]`` -- NOTE(review): confirm against
+    the installed PaddleOCR version.
+    """
+    ocr: PaddleOCR
+
+    def bs_detection(self, image, raw_results):
+        """
+        Return ``(scope_text, scope_conf)`` for the business scope.
+
+        :param image: array with a 3-element ``shape`` that supports 2-D slicing.
+        :param raw_results: iterable of items exposing ``txt``, ``lt`` and ``rb``.
+        :return: ``('', 0.0)`` when no anchor label or no scope text is found.
+        """
+        h, w, _ = image.shape
+        down_list = []   # [left x, bottom y] of labels above the scope block
+        down_list2 = []  # top y of "登记..." labels, which end the scope block
+        for item in raw_results:
+            txt = item.txt
+            if re.match(r'注册资本|营业期限', txt) or re.search(r'日期', txt):
+                down_list.append([item.lt[0], item.rb[1]])
+            elif re.match(r'登记', txt):
+                down_list2.append(item.lt[1])
+        if not down_list:
+            # No anchor label: the crop cannot be computed.
+            return '', 0.0
+
+        t2 = max(p[1] for p in down_list)
+        l2 = min(p[0] for p in down_list)
+        d2 = int(down_list2[0]) if down_list2 else h
+        down_result = self._read(image[int(t2): d2, int(l2): w])
+
+        down_conf_list = []
+        down_txt = ''
+        last = len(down_result) - 1
+        for idx, res in enumerate(down_result):
+            # A label in the very last box has no content after it.
+            if idx == last or not re.match(r'经营范围', res[1][0]):
+                continue
+            t = res[0][0][1]
+            d = res[0][2][1]
+            line_h = abs(d - t)
+            # A long line right above the label is treated as scope content
+            # (only when such a line exists, i.e. not for the first box).
+            if idx > 0:
+                prev = down_result[idx - 1]
+                if len(prev[1][0]) > 15 and abs(prev[0][2][1] - t) <= line_h * 1.8:
+                    down_txt += prev[1][0]
+            down_txt += res[1][0]
+            down_conf_list.append(res[1][1])
+
+            nxt = down_result[idx + 1][0]
+            column_x = nxt[0][0]
+            bottom = (nxt[2][1] + nxt[3][1]) // 2
+            for x in down_result[idx + 1:]:
+                # Same column (left x within 130) and vertically adjacent.
+                if abs(x[0][0][0] - column_x) < 130 and abs(bottom - x[0][0][1]) <= line_h * 1.8:
+                    bottom = (x[0][2][1] + x[0][3][1]) // 2
+                    piece = x[1][0]
+                    # Skip a first character that repeats the previous one.
+                    if piece[:1] == down_txt[-1]:
+                        piece = piece[1:]
+                    down_txt += piece
+                    down_conf_list.append(x[1][1])
+
+        down_txt = down_txt.replace('经营范围', '')
+        down_conf = sum(down_conf_list) / len(down_conf_list) if down_conf_list else 0.0
+        return down_txt, down_conf
+
+    def ad_detection(self, image, raw_results):
+        """
+        Return ``(address_text, address_conf)`` for the address.
+
+        :param image: array with a 3-element ``shape`` that supports 2-D slicing.
+        :param raw_results: iterable of items exposing ``txt``, ``lt`` and ``rb``.
+        :return: ``('', 0.0)`` when the anchor labels are missing or nothing
+            is found.
+        """
+        h, w, _ = image.shape
+        top_list1 = []  # [left x, bottom y] of labels above the address band
+        top_list2 = []  # top y of labels below the address band
+        type_key = False
+        for item in raw_results:
+            txt = item.txt
+            if re.match(r'法定代表|负责人', txt):
+                top_list2.append(item.lt[1])
+            elif re.match(r'名|称', txt):
+                top_list1.append([item.lt[0], item.rb[1]])
+            elif re.match(r'类|型', txt):
+                type_key = True
+                top_list1.append([item.lt[0], item.rb[1]])
+            elif re.match(r'注册资本|营业期限', txt) or re.search(r'日期', txt):
+                top_list2.append(item.lt[1])
+        if not top_list1 or not top_list2:
+            # The band is bounded by labels on both sides; both are required.
+            return '', 0.0
+
+        t1 = max(p[1] for p in top_list1) * 0.99
+        l1 = min(p[0] for p in top_list1)
+        d1 = min(top_list2)
+        top_result = self._read(image[int(t1): int(d1), int(l1): w])
+
+        top_conf_list = []
+        top_txt = ''
+        last_key = ''
+        last = len(top_result) - 1
+        for idx, res in enumerate(top_result):
+            box = res[0]
+            label = res[1][0]
+            has_next = idx != last
+            line_h = abs(box[2][1] - box[0][1])
+            width = abs(box[1][0] - box[0][0])
+
+            # "standalone" means the box holds only the label, so the content
+            # starts in the next box; otherwise label and content share a box.
+            if re.match(r'所', label):
+                # Starts from the line remembered by an earlier "住" box, if any.
+                top_txt = last_key
+                standalone = len(label) == 1 and has_next
+                max_dx = 250
+            elif re.match(r'住', label):
+                top_txt = ''
+                above = self._line_above(top_result, idx, line_h)
+                # NOTE(review): the line above is only remembered for a later
+                # "所" box here, not prepended as in the "营业场所" branch.
+                if above:
+                    last_key = above
+                standalone = (len(label) <= 2 or len(label) == 4) and has_next
+                if standalone:
+                    max_dx = abs(box[1][0] - top_result[idx + 1][0][0][0])
+                else:
+                    max_dx = width // 5
+            elif re.match(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', label):
+                top_txt = self._line_above(top_result, idx, line_h)
+                standalone = len(label) == 4 and has_next
+                if standalone:
+                    max_dx = abs(box[1][0] - top_result[idx + 1][0][0][0]) * 2
+                else:
+                    max_dx = width // 2
+            else:
+                continue
+
+            if standalone:
+                start_box = top_result[idx + 1][0]
+            else:
+                top_txt += label
+                top_conf_list.append(res[1][1])
+                start_box = box
+            column_x = start_box[0][0]
+            bottom = (start_box[2][1] + start_box[3][1]) // 2
+
+            # The slice is empty when the label sits in the last box.
+            for x in top_result[idx + 1:]:
+                if abs(x[0][0][0] - column_x) < max_dx and abs(bottom - x[0][0][1]) <= line_h * 1.2:
+                    bottom = (x[0][2][1] + x[0][3][1]) // 2
+                    top_txt += x[1][0]
+                    # One confidence per appended line.
+                    top_conf_list.append(x[1][1])
+
+        # No address label matched but a "类型" label was seen: fall back to
+        # all text found in the band.
+        if len(top_txt) == 0 and type_key:
+            top_txt = ''.join(res[1][0] for res in top_result)
+        top_txt = re.sub(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所', '', top_txt)
+        # Drop only the leading "住所" / "住" / "所" label; 住 and 所 occurring
+        # inside the address itself must be kept.
+        top_txt = re.sub(r'^(?:住\s*所?|所)\s*', '', top_txt)
+        top_conf = sum(top_conf_list) / len(top_conf_list) if top_conf_list else 0.0
+        return top_txt, top_conf
+
+    def _read(self, crop):
+        """Run OCR on ``crop``; an empty crop or an empty/None result gives ``[]``."""
+        if crop.size == 0:
+            return []
+        return self.ocr.ocr(crop) or []
+
+    @staticmethod
+    def _line_above(results, idx, line_h):
+        """
+        Return the text of the box before ``idx`` when it looks like address
+        content (long, vertically close, not a company-name or type line),
+        otherwise ``''``.
+        """
+        if idx == 0:
+            return ''
+        prev = results[idx - 1]
+        prev_txt = prev[1][0]
+        close = abs(prev[0][2][1] - results[idx][0][0][1]) <= line_h * 1.8
+        if len(prev_txt) > 15 and close and '有限' not in prev_txt and '型' not in prev_txt:
+            return prev_txt
+        return ''

+ 3 - 7
core/direction.py

@@ -163,11 +163,11 @@ class AngleDetector(object):
     # -> angle       result(ocr生)
     @sxtimeit
     def detect_angle(self, img):
-        image_type, result = self.detect_img(img)
+        result = self.ocr.ocr(img, cls=True)
         ocr_anchor = BusinessLicenseAnchor('营业执照', [Direction.TOP])
         try:
             angle = detect_angle(result, ocr_anchor)
-            return angle, result, image_type
+            return angle, result
 
         except Exception as e:
             print(e)
@@ -176,8 +176,4 @@ class AngleDetector(object):
             result = self.ocr.ocr(img, cls=True)
             angle = detect_angle(result, ocr_anchor)
             # 旋转90度之后要重新计算角度
-            return (angle - 1 + 4) % 4, result, image_type
-
-    def detect_img(self, img):
-        result = self.ocr.ocr(img, cls=True)
-        return 1, result
+            return (angle - 1 + 4) % 4, result

+ 23 - 9
core/ocr.py

@@ -27,17 +27,21 @@ class BusinessLicenseOcr:
     def predict(self, image: np.ndarray) -> ():
 
         # 旋转后img angle result(生ocr)
-        image, angle, result, image_type = self._pre_process(image)
+        image, angle, result = self._pre_process(image)
         print(f'---------- detect angle: {angle} 角度 --------')
         if angle != 0:
             _, _, result = self._ocr(image)
 
+        # 判断类型
+        image_type = self._type(result)
+
         # 去除 市场监督 水印
         for i_k, i_v in enumerate(result):
             if '市场监督' in i_v[1][0] and len(i_v[1][0]) < 7:
                 del result[i_k]
                 break
 
+
         return self._post_process(result, angle, image_type, image)
 
     # 预处理(旋转图片)
@@ -45,7 +49,7 @@ class BusinessLicenseOcr:
     # -> 正向的img(旋转后) 源img角度 result(ocr生)
     def _pre_process(self, image) -> (np.ndarray, int, Any):
         # pic角度 result(ocr生)
-        angle, result, image_type = self.angle_detector.detect_angle(image)
+        angle, result= self.angle_detector.detect_angle(image)
 
         if angle == 1:
             image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
@@ -53,7 +57,18 @@ class BusinessLicenseOcr:
             image = cv2.rotate(image, cv2.ROTATE_180)
         if angle == 3:
             image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
-        return image, angle, result, image_type
+        return image, angle, result
+
+    def _type(self, result):
+        """
+        Classify the license layout from the OCR result.
+
+        Returns 0 when both a "营业执照" box and a credit-code label box are
+        present and the code label's first-corner x is smaller than the
+        title's; returns 1 in every other case.  When several boxes match,
+        the last match of each kind is used.
+        """
+        title_box = False
+        code_box = False
+        code_markers = ("统一社", "会信用", "用代码")
+        for entry in result:
+            text = entry[1][0]
+            if "营业执照" in text:
+                title_box = entry
+            if any(marker in text for marker in code_markers):
+                code_box = entry
+        if code_box and title_box and code_box[0][0][0] < title_box[0][0][0]:
+            return 0
+        return 1
 
     # 获取模型检测结果
     def _ocr(self, image):
@@ -83,8 +98,10 @@ class BusinessLicenseOcr:
         print('-------------')
         conf = line_parser.confidence
 
-        parser = BusinessLicenseParser(line_result, image, result)
-        # if int(image_type) == 0:
+        if image_type == 0:
+            parser = BusinessLicenseParser0(line_result, image, result)
+        if image_type == 1:
+            parser = BusinessLicenseParser1(line_result, image, result)
 
         ocr_res = parser.parse()
 
@@ -96,7 +113,4 @@ class BusinessLicenseOcr:
         print(res)
         return res
 
-    # def _get_type(self, image) -> int:
-
-
-
+    # def _get_type(self, image) -> int:

+ 293 - 6
core/parser.py

@@ -7,8 +7,12 @@ import cpca
 import cv2
 import numpy as np
 import string
+
+from paddleocr import PaddleOCR
 from zhon.hanzi import punctuation
 import cn2an
+
+from core.business_parse import BussinessParse0, BussinessParse1
 from core.line_parser import OcrResult
 from core.square_parser import parser_xy
 from stamp.d_stamp import send_request
@@ -76,7 +80,7 @@ class Parser(object):
         return self.res
 
 
-class BusinessLicenseParser(Parser):
+class BusinessLicenseParser0(Parser):
 
     def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
         Parser.__init__(self, ocr_results, raw_results)
@@ -249,11 +253,29 @@ class BusinessLicenseParser(Parser):
         """
         经营范围
         """
-        sb_or: OcrResult = parser_xy(self.result, self.raw_results, '经营范围')
-        if bool(sb_or):
-            self.res['business_scope'] = RecItem(sb_or.txt, sb_or.conf)
+        ocr = PaddleOCR(use_gpu=True)
+        bs_txt, bs_conf, ad_txt, ad_conf = BussinessParse0(ocr).detection(self.image, self.raw_results)
+        if bool(bs_txt):
+            self.res['business_scope'] = RecItem(bs_txt, bs_conf)
+
+        add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
+        if add_or_0:
+            add_or = add_or_0
         else:
-            self.res['business_scope'] = RecItem('经营范围', random.random())
+            add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
+            if add_or_1:
+                add_or = add_or_1
+            else:
+                return
+        txt = add_or.txt
+        if '所' in txt[:3] or '厂' in txt[:3]:
+            txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
+
+        if len(ad_txt) >= len(txt):
+            self.res['address'] = RecItem(ad_txt, ad_conf)
+        else:
+            self.res['address'] = RecItem(txt, add_or.conf)
+
         return
 
     def address(self):  # sourcery skip: use-named-expression
@@ -317,6 +339,271 @@ class BusinessLicenseParser(Parser):
         self.start_date()
         self.expire_date()
         self.business_scope()
-        self.address()
+        # self.address()
         self.stamp()
         return {key: self.res[key].to_dict() for key in self.keys}
+
+
+class BusinessLicenseParser1(Parser):
+    """
+    Field parser for business licenses classified as layout type 1.
+
+    Each field method scans ``self.result`` and stores a ``RecItem`` under the
+    field's key in ``self.res``.  Every row is read as ``row[-1][0]`` (text)
+    and ``row[-1][1]`` (confidence).  ``self.result``, ``self.raw_results``,
+    ``self.res`` and ``self.keys`` are presumably set up by ``Parser`` --
+    NOTE(review): confirm against the base class.
+    """
+
+    def __init__(self, ocr_results: List[List[OcrResult]], image, raw_results: List):
+        Parser.__init__(self, ocr_results, raw_results)
+        self.image = image
+        # One OCR engine per parser, shared by the scope and address passes.
+        self.ocr = PaddleOCR(use_gpu=True)
+
+    def social_code(self):
+        """
+        Unified social credit code.
+
+        Starting at the first row that contains part of the code label, take
+        the first row holding a 15-18 character run matched by the pattern.
+        """
+        start = None
+        for i, res in enumerate(self.result):
+            txt = res[-1][0]
+            if "统一社" in txt or "会信用" in txt or "用代码" in txt:
+                start = i
+                break
+        if start is None:
+            return
+
+        for res in self.result[start:]:
+            txt = res[-1][0]
+            conf = res[-1][1]
+
+            # NOTE(review): the '+' inside the character class also matches a
+            # literal plus sign; kept as-is.
+            code = ''.join(re.findall(u'[\u0030-\u0039+\u0041-\u005a+\u0061-\u007a]{15,18}', txt))
+            if code:
+                self.res['social_code'] = RecItem(code, conf)
+                return
+
+    def company_name(self):
+        """
+        Company name: the text after the "名称" label, cut before any
+        following "注册" / "组织" / "组成" label.
+        """
+        for res in self.result:
+            txt = res[-1][0]
+            conf = res[-1][1]
+
+            if '称尔' in txt:
+                txt = txt.replace('称尔', '称')
+            # Normalise a label whose characters are separated by noise.
+            if '名' in txt[:4] and '称' in txt[:4]:
+                txt = '名称' + txt.split('称')[-1]
+
+            if '名称' in txt:
+                company_name = txt.split('名称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
+                self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
+                return
+            # Label truncated to its second character.
+            if txt.startswith('称') and len(txt) > 5:
+                company_name = txt.split('称')[-1].split('注册')[0].split('组织')[0].split('组成')[0]
+                self.res['company_name'] = RecItem(fix_text(clear_punctuation(company_name)), conf)
+                return
+
+    def legal_person(self):
+        """
+        Legal representative / operator / person in charge.
+        """
+        for res in self.result:
+            txt = res[-1][0].replace('市场监督', '')
+            conf = res[-1][1]
+
+            # '代表人' also covers '法定代表人'.
+            if '代表人' in txt:
+                legal_person = txt.split('代表人')[-1].split('营业')[0]
+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
+                return
+            if '经营者' in txt:
+                legal_person = txt.split('经营者')[-1].split('经营')[0]
+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
+                return
+            if '负责人' in txt:
+                legal_person = txt.split('负责人')[-1].split('责人')[0]
+                self.res['legal_person'] = RecItem(fix_text(clear_punctuation(legal_person)), conf)
+                return
+
+    def registered_capital(self):
+        """
+        Registered capital, normalised by ``cn_to_an`` to "<amount>万元" with
+        the currency kept as prefix or suffix when it is recognised.
+        """
+        for res in self.result:
+            txt = fix_text(res[-1][0])
+            conf = res[-1][1]
+
+            if '注册资本' in txt:
+                if '人民币' in txt[:4]:
+                    registered_capital = txt.split('人民币')[-1].split('万元')[0]
+                    txt = f'人民币{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
+                elif '美元' in txt[:4]:
+                    registered_capital = txt.split('美元')[-1].split('万元')[0]
+                    txt = f'美元{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}'
+                elif '人民币' in txt[-4:]:
+                    registered_capital = txt.split('资本')[-1].split('人民币')[0].split('万元')[0]
+                    txt = f'{self.cn_to_an(fix_text(clear_punctuation(registered_capital)))}人民币'
+                else:
+                    registered_capital = txt.split('资本')[-1].split('币')[-1].split('万')[0]
+                    txt = self.cn_to_an(fix_text(clear_punctuation(registered_capital)))
+
+                self.res['registered_capital'] = RecItem(txt, conf)
+                return
+
+    def type(self):  # sourcery skip: hoist-similar-statement-from-if
+        """
+        Company type: the text after the "类型" label, with a trailing
+        qualifier after "公司" rewritten as "...公司(qualifier)".
+        """
+        for res in self.result:
+            txt = fix_text(clear_punctuation(res[-1][0]))
+            conf = res[-1][1]
+
+            if '类型' in txt:
+                txt = txt.split('类型')[-1].split('成立')[0].split('注册')[0]
+                if '公司' in txt:
+                    parts = txt.split('公司')
+                    t_s = parts[0]
+                    # NOTE(review): the second pair of replaces repeats the
+                    # first as written; possibly meant for full-width brackets.
+                    s_e = parts[-1].replace('(', '').replace(')', '').replace('(', '').replace(')', '')
+
+                    # Branch company ("分公司")
+                    if '分公司' in txt:
+                        t_s = f'{t_s}公司分'
+
+                    txt = f'{t_s}公司({s_e})' if s_e else f'{t_s}公司'
+
+                # startswith() instead of txt[0]: txt is empty when nothing
+                # follows the label.
+                if txt.startswith('型'):
+                    txt = txt[1:]
+                self.res['type'] = RecItem(txt, conf)
+                return
+
+    def start_date(self):
+        """
+        Establishment / registration date.
+
+        Every row containing "日期" overwrites the field, so the last such
+        row wins.
+        """
+        for res in self.result:
+            txt = fix_text(res[-1][0])
+            conf = res[-1][1]
+
+            if '日期' in txt:
+                txt = txt.split('日期')[-1]
+                date = self.to_date(txt)
+                self.res['start_date'] = RecItem(date, conf)
+
+    def expire_date(self):  # sourcery skip: hoist-similar-statement-from-if
+        """
+        Business term: "<from> 至 <to>", "长期", or an empty string when the
+        "期限" row has neither.
+        """
+        for res in self.result:
+            txt = res[-1][0]
+            conf = res[-1][1]
+
+            if '期限' in txt:
+                if '至' in txt:
+                    txt = ''.join(txt.split('期限')[1:]).replace('*', '')
+                    date_from = txt.split('至')[0]
+                    date_to = txt.split('至')[-1]
+                    date = f'{self.to_date(date_from)} 至 {self.to_date(date_to)}'
+                    self.res['expire_date'] = RecItem(date, conf)
+                    return
+                if '长期' in txt:
+                    self.res['expire_date'] = RecItem('长期', conf)
+                    return
+                else:
+                    self.res['expire_date'] = RecItem('', conf)
+                    return
+
+    def business_scope(self):
+        """
+        Business scope, read by re-running OCR on the cropped scope region.
+        The field is left untouched when no scope text is found.
+        """
+        print('-------------经营范围处理开始--------------')
+
+        bs_txt, bs_conf = BussinessParse1(self.ocr).bs_detection(self.image, self.raw_results)
+
+        if bool(bs_txt):
+            self.res['business_scope'] = RecItem(bs_txt, bs_conf)
+
+        print('-------------经营范围处理结束--------------')
+
+        return
+
+    def address(self):  # sourcery skip: use-named-expression
+        """
+        Address: the longer of the cropped-region result and the keyword
+        based result.  Nothing is stored when the keyword lookup finds
+        neither "住所" nor "场所".
+        """
+        # Cropping approach
+        ad_txt, ad_conf = BussinessParse1(self.ocr).ad_detection(self.image, self.raw_results)
+        # Keyword approach
+        add_or_0: OcrResult = parser_xy(self.result, self.raw_results, '住所')
+        if add_or_0:
+            add_or = add_or_0
+        else:
+            add_or_1: OcrResult = parser_xy(self.result, self.raw_results, '场所')
+            if add_or_1:
+                add_or = add_or_1
+            else:
+                return
+
+        txt = add_or.txt
+
+        # Remove a label remnant ("所" / "厂") from the first three characters.
+        if '所' in txt[:3] or '厂' in txt[:3]:
+            txt = txt[:3].split('所')[-1].replace('厂', '') + txt[3:]
+
+        if len(ad_txt) >= len(txt):
+            self.res['address'] = RecItem(ad_txt, ad_conf)
+        else:
+            self.res['address'] = RecItem(txt, add_or.conf)
+
+        return
+
+    def stamp(self):
+        """
+        Stamp detection: stores the result of ``send_request`` on the image
+        with confidence 1.
+        """
+        self.res['stamp'] = RecItem(send_request(self.image), 1.)
+        return
+
+    @staticmethod
+    def cn_to_an(num):
+        """
+        Format an amount as "<n>万元".
+
+        ``num`` is used as an integer when it parses as one; otherwise it is
+        converted with ``cn2an``.  When neither works the raw value is kept,
+        so the method does not raise for unreadable OCR text.
+        """
+        try:
+            num = int(num)
+        except (TypeError, ValueError):
+            try:
+                # Convert "<num>万" to a number, then drop the four trailing
+                # digits to get back to units of 万.
+                num = str(cn2an.cn2an(f'{num}万'))[:-4]
+            except Exception:
+                # Best effort: keep the raw text.  This used to happen
+                # implicitly through a `return` inside `finally`, which also
+                # swallowed unrelated exceptions.
+                pass
+        return f'{num}万元'
+
+    @staticmethod
+    def to_date(txt):
+        """
+        Format a date as "YYYY年M月D日" from the three number groups in
+        ``txt``; open-ended terms map to fixed strings, anything else to ''.
+        """
+        if '长期' in txt:
+            return '长期'
+        if '永久' in txt:
+            return '永久'
+        if '不约定' in txt:
+            return '不约定期限'
+        date_in = re.findall(r"\d+", txt)
+        if len(date_in) == 3:
+            # Keep only the last four digits of the year group.
+            return f'{date_in[0][-4:]}年{date_in[1]}月{date_in[2]}日'
+        return ''
+
+    # ["social_code", "company_name", "legal_person", "registered_capital", 'type', 'start_date',
+    #  "business_scope", 'expire_date', 'address', 'stamp']
+    def parse(self):
+        """
+        Run every field parser and return ``{key: item.to_dict()}`` for all
+        keys in ``self.keys``.
+        """
+        self.social_code()
+        self.company_name()
+        self.legal_person()
+        self.registered_capital()
+        self.type()
+        self.start_date()
+        self.expire_date()
+        self.business_scope()
+        self.address()
+        self.stamp()
+        return {key: self.res[key].to_dict() for key in self.keys}

+ 0 - 2
core/square_parser.py

@@ -122,14 +122,12 @@ def get_key_other_or(res_raw_list, key_heard: OcrResult, key_title):
 
 def parser_xy(res_line, res_raw, key):
     # 在 res_line 中找到 key 对应的坐标
-    print('res_line', res_line)
     key_row = []
     for row in res_line:
         print(row[-1])
         if key in row[-1][0]:
             key_row = row
             break
-    print(key_row)
     if not bool(key_row): return
     key_heard, key_title = get_key_fist_line(key_row, key)
     return get_key_other_or(res_raw, key_heard, key_title)