# chenguilong/hr-ocr-business_license
import re
from dataclasses import dataclass
import cv2
from paddleocr import PaddleOCR


@dataclass
class BussinessParse0(object):
    """
    Extract the business scope (经营范围) and the address (住所 / 经营场所)
    from a business-licence image.

    ``ocr`` is the OCR engine; only its ``ocr(image)`` method is called. Each
    returned line is read as ``[box, (text, confidence)]``. ``box`` holds four
    ``[x, y]`` corners; the code treats corner 0 as top-left, corner 1 as
    top-right and corners 2 and 3 as the bottom edge.
    """
    ocr: PaddleOCR

    # Labels whose boxes bound the left crop (matched at the start of the text).
    _LEFT_LABEL_PREFIXES = ('法定代表', '经营者', '名', '称', '类', '型')
    # Labels whose boxes bound the right crop ('日期' may appear anywhere).
    _RIGHT_LABEL_PREFIXES = ('注册', '营业期限')
    # "经营场所" label, tolerating up to three whitespace characters between glyphs.
    _PREMISES_LABEL = re.compile(r'经[\s]{0,3}营[\s]{0,3}场[\s]{0,3}所')
    # Leading "住所" / "住" / "所" label left in front of an address.
    _ADDRESS_LABEL_PREFIX = re.compile(r'^(?:住\s*所?|所)')

    def detection(self, image, raw_results):
        """
        Locate and read the business scope and the address.

        :param image: image array indexed ``[y, x]`` with a 3-item ``shape``.
        :param raw_results: first-pass OCR items exposing ``txt``, ``lt``
            (``[x, y]`` top-left) and ``rb`` (``[x, y]`` bottom-right).
        :return: ``(scope_text, scope_conf, address_text, address_conf)``;
            a confidence is ``0.0`` when nothing was collected for that field.
        :raises IndexError: when no left-column or no right-column label is
            present in ``raw_results``.
        """
        h, w, _ = image.shape
        left_anchors = []
        right_anchors = []
        for item in raw_results:
            txt = item.txt
            if txt.startswith(self._LEFT_LABEL_PREFIXES):
                bucket = left_anchors
            elif txt.startswith(self._RIGHT_LABEL_PREFIXES) or '日期' in txt:
                bucket = right_anchors
            else:
                continue
            x0, _ = item.lt
            _, y1 = item.rb
            bucket.append((x0, y1))

        if not left_anchors or not right_anchors:
            # Same exception type an empty anchor list has always produced,
            # but with a message that says what is missing.
            raise IndexError('no layout anchor labels found in raw_results')

        # Crops start below the lowest label of each column; the left-most
        # right-column label is the vertical split between the two crops.
        t1 = max(y for _, y in left_anchors)
        t2 = max(y for _, y in right_anchors)
        l1 = min(x for x, _ in left_anchors)
        r1 = min(x for x, _ in right_anchors)

        left_img = image[int(t1): h, int(l1): int(r1)]
        right_img = image[int(t2): h, int(r1): w]
        # Treat a None result (nothing recognised) like an empty list.
        left_result = self.ocr.ocr(left_img) or []
        right_result = self.ocr.ocr(right_img) or []

        left_txt, left_conf = self._parse_scope(left_result)
        right_txt, right_conf = self._parse_address(right_result)
        return left_txt, left_conf, right_txt, right_conf

    @staticmethod
    def _bottom_mid(box):
        """Return the floor-averaged y of the two bottom corners of ``box``."""
        return (box[2][1] + box[3][1]) // 2

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when empty."""
        return sum(values) / len(values) if values else 0.0

    @classmethod
    def _gather(cls, lines, col_x, cur_bottom, x_tol, y_tol):
        """
        Chain together the lines that continue a block of text.

        A line is accepted when its left edge is within ``x_tol`` of ``col_x``,
        its top is within ``y_tol`` of the bottom of the previously accepted
        line, and it does not mention '登记机关'.

        :return: ``(joined_text, confidences)``.
        """
        texts = []
        confs = []
        for line in lines:
            box = line[0]
            text = line[1][0]
            if (abs(box[0][0] - col_x) < x_tol
                    and abs(cur_bottom - box[0][1]) < y_tol
                    and '登记机关' not in text):
                cur_bottom = cls._bottom_mid(box)
                texts.append(text)
                confs.append(line[1][1])
        return ''.join(texts), confs

    def _parse_scope(self, lines):
        """Collect the text following each '经营范围' header in ``lines``."""
        text = ''
        confs = []
        last_idx = len(lines) - 1
        for idx, res in enumerate(lines):
            # A header on the very last line has nothing after it to collect.
            if idx == last_idx or not res[1][0].startswith('经营范围'):
                continue
            box = res[0]
            top = box[0][1]
            line_h = abs(box[2][1] - top)

            # A long line sitting just above the header is taken as scope text.
            # Only a real predecessor counts: index -1 would be the last line.
            if idx > 0:
                prev = lines[idx - 1]
                if len(prev[1][0]) > 15 and abs(prev[0][2][1] - top) < line_h * 1.8:
                    text += prev[1][0]

            text += res[1][0]
            confs.append(res[1][1])

            nxt_box = lines[idx + 1][0]
            col_x = nxt_box[0][0]
            cur_bottom = self._bottom_mid(nxt_box)
            for follower in lines[idx + 1:]:
                f_box = follower[0]
                f_text = follower[1][0]
                if abs(f_box[0][0] - col_x) < 130 and abs(cur_bottom - f_box[0][1]) < line_h:
                    cur_bottom = self._bottom_mid(f_box)
                    # Drop a first character that repeats the last one collected.
                    text += f_text[1:] if f_text[:1] == text[-1] else f_text
                    confs.append(follower[1][1])
        return text.replace('经营范围', ''), self._mean(confs)

    def _parse_address(self, lines):
        """
        Collect the address that follows a '所', '住…' or '经营场所' label.

        Every matching label restarts the collection, so the text and the
        confidences returned both belong to the last label that matched.
        """
        text = ''
        confs = []
        last_idx = len(lines) - 1
        for idx, res in enumerate(lines):
            # A label on the very last line has no following line to anchor on.
            if idx == last_idx:
                continue
            box = res[0]
            label = res[1][0]
            if label.startswith('所'):
                label_only = len(label) == 1
                fixed_tol, gap_factor, width_div = 250, None, None
            elif label.startswith('住'):
                label_only = len(label) <= 2 or len(label) == 4
                fixed_tol, gap_factor, width_div = None, 1, 5
            elif self._PREMISES_LABEL.match(label):
                label_only = len(label) == 4
                fixed_tol, gap_factor, width_div = None, 2, 2
            else:
                continue

            line_h = abs(box[2][1] - box[0][1])
            if label_only:
                # Stand-alone label: the address starts at the next box, and the
                # gap between label and that box sets the horizontal tolerance.
                anchor_box = lines[idx + 1][0]
                head_txt = ''
                head_confs = []
                x_tol = fixed_tol
                if x_tol is None:
                    x_tol = abs(box[1][0] - anchor_box[0][0]) * gap_factor
            else:
                # Label and address share one box: keep the whole text and use a
                # fraction of the box width as the horizontal tolerance.
                anchor_box = box
                head_txt = label
                head_confs = [res[1][1]]
                x_tol = fixed_tol
                if x_tol is None:
                    x_tol = abs(box[1][0] - box[0][0]) // width_div

            tail_txt, tail_confs = self._gather(
                lines[idx + 1:], anchor_box[0][0], self._bottom_mid(anchor_box),
                x_tol, line_h * 1.2)
            text = head_txt + tail_txt
            confs = head_confs + tail_confs

        text = text.replace('广北京市朝阳区', '北京市朝阳区')
        text = self._PREMISES_LABEL.sub('', text)
        # Remove only the leading label; '住' / '所' inside the address stay.
        text = self._ADDRESS_LABEL_PREFIX.sub('', text, count=1)
        return text, self._mean(confs)


@dataclass
class BussinessParse1(object):
    """
    Extract the business scope (经营范围) via ``bs_detection`` and the address
    (住所 / 营业场所) via ``ad_detection`` from a business-licence image.

    ``ocr`` is the OCR engine; only its ``ocr(image)`` method is called. Each
    returned line is read as ``[box, (text, confidence)]``. ``box`` holds four
    ``[x, y]`` corners; the code treats corner 0 as top-left, corner 1 as
    top-right and corners 2 and 3 as the bottom edge.
    """
    ocr: PaddleOCR

    # "营业场所" label, tolerating up to three whitespace characters between glyphs.
    # NOTE(review): BussinessParse0 matches "经营场所" instead — confirm both
    # spellings are intended.
    _PREMISES_LABEL = re.compile(r'营[\s]{0,3}业[\s]{0,3}场[\s]{0,3}所')
    # Leading "住所" / "住" / "所" label left in front of an address.
    _ADDRESS_LABEL_PREFIX = re.compile(r'^(?:住\s*所?|所)')

    @staticmethod
    def _bottom_mid(box):
        """Return the floor-averaged y of the two bottom corners of ``box``."""
        return (box[2][1] + box[3][1]) // 2

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0.0 when empty."""
        return sum(values) / len(values) if values else 0.0

    @classmethod
    def _gather(cls, lines, col_x, cur_bottom, x_tol, y_tol):
        """
        Chain together the lines that continue a block of text.

        A line is accepted when its left edge is within ``x_tol`` of ``col_x``
        and its top is no further than ``y_tol`` from the bottom of the
        previously accepted line.

        :return: ``(joined_text, confidences)``.
        """
        texts = []
        confs = []
        for line in lines:
            box = line[0]
            if abs(box[0][0] - col_x) < x_tol and abs(cur_bottom - box[0][1]) <= y_tol:
                cur_bottom = cls._bottom_mid(box)
                texts.append(line[1][0])
                confs.append(line[1][1])
        return ''.join(texts), confs

    @staticmethod
    def _follows_long_line(lines, idx):
        """
        Tell whether the line before ``idx`` looks like the start of the address.

        True when a predecessor exists, is longer than 15 characters, ends
        within 1.8 line heights of the top of line ``idx`` and contains
        neither '有限' nor '型'.
        """
        if idx == 0:
            # No predecessor; index -1 would silently address the last line.
            return False
        box = lines[idx][0]
        prev = lines[idx - 1]
        prev_txt = prev[1][0]
        line_h = abs(box[2][1] - box[0][1])
        return (len(prev_txt) > 15
                and abs(prev[0][2][1] - box[0][1]) <= line_h * 1.8
                and '有限' not in prev_txt
                and '型' not in prev_txt)

    def bs_detection(self, image, raw_results):
        """
        Read the business scope.

        Two candidates are built and the longer one is returned: the text
        gathered directly from ``raw_results`` around a '经营范围' label, and
        the text read by re-running OCR on the region between the date labels
        and the first '登记…' label.

        :param image: image array indexed ``[y, x]`` with a 3-item ``shape``.
        :param raw_results: first-pass OCR items exposing ``txt``, ``conf``,
            ``lt`` (``[x, y]`` top-left) and ``rb`` (``[x, y]`` bottom-right).
        :return: ``(scope_text, confidence)``.
        """
        _, w, _ = image.shape
        date_anchors = []      # (left x, bottom y) of '日期' / '营业期限' labels
        registry_tops = []     # top y of '登记…' labels
        raw_txt = ''
        raw_confs = []
        simple_key = False
        for item in raw_results:
            txt = item.txt
            if '日期' in txt:
                x0, _ = item.lt
                _, y1 = item.rb
                date_anchors.append((x0, y1))
            elif txt.startswith('营业期限'):
                simple_key = True
                x0, _ = item.lt
                _, y1 = item.rb
                date_anchors.append((x0, y1))
            elif txt.startswith('登记'):
                _, y0 = item.lt
                registry_tops.append(y0)
            elif txt.startswith('经营范围'):
                x0, y0 = item.lt
                x1, _ = item.rb
                # Scheme one: take every first-pass item that starts within one
                # label width of the label's right edge and ends below its top.
                for other in raw_results:
                    ox, _ = other.lt
                    _, oy = other.rb
                    if abs(ox - x1) <= abs(x1 - x0) and oy >= y0 and '登记' not in other.txt:
                        raw_txt += other.txt
                        raw_confs.append(other.conf)

        down_txt = ''
        down_confs = []
        if date_anchors and registry_tops:
            t2 = max(y for _, y in date_anchors)
            l2 = min(x for x, _ in date_anchors)
            d2 = int(registry_tops[0])
            down_img = image[int(t2): d2, int(l2): w]
            _, w1, _ = down_img.shape
            # Treat a None result (nothing recognised) like an empty list.
            down_result = self.ocr.ocr(down_img) or []
            column_limit = 7 * w1 // 24
            if simple_key:
                # Scheme two ('营业期限' was found): keep every line that starts
                # in the left 7/24 of the crop.
                for res in down_result:
                    if res[0][0][0] < column_limit:
                        down_txt += res[1][0]
                        down_confs.append(res[1][1])
            else:
                # Scheme three.
                for res in down_result:
                    if not res[1][0].startswith('经营范围'):
                        continue
                    header_top = res[0][0][1]
                    # NOTE(review): this appends the header line (`res`) once
                    # per qualifying line rather than the qualifying line
                    # itself, and the qualifying lines are those ending above
                    # the header. Kept as is; confirm the intended behaviour.
                    for other in down_result:
                        if other[0][2][1] < header_top and other[0][0][0] < column_limit:
                            down_txt += res[1][0]
                            down_confs.append(res[1][1])

        down_txt = down_txt.replace('经营范围', '')
        raw_txt = raw_txt.replace('经营范围', '')
        down_conf = self._mean(down_confs)
        if len(raw_txt) > len(down_txt):
            down_txt = raw_txt
            down_conf = self._mean(raw_confs)
        return down_txt, down_conf

    def ad_detection(self, image, raw_results):
        """
        Read the address.

        OCR is re-run on the region below the name / type labels and above the
        first of the legal-representative, registered-capital, date or
        business-term labels; the address is then collected from the lines
        following a '所', '住…' or '营业场所' label.

        :param image: image array indexed ``[y, x]`` with a 3-item ``shape``.
        :param raw_results: first-pass OCR items exposing ``txt``, ``lt``
            (``[x, y]`` top-left) and ``rb`` (``[x, y]`` bottom-right).
        :return: ``(address_text, confidence)``; the confidence is ``0.0``
            when no line confidence was collected.
        :raises IndexError: when no upper label (名 / 称 / 类 / 型) or no lower
            label is present in ``raw_results``.
        """
        _, w, _ = image.shape
        upper_anchors = []     # (left x, bottom y) of name / type labels
        lower_tops = []        # top y of the labels that end the region
        type_key = False
        for item in raw_results:
            txt = item.txt
            if txt.startswith(('法定代表', '负责人')):
                _, y0 = item.lt
                lower_tops.append(y0)
            elif txt.startswith(('名', '称')):
                x0, _ = item.lt
                _, y1 = item.rb
                upper_anchors.append((x0, y1))
            elif txt.startswith(('类', '型')):
                type_key = True
                x0, _ = item.lt
                _, y1 = item.rb
                upper_anchors.append((x0, y1))
            elif txt.startswith('注册资本') or '日期' in txt or txt.startswith('营业期限'):
                _, y0 = item.lt
                lower_tops.append(y0)

        if not upper_anchors or not lower_tops:
            # Same exception type an empty anchor list has always produced,
            # but with a message that says what is missing.
            raise IndexError('no layout anchor labels found in raw_results')

        t1 = max(y for _, y in upper_anchors) * 0.99
        l1 = min(x for x, _ in upper_anchors)
        d1 = min(lower_tops)

        top_img = image[int(t1): int(d1), int(l1): w]
        # Treat a None result (nothing recognised) like an empty list.
        top_result = self.ocr.ocr(top_img) or []

        top_txt = ''
        top_confs = []
        last_key = ''
        last_idx = len(top_result) - 1
        # Every matching label restarts the collection, so the text and the
        # confidences kept both belong to the last label that matched.
        for idx, res in enumerate(top_result):
            label = res[1][0]
            if label.startswith('所'):
                # A long line remembered from an earlier '住' label goes first.
                pieces = [last_key] if last_key else []
                label_only = len(label) == 1
                fixed_tol, gap_factor, width_div = 250, None, None
            elif label.startswith('住'):
                pieces = []
                if self._follows_long_line(top_result, idx):
                    last_key = top_result[idx - 1][1][0]
                label_only = len(label) <= 2 or len(label) == 4
                fixed_tol, gap_factor, width_div = None, 1, 5
            elif self._PREMISES_LABEL.match(label):
                pieces = []
                if self._follows_long_line(top_result, idx):
                    pieces.append(top_result[idx - 1][1][0])
                label_only = len(label) == 4
                fixed_tol, gap_factor, width_div = None, 2, 2
            else:
                continue

            box = res[0]
            line_h = abs(box[2][1] - box[0][1])
            confs = []
            if label_only and idx != last_idx:
                # Stand-alone label: the address starts at the next box, and the
                # gap between label and that box sets the horizontal tolerance.
                anchor_box = top_result[idx + 1][0]
                x_tol = fixed_tol
                if x_tol is None:
                    x_tol = abs(box[1][0] - anchor_box[0][0]) * gap_factor
            else:
                # Label and address share one box (or nothing follows): keep the
                # whole text and use a fraction of the box width as tolerance.
                anchor_box = box
                x_tol = fixed_tol
                if x_tol is None:
                    x_tol = abs(box[1][0] - box[0][0]) // width_div
                pieces.append(label)
                confs.append(res[1][1])

            tail_txt, tail_confs = self._gather(
                top_result[idx + 1:], anchor_box[0][0], self._bottom_mid(anchor_box),
                x_tol, line_h * 1.2)
            top_txt = ''.join(pieces) + tail_txt
            top_confs = confs + tail_confs

        if not top_txt and type_key:
            # No label produced text: fall back to everything in the region.
            top_txt = ''.join(line[1][0] for line in top_result)
        top_txt = self._PREMISES_LABEL.sub('', top_txt)
        # Remove only the leading label; '住' / '所' inside the address stay.
        top_txt = self._ADDRESS_LABEL_PREFIX.sub('', top_txt, count=1)
        return top_txt, self._mean(top_confs)