# Determine the first line of the value that belongs to ``key``.
def get_key_fist_line(res_line_list, key):
    """Find the first value line for ``key`` and the line acting as its title.

    Args:
        res_line_list: Sequence whose last element is indexable and holds the
            reference text (a ``str``) at index 0. Every preceding element is
            an OCR line exposing ``txt`` and ``box`` attributes.
        key: Field label to look for, e.g. ``'经营范围'``.

    Returns:
        A ``(head, title)`` tuple.

        ``head`` is the OCR line whose ``txt`` has the highest
        ``SequenceMatcher.quick_ratio`` against the value text extracted from
        the reference text. Its ``txt`` is rewritten in place to the part
        following the last occurrence of ``key``.

        ``title`` is the line recognised as the label of ``key``; when no such
        line is found, ``head`` is returned in its place.

        Special case: when ``key`` is ``'经营范围'`` and the first element is an
        ``OcrResult`` whose text is exactly ``'经营范围'``, the second element is
        returned as ``head`` without any text rewriting.

    Raises:
        ValueError: If no OCR line shares any character with the value text,
            so no head line can be chosen.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    import copy

    def string_similar(s1, s2):
        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()

    reference_text = res_line_list[-1][0]

    # TODO: the splitting rules for '经营范围' are marked as needing rework.
    if key == '经营范围':
        # NOTE: debug output kept from the original implementation.
        print(reference_text)
        key_str = reference_text.split('市')[0].split('住所')[0].split('经营范围')[-1]
        print('key_str', key_str)
    else:
        key_str = reference_text.split(key)[-1]

    candidates = res_line_list[:-1]

    # --- title detection ---------------------------------------------------
    key_title = None
    key_title_list = []
    for r in candidates:
        score = string_similar(r.txt, key)
        if score > 0.7:
            if len(r.txt) > len(key_str) + 2:
                # The line holds more than the label: keep only the leading
                # share of its width proportional to len(key) / len(txt).
                # Work on a copy so the original line's box is left untouched.
                box = copy.deepcopy(r.box)
                raw_w = box[1][0] - box[0][0]
                ratio = len(key) / len(r.txt)
                title_w = raw_w * ratio
                # Corners 1 and 2 get their x moved to the trimmed width.
                box[1][0] = box[0][0] + title_w
                box[2][0] = box[0][0] + title_w
                # NOTE(review): the third argument receives r.txt, while
                # merge_box passes a confidence there - confirm intent.
                key_title = OcrResult(np.array(box), key, r.txt)
            else:
                key_title = r
            break
        elif score > 0.5 and len(r.txt) == 1:
            # Single characters that partially match the label.
            key_title_list.append(r)

    # NOTE(review): a single-character candidate, when present, replaces a
    # full match found above - confirm this precedence is intended.
    if key_title_list:
        key_title = key_title_list[-1]

    # --- special handling --------------------------------------------------
    if (key == '经营范围'
            and isinstance(res_line_list[0], OcrResult)
            and res_line_list[0].txt == '经营范围'):
        return res_line_list[1], key_title or res_line_list[1]

    # --- best matching value line ------------------------------------------
    max_num = 0
    max_or = None
    for candidate in candidates:
        m_num = string_similar(key_str, candidate.txt)
        # Strict comparison: the first line with the highest score wins and
        # lines with a score of zero are never selected.
        if m_num > max_num:
            max_num = m_num
            max_or = candidate

    if max_or is None:
        raise ValueError(
            f"no OCR line resembles the value text {key_str!r} of key {key!r}"
        )

    # Drop the label (and anything before it) from the chosen line's text.
    max_or.txt = max_or.txt.split(key)[-1]
    return max_or, key_title or max_or
title_list: if t.txt == key_title.txt: return True return False def merge_box(boxes: List[OcrResult]): txt = boxes[0].txt box = boxes[0].box conf = boxes[0].conf for l_b in boxes[1:]: txt = txt + l_b.txt l, t = np.min(np.min([box, l_b.box], 0), 0) r, b = np.max(np.max([box, l_b.box], 0), 0) box = np.array([[l, t], [r, t], [r, b], [l, b]]) conf = np.mean([conf, l_b.conf]) return OcrResult(box, txt, conf) keys_list = [key_heard] x_line_list = [key_heard] anchor_key: OcrResult = key_heard for cell_y_k, cell_y_v in enumerate(res_raw_list): cell_x_line = [] for cell_x_k, cell_x_v in enumerate(res_raw_list[cell_y_k:]): # cell 0