123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- from typing import List
- import difflib
- import numpy as np
- from dataclasses import dataclass
- from blfe_core.line_parser import OcrResult
def get_key_fist_line(res_line_list, key):
    """Find the first value line and the title box for ``key`` within one row.

    The row is indexed as follows: the last element holds the row's full
    text at index 0, and every preceding element is an OCR result exposing
    ``txt`` and ``box``.

    Args:
        res_line_list: The row to search, laid out as described above.
        key: Field title to look for, e.g. '经营范围'.

    Returns:
        A ``(first_line, title)`` tuple. ``first_line`` is the OCR result
        whose text is most similar to the text that follows ``key`` in the
        row's full text; its ``txt`` is rewritten in place to keep only what
        comes after the last occurrence of ``key``. ``title`` is the OCR
        result chosen as the title box, or ``first_line`` when no title was
        found.

    Raises:
        ValueError: If no OCR result in the row has a non-zero similarity to
            the value text, so there is nothing to return as ``first_line``.
    """

    def string_similar(s1, s2):
        # quick_ratio() is an upper bound on ratio(); it only compares
        # character counts, not their order.
        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()

    row_text = res_line_list[-1][0]
    candidates = res_line_list[:-1]

    # Text expected to follow the key. Original author's note: needs rework.
    if key == '经营范围':
        print(row_text)
        key_str = row_text.split('市')[0].split('住所')[0].split('经营范围')[-1]
        print('key_str', key_str)
    else:
        key_str = row_text.split(key)[-1]

    # --- title lookup ---
    key_title = False
    key_title_list = []
    for r in candidates:
        score = string_similar(r.txt, key)
        if score > 0.7:
            if len(r.txt) > len(key_str) + 2:
                # The OCR box holds the title plus extra text: shrink its
                # right edge proportionally to the title's share of the
                # characters.
                # NOTE(review): this edits r.box in place, so the source OCR
                # result is narrowed as well — confirm that is intended.
                box = r.box
                raw_w = box[1][0] - box[0][0]
                ratio = len(key) / len(r.txt)
                title_w = raw_w * ratio
                box[1][0] = box[0][0] + title_w
                box[2][0] = box[0][0] + title_w
                # NOTE(review): the third argument is r.txt here; confirm
                # against OcrResult's constructor signature.
                key_title = OcrResult(np.array(box), key, r.txt)
            else:
                key_title = r
            break
        if score > 0.5 and len(r.txt) == 1:
            # Single-character fragment that resembles the key.
            key_title_list.append(r)
    if key_title_list:
        # The last single-character fragment takes precedence as the title.
        key_title = key_title_list[-1]

    # Special case: the row starts with a stand-alone '经营范围' title box, so
    # the value is simply the next element.
    first = res_line_list[0]
    if key == '经营范围' and isinstance(first, OcrResult) and first.txt == '经营范围':
        return res_line_list[1], key_title or res_line_list[1]

    # Pick the OCR result that best matches the expected value text; the
    # first one wins on ties and a zero score never qualifies.
    max_num = 0
    max_or = None
    for candidate in candidates:
        score = string_similar(key_str, candidate.txt)
        if score > max_num:
            max_num = score
            max_or = candidate

    if max_or is None:
        raise ValueError(
            f"no OCR line in the row matches the text following {key!r}"
        )

    # Drop the title (and anything before it) from the matched line's text.
    max_or.txt = max_or.txt.split(key)[-1]
    return max_or, key_title if key_title else max_or
def get_key_other_or(res_raw_list, key_heard: OcrResult, key_title):
    """Collect the OCR boxes that continue ``key_heard`` and merge them.

    Starting with ``key_heard`` as the anchor, the raw OCR results are
    scanned for boxes that sit next to the current anchor (see the two
    conditions in the main loop). Boxes accepted in one pass are merged, the
    merged box becomes the next anchor, and finally everything collected is
    merged into a single result.

    Args:
        res_raw_list: Sliceable sequence of OCR results; each is read through
            ``txt``, ``box``, ``conf``, ``lt``, ``rb``, ``wh`` and ``center``.
        key_heard: First line of the value; seeds the anchor and the result.
        key_title: Title box; only its ``txt`` is read.

    Returns:
        One ``OcrResult`` whose text is the concatenation of ``key_heard``
        and every merged group, in the order they were collected.
    """

    def h_range():
        # Vertical tolerance derived from the mean height of keys_list.
        mean_h = np.mean([k.wh[1] for k in keys_list])
        return (mean_h * 0, mean_h * 1.1)

    def is_title(r: OcrResult):
        # Probe a point at r's left edge shifted by left_len (zero for any
        # finite height, since the lower bound is mean_h * 0).
        # NOTE(review): the probe's y is (top + height) / 2, not
        # top + height / 2 — confirm this is the intended vertical position.
        left_len = h_low * 2
        r_point = [r.lt[0] - left_len, (r.lt[1] + r.wh[1]) / 2]
        title_list = [
            res for res in res_raw_list
            if res.lt[0] < r_point[0] < res.rb[0]
            and res.lt[1] < r_point[1] < res.rb[1]
        ]
        # Accept r when the probe hits no box at all, or hits a box whose
        # text equals the title's text.
        if not title_list:
            return True
        return any(t.txt == key_title.txt for t in title_list)

    def merge_box(boxes: List[OcrResult]):
        # Concatenate texts in order and take the axis-aligned bounding box
        # of all corners. conf is a running pairwise mean, so later boxes
        # weigh more than earlier ones.
        txt = boxes[0].txt
        box = boxes[0].box
        conf = boxes[0].conf
        for l_b in boxes[1:]:
            txt = txt + l_b.txt
            l, t = np.min(np.min([box, l_b.box], 0), 0)
            r, b = np.max(np.max([box, l_b.box], 0), 0)
            box = np.array([[l, t], [r, t], [r, b], [l, b]])
            conf = np.mean([conf, l_b.conf])
        return OcrResult(box, txt, conf)

    keys_list = [key_heard]
    x_line_list = [key_heard]
    anchor_key: OcrResult = key_heard

    # keys_list never changes after this point, so the tolerance is constant;
    # compute it once instead of on every comparison.
    h_low, h_high = h_range()

    for start in range(len(res_raw_list)):
        cell_x_line = []
        for cell in res_raw_list[start:]:
            dy = cell.center[1] - anchor_key.center[1]
            # Either: the cell's centre y exceeds the anchor's by less than
            # the tolerance and its centre x lies inside the anchor's span;
            # or: it is within the tolerance vertically and starts within
            # three tolerances to the right of the anchor's right edge.
            if (h_low < dy < h_high
                    and anchor_key.lt[0] < cell.center[0] < anchor_key.rb[0]) or \
                    (abs(dy) < h_high
                     and 0 < cell.lt[0] - anchor_key.rb[0] < h_high * 3):
                if is_title(cell):
                    cell_x_line.append(cell)
        # Merge the cells collected in this pass and move the anchor to them.
        if cell_x_line:
            merged = merge_box(cell_x_line)
            x_line_list.append(merged)
            anchor_key = merged

    return merge_box(x_line_list)
def parser_xy(res_line, res_raw, key):
    """Locate ``key`` in the line-grouped OCR output and extract its value box.

    Args:
        res_line: Iterable of rows; the last element of each row holds the
            row's full text at index 0.
        res_raw: Raw OCR results, passed through to ``get_key_other_or``.
        key: Field title to search for.

    Returns:
        The result of ``get_key_other_or`` for the first row whose full text
        contains ``key``, or ``None`` when no row matches.
    """
    # Find the first row whose full text mentions the key.
    key_row = []
    for row in res_line:
        if len(row) == 0:
            # An empty row has no text element to inspect; skip it instead
            # of failing on row[-1].
            continue
        row_summary = row[-1]
        print(row_summary)
        if key in row_summary[0]:
            key_row = row
            break

    if not key_row:
        return None

    key_heard, key_title = get_key_fist_line(key_row, key)
    return get_key_other_or(res_raw, key_heard, key_title)
|