import copy
import json
import re
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from mdutils.mdutils import MdUtils

from YQ_OCR.config import keyDict

url = 'http://192.168.199.107:18087'
url_path = '/ocr_system/identify'
imgs_path = '/Users/sxkj/to_md/YQ_OCR/img'

# Workflow (as described by the original author):
# 1. xlsx -> ground-truth JSON file (with the manufacturer information written in)
# 2. Send the image (together with the ground-truth JSON)
# 3. Compare the returned JSON with the ground-truth JSON
#    (keyed fields: compare the returned result against the ground truth;
#     key-less entries: compare the ground truth against the returned result)


def Levenshtein_Distance(str1, str2):
    """Return the edit distance (insert / delete / substitute, cost 1 each).

    Only the previous row of the dynamic-programming table is kept, so the
    extra memory is proportional to ``len(str2)`` instead of the full matrix.
    """
    previous = list(range(len(str2) + 1))
    for i, ch1 in enumerate(str1, start=1):
        current = [i]
        for j, ch2 in enumerate(str2, start=1):
            substitution = previous[j - 1] + (0 if ch1 == ch2 else 1)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def send_request(img_path: Path, img_json: str):
    """POST the image and its ground-truth JSON string to the OCR service.

    Args:
        img_path: Path of the image file to upload.
        img_json: Ground-truth JSON, sent as the ``docDataStr`` form field.

    Returns:
        The decoded JSON body of the response.
    """
    payload = {'docDataStr': img_json}
    # Open the file in a context manager so the handle is closed once the
    # request has been sent (the original leaked it).
    with open(img_path, 'rb') as image_file:
        # NOTE(review): requests documents the third tuple element as the
        # part's content type, yet the image path is passed here. It is kept
        # unchanged so the request seen by the server stays the same --
        # confirm whether a MIME type was intended.
        files = {'file': (img_path.name, image_file, img_path)}
        response = requests.post(url + url_path, files=files, data=payload)
    return response.json()


def _parse_result(r):
    """Extract the fields of interest from the OCR service response.

    Returns:
        * status ``'000'``: a dict holding every field of ``keyDict`` present
          in ``r['result']``, plus ``'noKeyList'`` and ``'logoList'`` (the
          latter reduced to the list of ``logoFileName`` values);
        * status ``'101'``: the string ``"101"``;
        * any other status: ``None``.
    """
    status = r['status']
    if status == '000':
        result = r['result']
        res = {field: result[field] for field in keyDict if field in result}
        res['noKeyList'] = result['noKeyList']
        res['logoList'] = [logo['logoFileName'] for logo in result['logoList']]
        return res
    if status == '101':
        return "101"
    return None


def _strip_spaces(value):
    """Return *value* without ASCII spaces if it is a string, else unchanged."""
    return value.replace(' ', '') if isinstance(value, str) else value


def evaluate_one(xlsx_dict, res_dict):
    """Compare one ground-truth dict with one parsed OCR result.

    Side effects: appends one 4-cell row per compared field to the module
    global ``table_result`` and appends this image's accuracy to the module
    global ``all_rate``. Both globals must exist before the call; the script
    section at the bottom of this file creates them.

    Spaces are removed from the ground-truth values before comparing; the OCR
    values are compared as returned. The table shows the original
    (un-stripped) ground-truth text.

    Args:
        xlsx_dict: Ground-truth data; ``'noKeyList'`` holds the key-less texts.
        res_dict: Dict produced by ``_parse_result``.

    Returns:
        ``(rate, statistics)``: the accuracy formatted as ``"xx.xx%"`` and a
        summary sentence with the total / correct / wrong counts.
    """
    true_num = 0
    all_num = 0

    # Keyed fields: every string field returned by the OCR is checked against
    # the ground truth. A field missing from the ground truth is compared
    # with '' instead of raising KeyError.
    for key_yes, ocr_value in res_dict.items():
        if not isinstance(ocr_value, str):
            continue
        expected = xlsx_dict.get(key_yes, '')
        # An edit distance of 0 is plain equality, so compare directly.
        matched = ocr_value == _strip_spaces(expected)
        if matched:
            true_num += 1
        all_num += 1
        table_result.extend([key_yes, expected, ocr_value, '✅' if matched else '❌'])

    # Key-less entries: for each ground-truth text, find the closest OCR text
    # (first one wins on ties) and accept it only on an exact match.
    ocr_no_key = res_dict.get('noKeyList') or []
    for key_no_xlsx in xlsx_dict.get('noKeyList') or []:
        target = _strip_spaces(key_no_xlsx)
        if ocr_no_key:
            distance, closest = min(
                ((Levenshtein_Distance(target, candidate), candidate)
                 for candidate in ocr_no_key),
                key=lambda pair: pair[0],
            )
        else:
            # The OCR returned no key-less text at all: record a miss.
            distance, closest = None, ''
        matched = distance == 0
        if matched:
            true_num += 1
        all_num += 1
        table_result.extend(['无key值', key_no_xlsx, closest, '✅' if matched else '❌'])

    # Accuracy; with nothing to compare the rate is reported as 0.
    rate = true_num / all_num if all_num else 0.0
    all_rate.append(rate)
    statistics = f'共{all_num}个字段,正确{true_num}个,错误{all_num - true_num}个'
    return "{:.2f}%".format(rate * 100), statistics


def open_true_json(j_path):
    """Load the ground-truth JSON file.

    Returns:
        ``(data, text)``: the parsed object and the same object re-serialised
        with ``ensure_ascii=False``.
    """
    # Read explicitly as UTF-8 rather than relying on the platform default.
    with j_path.open('r', encoding='utf-8') as f:
        j_dict = json.load(f)
    j_json_str = json.dumps(j_dict, ensure_ascii=False)
    return j_dict, j_json_str


if __name__ == '__main__':
    # NOTE: ``table_result`` and ``all_rate`` must stay module-level names
    # because evaluate_one() reads and mutates them as globals.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    img_paths = sorted(
        path for path in Path(imgs_path).rglob('*')
        if path.is_file() and path.suffix.lower() in image_suffixes
    )
    all_rate = []
    for img_path in img_paths:
        print(img_path)

        # Ground-truth JSON sits next to the image, same stem.
        json_path = img_path.with_suffix('.json')
        if not json_path.is_file():
            print(f'skipped: ground-truth file not found: {json_path}')
            print('-------------------------------')
            continue
        true_d, true_json = open_true_json(json_path)

        # OCR result
        result = send_request(img_path, true_json)
        res_d = _parse_result(result)
        if not isinstance(res_d, dict):
            # "101" or None: there is nothing to evaluate for this image.
            print(f"skipped: OCR service returned status {result['status']!r}")
            print('-------------------------------')
            continue

        # Markdown report, written next to the image.
        MD = MdUtils(file_name=str(img_path.with_suffix('.md')))
        table_result = ['key值', '正确答案', 'ocr返回结果', '是否正确']
        rate, statistics = evaluate_one(true_d, res_d)
        MD.new_header(level=1, title='测试结果')
        MD.new_header(level=2, title=f'正确率:{rate}')
        MD.new_header(level=3, title=statistics)
        print(f'正确率:{rate}')
        MD.new_table(columns=4, rows=len(table_result) // 4, text=table_result, text_align='center')
        MD.create_md_file()
        print('-------------------------------')

    if all_rate:
        overall_rate = "{:.2f}%".format(np.mean(all_rate) * 100)
        print(f'总体正确率:{overall_rate}')
    else:
        print('no images were evaluated')