convert_MD.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. import copy
  2. import re
  3. from itertools import chain
  4. from pathlib import Path
  5. import numpy as np
  6. import pandas as pd
  7. import json
  8. from mdutils.mdutils import MdUtils
  9. import requests
  10. from YQ_OCR.config import keyDict
# Base address of the service and the path of the endpoint that images are POSTed to.
url = 'http://192.168.199.107:18087'
url_path = '/ocr_system/identify'
# Directory that is searched recursively for the test images.
imgs_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/YQ_OCR/img'
# Workflow:
# 1. xlsx -> ground-truth JSON file (with the manufacturer information written in)
# 2. Send the image (together with the ground-truth JSON)
# 3. Compare the returned JSON with the ground-truth JSON
#    (keyed fields: compare the returned result against the ground truth;
#     keyless fields: compare the ground truth against the returned result)
  17. # 编辑距离
  18. def Levenshtein_Distance(str1, str2):
  19. matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
  20. for i in range(1, len(str1) + 1):
  21. for j in range(1, len(str2) + 1):
  22. d = 0 if (str1[i - 1] == str2[j - 1]) else 1
  23. matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d)
  24. return matrix[len(str1)][len(str2)]
  25. # 发送请求 带正确答案参数
  26. def send_request(img_path: Path, img_json: str):
  27. file = {'file': (img_path.name, open(img_path, 'rb'), img_path)}
  28. payload = {'docDataStr': img_json}
  29. r = requests.post(url + url_path, files=file, data=payload)
  30. return r.json()
  31. # 处理返回结果
  32. def _parse_result(r): # sourcery skip: dict-comprehension
  33. if r['status'] == '000':
  34. result = r['result']
  35. res = {}
  36. for field in keyDict:
  37. if field in result:
  38. res[field] = result[field]
  39. res['noKeyList'] = result['noKeyList']
  40. res['logoList'] = result['logoList']
  41. logoFileName = [log['logoFileName'] for log in res['logoList']]
  42. res['logoList'] = logoFileName
  43. return res
  44. elif r['status'] == '101':
  45. return "101"
  46. # 比较两个json文件 并在md文件中写入对比结果
  47. def evaluate_one(xlsx_dict, res_dict):
  48. true_num = 0
  49. xlsx_dict_no_space = copy.deepcopy(xlsx_dict)
  50. for index, text in enumerate(xlsx_dict_no_space):
  51. xlsx_dict_no_space[index] = text.replace(' ', '')
  52. for key_yes in res_dict:
  53. if type(res_dict[key_yes]) is str:
  54. if Levenshtein_Distance(res_dict[key_yes], xlsx_dict_no_space[key_yes]) == 0:
  55. table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
  56. true_num += 1
  57. else:
  58. table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
  59. key_no_dict = {}
  60. for key_no_xlsx in xlsx_dict_no_space['noKeyList']:
  61. key_no_dict[key_no_xlsx] = []
  62. for key_no_res in res_dict['noKeyList']:
  63. key_no_dict[key_no_xlsx].append((Levenshtein_Distance(key_no_xlsx, key_no_res), key_no_res))
  64. sort_NoKey = sorted(key_no_dict[key_no_xlsx], key=lambda x: x[0])
  65. NoKey_min_distance = sort_NoKey[0][0]
  66. if NoKey_min_distance == 0:
  67. table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
  68. true_num += 1
  69. else:
  70. table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
  71. rate = true_num / (len(table_result) / 4)
  72. all_rate.append(rate)
  73. statistics = f'共{len(table_result) // 4}个字段,正确{true_num}个,错误{len(table_result) // 4 - true_num}个'
  74. return "{:.2f}%".format(rate * 100), statistics
  75. # 打开正确的json文件
  76. def open_true_json(j_path):
  77. with j_path.open('r') as f:
  78. j_dict = json.load(f)
  79. j_json_str = json.dumps(j_dict, ensure_ascii=False)
  80. return j_dict, j_json_str
  81. if __name__ == '__main__':
  82. img_paths = chain(*[Path(imgs_path).rglob(f'*.{ext}') for ext in ['jpg', 'png', 'jpeg', 'PNG', 'JPG', 'JPEG']])
  83. all_rate = []
  84. for img_path in img_paths:
  85. print(img_path)
  86. # json result
  87. true_d, true_json = open_true_json(img_path.with_suffix('.json'))
  88. result = send_request(img_path, true_json)
  89. res_d = _parse_result(result)
  90. # md
  91. md_file_path = img_path.parent / (img_path.with_suffix('.md'))
  92. MD = MdUtils(file_name=str(md_file_path))
  93. table_result = ['key值', '正确答案', 'ocr返回答案', '是否正确']
  94. rate, statistics = evaluate_one(true_d, res_d)
  95. MD.new_header(level=1, title='测试结果')
  96. MD.new_header(level=2, title=f'正确率:{rate}')
  97. MD.new_header(level=3, title=statistics)
  98. print(f'正确率:{rate}')
  99. MD.new_table(columns=4, rows=len(table_result) // 4, text=table_result, text_align='center')
  100. MD.create_md_file()
  101. print('-------------------------------')
  102. all_rate = "{:.2f}%".format(np.mean(all_rate) * 100)
  103. print(f'总体正确率:{all_rate}')