xujiayue пре 2 година
родитељ
комит
c7b839ec12

+ 1 - 1
HR_OCR/TestAllOcr/config.py

@@ -3,7 +3,7 @@ Author: zeke-chin zeke-chin@icloud.com
 Date: 2022-09-26 14:58:10
 LastEditors: zeke-chin zeke-chin@icloud.com
 LastEditTime: 2022-09-30 09:59:43
-FilePath: /to_md/HR_OCR/TestAllOcr/config.py
+FilePath: /utils/HR_OCR/TestAllOcr/config.py
 Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 '''
 import base64

+ 1 - 1
HR_OCR/TestAllOcr/test_interface.py

@@ -3,7 +3,7 @@ Author: zeke-chin zeke-chin@icloud.com
 Date: 2022-09-28 20:28:41
 LastEditors: zeke-chin zeke-chin@icloud.com
 LastEditTime: 2022-09-30 10:06:35
-FilePath: /to_md/HR_OCR/TestAllOcr/test_interface.py
+FilePath: /utils/HR_OCR/TestAllOcr/test_interface.py
 Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 '''
 from pathlib import Path

+ 1 - 1
HR_OCR/test_script/to_md/README.md

@@ -39,7 +39,7 @@
   filed = 'regbook'
   
   # 若md_path为None 则默认使用图片父路径为markdown保存路径
-  # md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+  # md_path = '/Users/zeke/work/sx/OCR/HROCR/utils/example' or image_path.parent
   md_path = None or image_path.parent
   ```
 

+ 2 - 2
HR_OCR/test_script/to_md/use.py

@@ -3,7 +3,7 @@ Author: zeke-chin zeke-chin@icloud.com
 Date: 2022-09-28 20:28:41
 LastEditors: zeke-chin zeke-chin@icloud.com
 LastEditTime: 2022-09-30 15:08:48
-FilePath: /to_md/HR_OCR/to_md/use.py
+FilePath: /utils/HR_OCR/utils/use.py
 Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 '''
 
@@ -24,7 +24,7 @@ md_name = 'CET'
 filed = 'cet'
 
 # 若md_path为None 则默认使用图片父路径为markdown保存路径
-# md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+# md_path = '/Users/zeke/work/sx/OCR/HROCR/utils/example' or image_path.parent
 md_path = None or image_path.parent
 
 md_file = parser_path(Path(md_path) / Path(md_name + image_path.stem), image_rotate)

+ 2 - 2
HR_OCR/test_script/tools/README.md

@@ -6,7 +6,7 @@
 # 项目url
 url = 'http://192.168.199.27:18040'
 # 目标文件夹
-imgs_path = './HR_OCR/to_md/example/img'
+imgs_path = './HR_OCR/utils/example/img'
 
 def send_request(img_path, image_type = 0):
     with open(img_path, 'rb') as f:
@@ -29,6 +29,6 @@ def send_request(img_path, image_type = 0):
 
 ```python
 # 需要格式化的目的文件夹路径
-target_path = './HR_OCR/to_md/example/img'
+target_path = './HR_OCR/utils/example/img'
 ```
 

+ 1 - 1
HR_OCR/test_script/tools/convert_json.py

@@ -6,7 +6,7 @@ import base64
 from itertools import chain
 
 url = 'http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr'
-imgs_path = './HR_OCR/to_md/example/img'
+imgs_path = './HR_OCR/utils/example/img'
 
 def send_request(img_path, image_type = 0):
     with open(img_path, 'rb') as f:

+ 1 - 1
HR_OCR/test_script/tools/suffix.py

@@ -4,7 +4,7 @@ from itertools import chain
 import sys
 
 # conf
-target_path = './HR_OCR/to_md/example/img'
+target_path = './HR_OCR/utils/example/img'
 #suffix = sys.argv[2]
 suffix = 'jpg' if len(sys.argv) != 3 else sys.argv[2]
 

+ 1 - 1
HR_OCR/to_md/README.md

@@ -39,7 +39,7 @@
   filed = 'regbook'
   
   # 若md_path为None 则默认使用图片父路径为markdown保存路径
-  # md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+  # md_path = '/Users/zeke/work/sx/OCR/HROCR/utils/example' or image_path.parent
   md_path = None or image_path.parent
   ```
 

+ 2 - 2
HR_OCR/to_md/use.py

@@ -3,7 +3,7 @@ Author: zeke-chin zeke-chin@icloud.com
 Date: 2022-09-28 20:28:41
 LastEditors: zeke-chin zeke-chin@icloud.com
 LastEditTime: 2022-09-30 15:08:48
-FilePath: /to_md/HR_OCR/to_md/use.py
+FilePath: /utils/HR_OCR/utils/use.py
 Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 '''
 
@@ -26,7 +26,7 @@ md_name = 'CET'
 filed = 'cet'
 
 # 若md_path为None 则默认使用图片父路径为markdown保存路径
-# md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+# md_path = '/Users/zeke/work/sx/OCR/HROCR/utils/example' or image_path.parent
 md_path = None or image_path.parent
 
 md_file = parser_path(Path(md_path) / Path(md_name + image_path.stem), image_rotate)

+ 2 - 2
HR_OCR/tools/README.md

@@ -6,7 +6,7 @@
 # 项目url
 url = 'http://192.168.199.27:18040'
 # 目标文件夹
-imgs_path = './HR_OCR/to_md/example/img'
+imgs_path = './HR_OCR/utils/example/img'
 
 def send_request(img_path, image_type = 0):
     with open(img_path, 'rb') as f:
@@ -29,6 +29,6 @@ def send_request(img_path, image_type = 0):
 
 ```python
 # 需要格式化的目的文件夹路径
-target_path = './HR_OCR/to_md/example/img'
+target_path = './HR_OCR/utils/example/img'
 ```
 

+ 1 - 1
HR_OCR/tools/convert_json.py

@@ -6,7 +6,7 @@ import base64
 from itertools import chain
 
 url = 'http://192.168.199.27:18060'
-imgs_path = '/Users/sxkj/to_md/img'
+imgs_path = '/Users/sxkj/utils/img'
 
 def send_request(img_path):
     with open(img_path, 'rb') as f:

+ 2 - 2
HR_OCR/tools/suffix.py

@@ -5,9 +5,9 @@ import sys
 
 # conf
 <<<<<<< HEAD
-target_path = '/Users/sxkj/to_md/9.29/1'
+target_path = '/Users/sxkj/utils/9.29/1'
 =======
-target_path = './HR_OCR/to_md/example/img'
+target_path = './HR_OCR/utils/example/img'
 >>>>>>> a0b1c051142a2687d8ae2c63f543f021d3239f1f
 #suffix = sys.argv[2]
 suffix = 'jpg' if len(sys.argv) != 3 else sys.argv[2]

+ 0 - 0
YQ_OCR/__init__.py


+ 5 - 0
YQ_OCR/configs/__init__.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Author : xujiayue
+# @File   : __init__.py
+
+

+ 6 - 1
YQ_OCR/config.py → YQ_OCR/configs/config.py

@@ -1,3 +1,8 @@
+url = 'http://192.168.199.107:18087'
+url_path = '/ocr_system/identify'
+# imgs_path = '/Users/sxkj/utils/YQ_OCR/img'
+imgs_path = './img'
+
 keyDict = {
     "productCategory": '产品种类',
     "ingredients": '配料',
@@ -8,4 +13,4 @@ keyDict = {
     "conSerHotline": '消费者服务热线',
     "tips": '温馨提示|友情提示',
     "welcome": '欢迎访问'
-}
+}

+ 43 - 0
YQ_OCR/main.py

@@ -0,0 +1,43 @@
+import numpy as np
+from itertools import chain
+from mdutils.mdutils import MdUtils
+from YQ_OCR.utils.datasets import Dataset
+from YQ_OCR.utils.text2md import TableMD
+from YQ_OCR.utils.utils import *
+
+# 1. xlsx -> 正确json文件(写入厂家信息)
+# 2. 发送图片(带正确json文件)
+# 3. 把返回的json 和正确的json 进行对比(有key--用返回结果与正确结果比对,无key--用正确结果与返回结果比对)
+
+
+if __name__ == '__main__':
+    # Evaluation pipeline, executed once per image found under imgs_path:
+    #   1. load the ground-truth JSON stored next to the image,
+    #   2. send the image together with that JSON to the OCR service,
+    #   3. compare the response with the ground truth and write the markdown report.
+    extensions = ('jpg', 'png', 'jpeg')
+    img_paths = chain.from_iterable(Path(imgs_path).rglob(f'*.{ext}') for ext in extensions)
+    all_rate = []
+    table_mean_acc = []
+    for img_path in img_paths:
+        print(img_path)
+
+        true_d, true_json = open_true_json(img_path.with_suffix('.json'))
+        result = send_request(img_path, true_json)
+        res_d = parse_result(result)
+        if not isinstance(res_d, dict):
+            # parse_result yields "101" or None when the service did not
+            # succeed; evaluating such a value would crash, so skip the image.
+            print(f'识别失败,跳过:{img_path}')
+            continue
+
+        markdown = TableMD(img_path.name)
+        markdown.write_header(title='推理结果', level=2)
+        # Text-field comparison (JSON result vs. ground truth)
+        rate, statistics = markdown.evaluate_one(true_d, res_d)
+        all_rate.append(rate)
+        print(f'文字识别正确率:{rate:.2f}%')
+
+        # Table-cell comparison against the .txt ground truth next to the image
+        dataset = Dataset(gt_file=img_path.with_suffix('.txt'), img_name=img_path.name, results=res_d)
+        markdown.write_table_accuracy(ds=dataset, key='new')
+        table_acc = markdown.get_table_accuracy()
+        table_mean_acc.append(table_acc)
+        print(f'表格识别正确率:{table_acc:.2f}%')
+        markdown.f.create_md_file()
+
+    print('----------------------------------------')
+    if not all_rate:
+        # Averaging an empty list would only print "nan%" plus a RuntimeWarning.
+        print('没有可统计的结果')
+    else:
+        all_rate = "{:.2f}%".format(np.mean(all_rate))
+        all_table_rate = "{:.2f}%".format(np.mean(table_mean_acc))
+        print(f'文字识别总体正确率:{all_rate}')
+        print(f'表格识别总体正确率:{all_table_rate}')

+ 32 - 2
YQ_OCR/output/03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版-表格识别结果.md

@@ -2,11 +2,41 @@
 
 
 
-# 表格识别结果测试报告
+# 测试结果报告
 
 ## 推理结果
 
-### 03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版.jpg,共检测27处,正确24,错误3,表格正确率:88.89%
+## 文字识别正确率:80.95238095238095
+
+### 共21个字段,正确17个,错误4个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:调制豆乳|产品种类:调制豆乳|✅|
+|ingredients|配料:饮用水、大豆(非转基因)、白砂糖|配料:饮用水、大豆(非转基因)白砂糖大豆添加量:44g/瓶营美成分表|❌|
+|proStanCode|产品标准代号:GB/T30885|产品标准代号:GB/T30885|✅|
+|productionDate|生产日期:见瓶盖|生产日期:见瓶盖|✅|
+|shelfLife|保质期:常温密闭条件下9个月|保质期:常温密闭条件下9个月|✅|
+|storageConditions|贮存条件:请保存于阴凉干燥处,避免阳光直晒、高温|贮存条件:请保存十阴凉干燥处避免阳光直晒、高温。|❌|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|tips|温馨提示:请勿带包装置于微波炉中加热。|温馨提示:请勿带包装置于微波炉中加热。|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|植选|植选|✅|
+|无key值|浓香豆乳畅饮系列|浓香豆乳畅饮系列|✅|
+|无key值|大豆添加量:44g/瓶|大豆添加量:44g/瓶|✅|
+|无key值|原味|原味|✅|
+|无key值|全程非转基因可追溯大豆|全程非转基因可追溯大豆|✅|
+|无key值|3.0g/100mL|3.0g/100mL|✅|
+|无key值|优质植物蛋白|优质植物蛋白|✅|
+|无key值|保持环境清洁请勿乱抛空瓶|保持环境清洁请勿乱抛空瓶|✅|
+|无key值|为保证产品风味,开启后需冷藏并尽快饮用完毕。|为保证产品风味,开启后需冷藏并尽快饮用完毕。|✅|
+|无key值|可能会有少量蛋白沉淀和脂肪上浮,属正常现象,请放心饮用。如发现涨瓶,请勿开启。|可能会有少量蛋日沉淀和脂肪上浮属正常现象,请放心饮用。如发现胀瓶,请勿开启。|❌|
+|无key值|净含量:315mL|净含量:315mL|❌|
+|无key值|6907992515007|6907992515007|✅|
+
+## 表格识别正确率:88.89%
+
+### 共检测27处,正确24,错误3
 
 |位置|标注结果|新模型推理|是否一致|
 | :---: | :---: | :---: | :---: |

+ 29 - 2
YQ_OCR/output/巧克力味牛奶饮品-表格识别结果.md

@@ -2,11 +2,38 @@
 
 
 
-# 表格识别结果测试报告
+# 测试结果报告
 
 ## 推理结果
 
-### 巧克力味牛奶饮品.jpg,共检测18处,正确18,错误0,表格正确率:100.00%
+## 文字识别正确率:72.22222222222221
+
+### 共18个字段,正确13个,错误5个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:配制型含乳饮料|产品种类:配制型含乳饮料|✅|
+|proStanCode|产品标准代号:GB/T21732|产品标准代号:GB/T21732|✅|
+|productionDate|生产日期:见盒顶部|生产日期:见盒顶部|✅|
+|shelfLife|保质期:常温密闭条件下6个月|保质期:常温密闭条件下6个月|✅|
+|storageConditions|贮存条件:未开启前,无需冷藏,开启之后,立即饮用。|贮存条件:未开启前无需冷藏开启之后立即饮用|❌|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|tips|友情提示:喝前摇一摇|友情提示:喝前摇一摇|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|牛奶饮品|牛奶饮品|✅|
+|无key值|产品名称:巧克力味牛奶饮品|产品名称:巧克力味牛奶饮品|✅|
+|无key值|生产日期:见箱体|生产日期:见盒顶部|❌|
+|无key值|切勿带包装置于微波炉中加热|勿带包装置子微波炉中加热|❌|
+|无key值|清真|清真|✅|
+|无key值|保持环境清洁请勿乱抛空包|保持环境清洁请勿乱抛空包|✅|
+|无key值|伊利|伊利|✅|
+|无key值|(具体生产商/产地见生产日期末端代码)|(具体生产商/产地见生产日期末端代码)|❌|
+|无key值|净含量:250mL|净含量:250mL|❌|
+|无key值|6907992500102|6907992500102|✅|
+
+## 表格识别正确率:100.00%
+
+### 共检测18处,正确18,错误0
 
 |位置|标注结果|新模型推理|是否一致|
 | :---: | :---: | :---: | :---: |

+ 29 - 2
YQ_OCR/output/餐饮纯牛奶 内包-表格识别结果.md

@@ -2,11 +2,38 @@
 
 
 
-# 表格识别结果测试报告
+# 测试结果报告
 
 ## 推理结果
 
-### 餐饮纯牛奶 内包.jpg,共检测21处,正确21,错误0,表格正确率:100.00%
+## 文字识别正确率:88.88888888888889
+
+### 共18个字段,正确16个,错误2个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:全脂灭菌纯牛乳|产品种类:全脂灭菌纯牛乳|✅|
+|ingredients|配料:生牛乳|配料:生牛乳|✅|
+|proStanCode|产品标准代号:GB25190|产品标准代号:GB25190|✅|
+|productionDate|生产日期:见盒顶部|生产日期:见盒顶部|✅|
+|shelfLife|保质期:常温密闭条件下6个月|保质期:常温密闭条件下6个月|✅|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|纯牛奶|纯牛奶|✅|
+|无key值|餐饮之选|餐饮之选|✅|
+|无key值|非脂乳固体≥8.5%|非脂乳固体≥8.5%|✅|
+|无key值|保持环境清洁请勿乱抛空包|保持环境清洁请勿乱抛空包|✅|
+|无key值|切勿带包装置于微波炉中加热。|切勿带包装置于微波炉中加热|❌|
+|无key值|净含量:1L|净含量:1L|❌|
+|无key值|6907992513621|6907992513621|✅|
+|无key值|内蒙古伊利实业集团股份有限公司出品 地址:内蒙古自治区呼和浩特市金山开发区金山大街1号|内蒙古伊利实业集团股份有限公司出品地址:内蒙古自治区呼和浩特市金山开发区金山大街1号|✅|
+|无key值|宁夏伊利乳业有限责任公司(A12) 产地及地址:宁夏吴忠市利通区金积工业园区 食品生产许可证编号:SC10564030200130|宁夏伊利乳业有限责任公司(A12)产地及地址:宁夏吴忠市利通区金积工业园区食品生产许可证编号:SC10564030200130|✅|
+|无key值|阜新伊利乳品有限责任公司(B6) 产地及地址:辽宁省阜新市阜蒙县园区路2号 食品生产许可证编号:SC10521090000011|阜新伊利乳品有限责任公司(B6)产地及地址:辽宁省阜新市阜蒙县园区路2号食品生产许可证编号:SC10521090000011|✅|
+|无key值|定州伊利乳业有限责任公司(C1) 产地及地址:河北省定州市伊利工业园区 食品生产许可证编号:SC10613068200020|定州伊利乳业有限责任公司(C1)产地及地址:河北省定州市伊利工业园区食品生产许可证编号:SC10613068200020|✅|
+
+## 表格识别正确率:100.00%
+
+### 共检测21处,正确21,错误0
 
 |位置|标注结果|新模型推理|是否一致|
 | :---: | :---: | :---: | :---: |

+ 0 - 177
YQ_OCR/to_md/convert_MD.py

@@ -1,177 +0,0 @@
-import copy
-import re
-from itertools import chain
-from pathlib import Path
-import numpy as np
-import pandas as pd
-import json
-from mdutils.mdutils import MdUtils
-import requests
-import html2text
-from YQ_OCR.config import keyDict
-from YQ_OCR.to_md.datasets import Dataset
-from YQ_OCR.to_md.text2md import TableMD
-
-url = 'http://192.168.199.107:18087'
-url_path = '/ocr_system/identify'
-# imgs_path = '/Users/sxkj/to_md/YQ_OCR/img'
-imgs_path = '../img'
-
-
-# 1. xlsx -> 正确json文件(写入厂家信息)
-# 2. 发送图片(带正确json文件)
-# 3. 把返回的json 和正确的json 进行对比(有key--用返回结果与正确结果比对,无key--用正确结果与返回结果比对)
-
-# 编辑距离
-def Levenshtein_Distance(str1, str2):
-    matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
-    for i in range(1, len(str1) + 1):
-        for j in range(1, len(str2) + 1):
-            d = 0 if (str1[i - 1] == str2[j - 1]) else 1
-            matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d)
-    return matrix[len(str1)][len(str2)]
-
-
-# 发送请求 带正确答案参数
-def send_request(img_path: Path, img_json: str):
-    file = {'file': (img_path.name, open(img_path, 'rb'), img_path)}
-    payload = {'docDataStr': img_json}
-    r = requests.post(url + url_path, files=file, data=payload)
-    return r.json()
-
-
-# 处理返回结果
-def _parse_result(r):  # sourcery skip: dict-comprehension
-    if r['status'] == '000':
-        result = r['result']
-        res = {}
-        for field in keyDict:
-            if field in result:
-                res[field] = result[field]
-        res['noKeyList'] = result['noKeyList']
-        res['logoList'] = result['logoList']
-        res['tableList'] = result['tableList']
-        logoFileName = [log['logoFileName'] for log in res['logoList']]
-        res['logoList'] = logoFileName
-        return res
-    elif r['status'] == '101':
-        return "101"
-
-
-# 比较两个json文件 并在md文件中写入对比结果
-def evaluate_one(xlsx_dict, res_dict):
-    true_num = 0
-    xlsx_dict_no_space: dict = copy.deepcopy(xlsx_dict)
-    for index, text in xlsx_dict_no_space.items():
-        if type(xlsx_dict_no_space[index]) is str:
-            xlsx_dict_no_space[index] = text.replace(' ', '')
-        elif type(xlsx_dict_no_space[index]) is list:
-            for k, v in enumerate(xlsx_dict_no_space[index]):
-                xlsx_dict_no_space[index][k] = v.replace(' ', '')
-    # 有key值的比较
-    for key_yes in res_dict:
-        if type(res_dict[key_yes]) is str:
-            if Levenshtein_Distance(res_dict[key_yes], xlsx_dict_no_space[key_yes]) == 0:
-                table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
-                true_num += 1
-            else:
-                table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
-    # 无key值的比较
-    key_no_dict = {}
-    for key_no_xlsx_no_space, key_no_xlsx in zip(xlsx_dict_no_space['noKeyList'], xlsx_dict['noKeyList']):
-        key_no_dict[key_no_xlsx_no_space] = []
-        for key_no_res in res_dict['noKeyList']:
-            key_no_dict[key_no_xlsx_no_space].append(
-                (Levenshtein_Distance(key_no_xlsx_no_space, key_no_res), key_no_res))
-        sort_NoKey = sorted(key_no_dict[key_no_xlsx_no_space], key=lambda x: x[0])
-        NoKey_min_distance = sort_NoKey[0][0]
-        if NoKey_min_distance == 0:
-            table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
-            true_num += 1
-        else:
-            table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
-
-    # 算正确率
-    all_num = len(table_result) // 4 - 1
-    rate = true_num / all_num
-    all_rate.append(rate)
-    statistics = f'共{all_num}个字段,正确{true_num}个,错误{all_num - true_num}个'
-    return "{:.2f}%".format(rate * 100), statistics
-
-
-# def evaluate_one(xlsx_dict, res_dict):
-#     true_num = 0
-#     # 有key值的比较
-#     for key_yes in res_dict:
-#         if type(res_dict[key_yes]) is str:
-#             if Levenshtein_Distance(res_dict[key_yes], xlsx_dict[key_yes]) == 0:
-#                 table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
-#                 true_num += 1
-#             else:
-#                 table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
-#     # 无key值的比较
-#     key_no_dict = {}
-#     for key_no_xlsx in xlsx_dict['noKeyList']:
-#         key_no_dict[key_no_xlsx] = []
-#         for key_no_res in res_dict['noKeyList']:
-#             key_no_dict[key_no_xlsx].append((Levenshtein_Distance(key_no_xlsx, key_no_res), key_no_res))
-#         sort_NoKey = sorted(key_no_dict[key_no_xlsx], key=lambda x: x[0])
-#         NoKey_min_distance = sort_NoKey[0][0]
-#         if NoKey_min_distance == 0:
-#             table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
-#             true_num += 1
-#         else:
-#             table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
-#     # 算正确率
-#     rate = true_num / (len(table_result) / 4)
-#     all_rate.append(rate)
-#     statistics = f'共{len(table_result) // 4}个字段,正确{true_num}个,错误{len(table_result) // 4 - true_num}个'
-#     return "{:.2f}%".format(rate * 100), statistics
-
-
-# 打开正确的json文件
-def open_true_json(j_path):
-    with j_path.open('r', encoding='utf-8') as f:
-        j_dict = json.load(f)
-        j_json_str = json.dumps(j_dict, ensure_ascii=False)
-        return j_dict, j_json_str
-
-
-if __name__ == '__main__':
-    img_paths = chain(*[Path(imgs_path).rglob(f'*.{ext}') for ext in ['jpg', 'png', 'jpeg']])
-    all_rate = []
-    table_mean_acc = []
-    for img_path in img_paths:
-        print(img_path)
-        # json result
-        true_d, true_json = open_true_json(img_path.with_suffix('.json'))
-        result = send_request(img_path, true_json)
-        res_d = _parse_result(result)
-
-        # md
-        md_file_path = img_path.parent / (img_path.with_suffix('.md'))
-        MD = MdUtils(file_name=str(md_file_path))
-        table_result = ['key值', '正确答案', 'ocr返回结果', '是否正确']
-        rate, statistics = evaluate_one(true_d, res_d)
-        MD.new_header(level=1, title='测试结果')
-        MD.new_header(level=2, title=f'正确率:{rate}')
-        MD.new_header(level=3, title=statistics)
-        print(f'文字识别正确率:{rate}')
-        MD.new_table(columns=4, rows=len(table_result) // 4, text=table_result, text_align='center')
-        MD.create_md_file()
-
-        # table gt result
-        markdown = TableMD(img_path.name)
-        dataset = Dataset(gt_file=img_path.with_suffix('.txt'), img_name=img_path.name, results=res_d)
-        markdown.write_header(title='推理结果', level=2)
-        markdown.write_table_accuracy(ds=dataset, key='new')
-        table_acc = markdown.get_table_accuracy()
-        table_mean_acc.append(table_acc)
-        print(f'表格识别正确率:{table_acc:.2f}%')
-        markdown.f.create_md_file()
-
-    print('----------------------------------------')
-    all_rate = "{:.2f}%".format(np.mean(all_rate) * 100)
-    all_table_rate = "{:.2f}%".format(np.mean(table_mean_acc))
-    print(f'文字识别总体正确率:{all_rate}')
-    print(f'表格识别总体正确率:{all_table_rate}')

+ 0 - 112
YQ_OCR/to_md/text2md.py

@@ -1,112 +0,0 @@
-from typing import List
-from mdutils.mdutils import MdUtils
-from YQ_OCR.to_md.datasets import Dataset
-
-
-class TableMD(object):
-    def __init__(self, img_name):
-        self.img_name = img_name
-        self.acc = 0
-        self.f = MdUtils(file_name='../output/' + self.img_name.split('.')[0] + '-表格识别结果')
-
-        self.table_structure: List = ['原模型表格正确率', '新模型表格准确率']
-        self.new_table_text: List = ['位置', '标注结果', '新模型推理', '是否一致']
-        self.old_table_text: List = ['位置', '标注结果', '原模型推理', '是否一致']
-        self.write_header(f'表格识别结果测试报告')
-
-    def write_header(self, title, level=1):
-        self.f.new_header(level=level, title=title)
-
-    def write_table_accuracy(self, ds: Dataset, key, columns=4, text_align='center'):
-        def get_format_table_accuracy(str1, str2):
-            n1 = len(str1)
-            n2 = len(str2)
-            if n1 == 0 or n2 == 0:
-                return ''
-            dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
-            Max = 0
-            pos = 0
-            for i in range(1, n1 + 1):
-                for j in range(1, n2 + 1):
-                    if str1[i - 1] == str2[j - 1]:
-                        dp[i][j] = dp[i - 1][j - 1] + 1
-                    else:
-                        dp[i][j] = 0
-                    if dp[i][j] > Max:
-                        Max = dp[i][j]
-                        pos = i - 1
-            return str1[pos - Max + 1:pos + 1]
-
-        pre_list = ds.get_pre_list()
-        gt_list = ds.get_gt_list()
-        # print(pre_list)
-        # print(gt_list)
-        correct = 0
-        count = 0
-        n = len(pre_list)
-        m = len(gt_list)
-        if n < m:
-            pre_list.extend(['' for _ in range(m - n)])
-        else:
-            gt_list.extend(['' for _ in range(n - m)])
-
-        for x in range(len(gt_list)):
-            gt_parse_list = gt_list[x].split('*')
-            gt_parse_list.pop()
-            pre_parse_list = pre_list[x].split('*')
-            pre_parse_list.pop()
-            # print(gt_parse_list)
-            # print(pre_parse_list)
-            n1 = len(pre_parse_list)
-            m1 = len(gt_parse_list)
-            # print(n1, m1)
-            if n1 < m1:
-                pre_parse_list.extend(['' for _ in range(m1 - n1)])
-            else:
-                gt_parse_list.extend(['' for _ in range(n1 - m1)])
-
-            for j in range(len(gt_parse_list)):
-                count += 1
-                # infer = get_format_table_accuracy(gt_list[x], pre_list[x])
-                if gt_parse_list[j] == pre_parse_list[j] or \
-                        gt_parse_list[j].replace(' ', '') == pre_parse_list[j].replace(' ', ''):
-                    correct += 1
-                if key == 'new':
-                    self.new_table_text.extend(
-                        [f'{x + 1}行',
-                         gt_parse_list[j],
-                         pre_parse_list[j],
-                         '✅' if gt_parse_list[j] == pre_parse_list[j] else '❌'])
-                elif key == 'old':
-                    self.old_table_text.extend(
-                        [f'{x + 1}行',
-                         gt_parse_list[j],
-                         pre_parse_list[j],
-                         '✅' if gt_parse_list[j] == pre_parse_list[j] else '❌'])
-
-        acc = correct / count * 100
-        self.acc = acc
-        if key == 'new':
-            rows = len(self.new_table_text) // columns
-            self.write_header(level=3,
-                              title=f'{self.img_name},'
-                                    f'共检测{count}处,'
-                                    f'正确{correct},'
-                                    f'错误{count - correct},'
-                                    f'表格正确率:{acc:.2f}%')
-            self.f.new_table(columns=columns, rows=rows, text=self.new_table_text, text_align=text_align)
-        elif key == 'old':
-            rows = len(self.old_table_text) // columns
-            self.f.new_header(level=3,
-                              title=f'{self.img_name},'
-                                    f'共检测{count}处,'
-                                    f'正确{correct},'
-                                    f'错误{count - correct},'
-                                    f'表格正确率:{acc:.2f}%')
-            self.f.new_table(columns=columns, rows=rows, text=self.old_table_text, text_align=text_align)
-
-    def get_table_accuracy(self):
-        if self.acc < 0.6:
-            with open('../output/worst.txt', 'a') as f:
-                f.write(self.img_name + '\n')
-        return self.acc

+ 0 - 0
YQ_OCR/to_md/datasets.py → YQ_OCR/utils/datasets.py


+ 183 - 0
YQ_OCR/utils/text2md.py

@@ -0,0 +1,183 @@
+import copy
+from typing import List
+from mdutils.mdutils import MdUtils
+from YQ_OCR.utils.datasets import Dataset
+from YQ_OCR.utils.utils import Levenshtein_Distance
+
+
+class TableMD(object):
+    """Build the markdown test report for one image.
+
+    An instance wraps one ``MdUtils`` document located under ``OUTPUT_DIR``.
+    ``evaluate_one`` appends the text-field comparison table and
+    ``write_table_accuracy`` appends the table-cell comparison table.
+    The document is exposed as ``self.f`` so the caller can finalise it.
+    """
+
+    # Directory (relative to the working directory) used for every artifact
+    # this class produces, so reports and worst.txt end up side by side.
+    OUTPUT_DIR = './output/'
+    # Table accuracy, in percent, below which an image is logged to worst.txt.
+    # self.acc is stored on a 0-100 scale, so the threshold must be as well.
+    WORST_ACC_THRESHOLD = 60.0
+
+    def __init__(self, img_name):
+        """Create the report document for ``img_name`` and write its title.
+
+        Args:
+            img_name: image file name; the part before the first ``'.'`` is
+                used as the report file name prefix.
+        """
+        self.img_name = img_name
+        self.acc = 0
+        self.f = MdUtils(file_name=self.OUTPUT_DIR + self.img_name.split('.')[0] + '-表格识别结果')
+
+        # Each list starts with its header row; data rows are appended flat,
+        # four cells per row, in the layout MdUtils.new_table expects.
+        self.table_structure: List = ['原模型表格正确率', '新模型表格准确率']
+        self.table_result: List = ['key值', '正确答案', 'ocr返回结果', '是否正确']
+        self.new_table_text: List = ['位置', '标注结果', '新模型推理', '是否一致']
+        self.old_table_text: List = ['位置', '标注结果', '原模型推理', '是否一致']
+        self.write_header('测试结果报告')
+
+    def write_header(self, title, level=1):
+        """Append a markdown header of the given level to the report."""
+        self.f.new_header(level=level, title=title)
+
+    def write_table_accuracy(self, ds: Dataset, key, columns=4, text_align='center'):
+        """Compare predicted table cells with the ground truth and report them.
+
+        Both lists from ``ds`` are aligned row by row, each row is split on
+        ``'*'`` into cells, and cells are compared ignoring spaces. The
+        accuracy (percent) is stored in ``self.acc``.
+
+        Args:
+            ds: object providing ``get_pre_list()`` (predictions) and
+                ``get_gt_list()`` (ground truth), each a list of strings.
+            key: ``'new'`` or ``'old'``; selects which result table receives
+                the rows and is written to the report. Any other value only
+                updates ``self.acc``.
+            columns: number of columns passed to ``new_table``.
+            text_align: alignment passed to ``new_table``.
+
+        Raises:
+            ZeroDivisionError: if there is not a single cell to compare.
+        """
+        # Copy first so the padding below never mutates lists owned by ds.
+        pre_list = list(ds.get_pre_list())
+        gt_list = list(ds.get_gt_list())
+        # Pad the shorter side with empty rows so both lists align by index.
+        row_total = max(len(pre_list), len(gt_list))
+        pre_list.extend([''] * (row_total - len(pre_list)))
+        gt_list.extend([''] * (row_total - len(gt_list)))
+
+        if key == 'new':
+            table_text = self.new_table_text
+        elif key == 'old':
+            table_text = self.old_table_text
+        else:
+            table_text = None
+
+        correct = 0
+        count = 0
+        for row_no, (gt_row, pre_row) in enumerate(zip(gt_list, pre_list), start=1):
+            # The piece after the last '*' is discarded (presumably rows are
+            # '*'-terminated -- confirm against Dataset).
+            gt_cells = gt_row.split('*')[:-1]
+            pre_cells = pre_row.split('*')[:-1]
+            # Pad the shorter row with empty cells so cells align by index.
+            cell_total = max(len(gt_cells), len(pre_cells))
+            gt_cells.extend([''] * (cell_total - len(gt_cells)))
+            pre_cells.extend([''] * (cell_total - len(pre_cells)))
+
+            for gt_cell, pre_cell in zip(gt_cells, pre_cells):
+                count += 1
+                # One whitespace-insensitive verdict drives both the counter
+                # and the mark shown in the report, so they cannot disagree.
+                is_match = gt_cell.replace(' ', '') == pre_cell.replace(' ', '')
+                if is_match:
+                    correct += 1
+                if table_text is not None:
+                    table_text.extend([f'{row_no}行', gt_cell, pre_cell, '✅' if is_match else '❌'])
+
+        acc = correct / count * 100
+        self.acc = acc
+        if table_text is not None:
+            self.write_header(level=2, title=f'表格识别正确率:{acc:.2f}%')
+            self.write_header(level=3, title=f'共检测{count}处,'
+                                             f'正确{correct},'
+                                             f'错误{count - correct}')
+            self.f.new_table(columns=columns, rows=len(table_text) // columns,
+                             text=table_text, text_align=text_align)
+
+    def get_table_accuracy(self):
+        """Return the last table accuracy in percent.
+
+        Images scoring below ``WORST_ACC_THRESHOLD`` are appended to
+        ``worst.txt`` inside ``OUTPUT_DIR``.
+        """
+        if self.acc < self.WORST_ACC_THRESHOLD:
+            # Explicit UTF-8: image names contain non-ASCII characters.
+            with open(self.OUTPUT_DIR + 'worst.txt', 'a', encoding='utf-8') as f:
+                f.write(self.img_name + '\n')
+        return self.acc
+
+    def evaluate_one(self, xlsx_dict, res_dict):
+        """Compare OCR text fields with the ground truth and report them.
+
+        Keyed fields: every string value in ``res_dict`` is compared with the
+        space-stripped ground-truth value stored under the same key.
+        Key-less fields: each ground-truth ``noKeyList`` entry is paired with
+        the closest entry of ``res_dict['noKeyList']`` by edit distance and
+        counts as correct only when that distance is 0.
+
+        Args:
+            xlsx_dict: ground-truth dict; must contain ``'noKeyList'``.
+            res_dict: parsed OCR result dict; must contain ``'noKeyList'``.
+
+        Returns:
+            ``(rate, statistics)``: the accuracy in percent as a float and a
+            summary string.
+
+        Raises:
+            KeyError: if ``res_dict`` has a string field that is missing
+                from ``xlsx_dict``.
+            ZeroDivisionError: if no field was compared at all.
+        """
+        # Space-stripped view of the ground truth; the untouched originals
+        # are still what gets displayed in the report.
+        truth_no_space = {}
+        for name, value in xlsx_dict.items():
+            if isinstance(value, str):
+                truth_no_space[name] = value.replace(' ', '')
+            elif isinstance(value, list):
+                truth_no_space[name] = [item.replace(' ', '') for item in value]
+            else:
+                truth_no_space[name] = value
+
+        true_num = 0
+        # Keyed fields
+        for field, ocr_value in res_dict.items():
+            if not isinstance(ocr_value, str):
+                continue
+            is_match = Levenshtein_Distance(ocr_value, truth_no_space[field]) == 0
+            self.table_result.extend([field, xlsx_dict[field], ocr_value, '✅' if is_match else '❌'])
+            if is_match:
+                true_num += 1
+
+        # Key-less fields
+        candidates = res_dict['noKeyList']
+        for truth_ns, truth_raw in zip(truth_no_space['noKeyList'], xlsx_dict['noKeyList']):
+            # min() keeps the first candidate among ties, like the stable sort
+            # it replaces; an empty OCR list is treated as the answer ''.
+            distance, closest = min(
+                ((Levenshtein_Distance(truth_ns, candidate), candidate) for candidate in candidates),
+                key=lambda pair: pair[0],
+                default=(len(truth_ns), ''),
+            )
+            is_match = distance == 0
+            self.table_result.extend(['无key值', truth_raw, closest, '✅' if is_match else '❌'])
+            if is_match:
+                true_num += 1
+
+        # Accuracy; the first four entries of table_result are the header row.
+        all_num = len(self.table_result) // 4 - 1
+        rate = true_num / all_num * 100
+        statistics = f'共{all_num}个字段,正确{true_num}个,错误{all_num - true_num}个'
+        # Same "NN.NN%" formatting as the table-accuracy header.
+        self.write_header(level=2, title=f'文字识别正确率:{rate:.2f}%')
+        self.write_header(level=3, title=statistics)
+        self.f.new_table(columns=4, rows=len(self.table_result) // 4, text=self.table_result, text_align='center')
+        return rate, statistics

+ 48 - 0
YQ_OCR/utils/utils.py

@@ -0,0 +1,48 @@
+import json
+import requests
+from pathlib import Path
+from YQ_OCR.configs.config import *
+
+
+# 发送请求 带正确答案参数
+def send_request(img_path: Path, img_json: str):
+    """POST one image together with its ground-truth JSON to the OCR service.
+
+    Args:
+        img_path: path of the image to upload.
+        img_json: ground-truth JSON string, sent as form field ``docDataStr``.
+
+    Returns:
+        The decoded JSON body of the response.
+    """
+    payload = {'docDataStr': img_json}
+    # The context manager closes the image handle even if the request raises.
+    with open(img_path, 'rb') as img_file:
+        # NOTE(review): the third tuple slot is the part's content type in
+        # requests; the path is passed there unchanged -- confirm the server
+        # does not rely on it.
+        file = {'file': (img_path.name, img_file, img_path)}
+        r = requests.post(url + url_path, files=file, data=payload)
+    return r.json()
+
+
+# 编辑距离
+def Levenshtein_Distance(str1, str2):
+    """Return the edit distance between ``str1`` and ``str2``.
+
+    Insertions, deletions and substitutions each cost 1. Only the previous
+    row of the dynamic-programming table is kept.
+    """
+    # Distance from the empty prefix of str1 to every prefix of str2.
+    previous = list(range(len(str2) + 1))
+    for row, left in enumerate(str1, start=1):
+        current = [row]
+        for col, right in enumerate(str2, start=1):
+            substitute = previous[col - 1] + (0 if left == right else 1)
+            delete = previous[col] + 1
+            insert = current[col - 1] + 1
+            current.append(min(delete, insert, substitute))
+        previous = current
+    return previous[len(str2)]
+
+
+# 处理返回结果
+def parse_result(r):
+    """Reduce a raw service response to the fields used for evaluation.
+
+    Returns:
+        * status ``'000'``: a dict holding every ``keyDict`` field present in
+          the result, plus ``noKeyList``, ``logoList`` (reduced to the
+          ``logoFileName`` of each logo) and ``tableList``;
+        * status ``'101'``: the string ``"101"``;
+        * any other status: ``None``.
+    """
+    if r['status'] != '000':
+        return "101" if r['status'] == '101' else None
+
+    result = r['result']
+    res = {field: result[field] for field in keyDict if field in result}
+    # Read the three mandatory lists in a fixed order before transforming any.
+    no_key, logos, tables = result['noKeyList'], result['logoList'], result['tableList']
+    res['noKeyList'] = no_key
+    res['logoList'] = [logo['logoFileName'] for logo in logos]
+    res['tableList'] = tables
+    return res
+
+
+# 打开正确的json文件
+def open_true_json(j_path):
+    """Load the ground-truth JSON file at ``j_path``.
+
+    Returns:
+        ``(data, text)``: the parsed object and the same object re-serialised
+        as a JSON string with non-ASCII characters kept as-is.
+    """
+    with j_path.open('r', encoding='utf-8') as handle:
+        truth = json.load(handle)
+    return truth, json.dumps(truth, ensure_ascii=False)

+ 2 - 3
YQ_OCR/to_md/xlsx_convert_json.py → YQ_OCR/utils/xlsx_convert_json.py

@@ -2,14 +2,13 @@ import json
 import re
 from itertools import chain
 from pathlib import Path
-
 import pandas as pd
-from YQ_OCR.config import keyDict
+from YQ_OCR.configs.config import keyDict
 
 # 把xlsx转成json
 
 
-excels_path = '/Users/sxkj/to_md/YQ_OCR/img'
+excels_path = '/Users/sxkj/utils/YQ_OCR/img'
 
 
 # 返回文档里所有所需识别字符串