Bläddra i källkod

fix send_request token错误

zeke-chin 2 år sedan
förälder
incheckning
affce2daf6

+ 1 - 1
HR_OCR/test_script/to_md/README.md

@@ -39,7 +39,7 @@
   filed = 'regbook'
   
   # 若md_path为None 则默认使用图片父路径为markdown保存路径
-  # md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+  # md_path = '/Users/zeke/work/sx/OCR/HROCR/utils/example' or image_path.parent
   md_path = None or image_path.parent
   ```
 

+ 82 - 4
HR_OCR/test_script/to_md/ocr_config.py

@@ -19,11 +19,80 @@ class Configs:
     request: RequestConfig
     type: Type
 
+# idcard
+idcard_local_config = RequestConfig(url='http://192.168.199.27:18050/ocr_system/cet', token='')
+idcard_TXtest_config = RequestConfig(
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/sfzsbtest/idcard',
+    token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
+idcard_TXsb_config = RequestConfig(
+    url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/sfzsb/idcard',
+    token='dcae8cc6-0e49-4db8-a2d2-94ef84da3636')
+idcard_DXtest_config = RequestConfig(
+    url='http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/sfzsbtest/idcard',
+    token='4e00c444-620b-4d3c-85f4-777e64276f0e')
+idcard_DXsb_config = RequestConfig(
+    url='http://aihub-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/sfzsb/idcard',
+    token='e045de0a-e97f-4f23-b4d5-6a032c39a81e')
+
+IDCARD_CONFIGS = {
+    'local': idcard_local_config,
+    'TXtest': idcard_TXtest_config,
+    'TXsb': idcard_TXsb_config,
+    'DXtest': idcard_DXtest_config,
+    'DXsb': idcard_DXsb_config
+}
+
+# bankcard
+bankcard_local_config = RequestConfig(url='http://192.168.199.27:18050/ocr_system/cet', token='')
+bankcard_TXtest_config = RequestConfig(
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/yhksbtest/bankcard',
+    token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
+bankcard_TXsb_config = RequestConfig(
+    url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/yhksb/bankcard',
+    token='dcae8cc6-0e49-4db8-a2d2-94ef84da3636')
+bankcard_DXtest_config = RequestConfig(
+    url='http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/yhksbtest/bankcard',
+    token='4e00c444-620b-4d3c-85f4-777e64276f0e')
+bankcard_DXsb_config = RequestConfig(
+    url='http://aihub-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/yhksb/bankcard',
+    token='e045de0a-e97f-4f23-b4d5-6a032c39a81e')
+
+BANKCARD_CONFIGS = {
+    'local': bankcard_local_config,
+    'TXtest': bankcard_TXtest_config,
+    'TXsb': bankcard_TXsb_config,
+    'DXtest': bankcard_DXtest_config,
+    'DXsb': bankcard_DXsb_config
+}
+
+# school
+school_local_config = RequestConfig(url='http://192.168.199.27:18050/ocr_system/cet', token='')
+school_TXtest_config = RequestConfig(
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/xxwtest/schoolcert',
+    token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
+school_TXsb_config = RequestConfig(
+    url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/xxw/schoolcert',
+    token='dcae8cc6-0e49-4db8-a2d2-94ef84da3636')
+school_DXtest_config = RequestConfig(
+    url='http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/xxwtest/schoolcert',
+    token='4e00c444-620b-4d3c-85f4-777e64276f0e')
+school_DXsb_config = RequestConfig(
+    url='http://aihub-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/xxw/schoolcert',
+    token='e045de0a-e97f-4f23-b4d5-6a032c39a81e')
+
+SCHOOL_CONFIGS = {
+    'local': school_local_config,
+    'TXtest': school_TXtest_config,
+    'TXsb': school_TXsb_config,
+    'DXtest': school_DXtest_config,
+    'DXsb': school_DXsb_config
+}
+
 
 # cet
 cet_local_config = RequestConfig(url='http://192.168.199.27:18050/ocr_system/cet', token='')
 cet_TXtest_config = RequestConfig(
-    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm//cettest/cet',
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/cettest/cet',
     token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
 cet_TXsb_config = RequestConfig(
     url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/cet/cet',
@@ -46,7 +115,7 @@ CET_CONFIGS = {
 # regbook
 regbook_local_config = RequestConfig(url='http://192.168.199.27:18040/ocr_system/regbook', token='')
 regbook_TXtest_config = RequestConfig(
-    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm//hkbsbtest/regbook',
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/hkbsbtest/regbook',
     token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
 regbook_TXsb_config = RequestConfig(
     url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/hkbsb/regbook',
@@ -92,10 +161,16 @@ BLFE_CONFIGS = {
 OCR_CONFIGS = {
     'cet': CET_CONFIGS,
     'regbook': REGBOOK_CONFIGS,
-    'business_license': BLFE_CONFIGS
+    'business_license': BLFE_CONFIGS,
+    'schoolcert': SCHOOL_CONFIGS,
+    'idcard': IDCARD_CONFIGS,
+    'bankcard': BANKCARD_CONFIGS
 }
 
 # 字段
+idcard_fieid = ['orientation', 'name', 'id', 'ethnicity', 'gender', 'birthday', 'address', 'address_province', 'address_city', 'address_region', 'address_detail', 'expire_date']
+bankcard_fieid = ['orientation', 'number']
+schoolcert_fieid = ['orientation', 'name', 'gender', 'admission_time', 'education_time', 'education_level', 'education_type', 'learning_type', 'school', 'major', 'number']
 cet_field = ['orientation', 'name', 'id', 'language', 'level', 'exam_time', 'score']
 regbook_field = ['orientation', 'name', 'id', 'gender', 'birthplace', 'birthplace_province', 'birthplace_city',
                  'birthplace_region', 'native_place', 'native_place_province', 'native_place_city',
@@ -105,5 +180,8 @@ business_license = ['orientation', 'social_code', 'company_name', 'legal_person'
 Filed = {
     'cet': cet_field,
     'regbook': regbook_field,
-    'business_license': business_license
+    'business_license': business_license,
+    'schoolcert': schoolcert_fieid,
+    'idcard': idcard_fieid,
+    'bankcard': bankcard_fieid
 }

+ 2 - 2
HR_OCR/test_script/to_md/use.py

@@ -3,7 +3,7 @@ Author: zeke-chin zeke-chin@icloud.com
 Date: 2022-09-28 20:28:41
 LastEditors: zeke-chin zeke-chin@icloud.com
 LastEditTime: 2022-09-30 15:08:48
-FilePath: /to_md/HR_OCR/to_md/use.py
+FilePath: /utils/HR_OCR/utils/use.py
 Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 '''
 
@@ -24,7 +24,7 @@ md_name = 'CET'
 filed = 'cet'
 
 # 若md_path为None 则默认使用图片父路径为markdown保存路径
-# md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+# md_path = '/Users/zeke/work/sx/OCR/HROCR/utils/example' or image_path.parent
 md_path = None or image_path.parent
 
 md_file = parser_path(Path(md_path) / Path(md_name + image_path.stem), image_rotate)

+ 2 - 2
HR_OCR/test_script/tools/README.md

@@ -6,7 +6,7 @@
 # 项目url
 url = 'http://192.168.199.27:18040'
 # 目标文件夹
-imgs_path = './HR_OCR/to_md/example/img'
+imgs_path = './HR_OCR/utils/example/img'
 
 def send_request(img_path, image_type = 0):
     with open(img_path, 'rb') as f:
@@ -29,6 +29,6 @@ def send_request(img_path, image_type = 0):
 
 ```python
 # 需要格式化的目的文件夹路径
-target_path = './HR_OCR/to_md/example/img'
+target_path = './HR_OCR/utils/example/img'
 ```
 

+ 1 - 1
HR_OCR/test_script/tools/convert_json.py

@@ -6,7 +6,7 @@ import base64
 from itertools import chain
 
 url = 'http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr'
-imgs_path = './HR_OCR/to_md/example/img'
+imgs_path = './HR_OCR/utils/example/img'
 
 def send_request(img_path, image_type = 0):
     with open(img_path, 'rb') as f:

+ 1 - 1
HR_OCR/test_script/tools/suffix.py

@@ -4,7 +4,7 @@ from itertools import chain
 import sys
 
 # conf
-target_path = './HR_OCR/to_md/example/img'
+target_path = './HR_OCR/utils/example/img'
 #suffix = sys.argv[2]
 suffix = 'jpg' if len(sys.argv) != 3 else sys.argv[2]
 

+ 0 - 1
HR_OCR/to_md/use.py

@@ -14,7 +14,6 @@ from new import MD, Image, Dataset, parser_path
 # config
 # 图片路径
 image_path = Path('/Users/zeke/Downloads/户口本测试样本1011-常住人口页')
-image_type = 0
 # 是否旋转
 image_rotate = False
 ocr_address = 'DXtest'  # 'local' 'TXtest' 'TXsb' 'DXtest' 'DXsb'

+ 3 - 3
HR_OCR/tools/convert_json.py

@@ -8,12 +8,12 @@ from itertools import chain
 url = 'http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr'
 imgs_path = '/Users/zeke/Downloads/户口本测试样本1011-常住人口页'
 
-def send_request(img_path, image_type = 0):
+
+def send_request(img_path):
     with open(img_path, 'rb') as f:
         img_str: str = base64.encodebytes(f.read()).decode('utf-8')
         data = {
-            'image': img_str,
-            'image_type': image_type
+            'image': img_str
         }
         idc_header = {
             'Content-Type': 'application/json',

+ 0 - 0
YQ_OCR/__init__.py


+ 5 - 0
YQ_OCR/configs/__init__.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Author : xujiayue
+# @File   : __init__.py
+
+

+ 6 - 1
YQ_OCR/config.py → YQ_OCR/configs/config.py

@@ -1,3 +1,8 @@
+url = 'http://192.168.199.107:18087'
+url_path = '/ocr_system/identify'
+# imgs_path = '/Users/sxkj/utils/YQ_OCR/img'
+imgs_path = './img'
+
 keyDict = {
     "productCategory": '产品种类',
     "ingredients": '配料',
@@ -8,4 +13,4 @@ keyDict = {
     "conSerHotline": '消费者服务热线',
     "tips": '温馨提示|友情提示',
     "welcome": '欢迎访问'
-}
+}

+ 25 - 0
YQ_OCR/img/03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版.json

@@ -0,0 +1,25 @@
+{
+    "productCategory": "产品种类:调制豆乳",
+    "ingredients": "配料:饮用水、大豆(非转基因)、白砂糖",
+    "proStanCode": "产品标准代号:GB/T30885",
+    "productionDate": "生产日期:见瓶盖",
+    "shelfLife": "保质期:常温密闭条件下9个月",
+    "storageConditions": "贮存条件:请保存于阴凉干燥处,避免阳光直晒、高温",
+    "conSerHotline": "消费者服务热线:4008169999",
+    "tips": "温馨提示:请勿带包装置于微波炉中加热。",
+    "welcome": "欢迎访问:www.yili.com",
+    "noKeyList": [
+        "植选",
+        "浓香豆乳畅饮系列",
+        "大豆添加量:44g/瓶",
+        "原味",
+        "全程非转基因可追溯大豆",
+        "3.0g/100mL",
+        "优质植物蛋白",
+        "保持环境清洁请勿乱抛空瓶",
+        "为保证产品风味,开启后需冷藏并尽快饮用完毕。",
+        "可能会有少量蛋白沉淀和脂肪上浮,属正常现象,请放心饮用。如发现涨瓶,请勿开启。",
+        "净含量:315mL",
+        "6907992515007"
+    ]
+}

Filskillnaden har hållts tillbaka eftersom den är för stor
+ 0 - 0
YQ_OCR/img/03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版.txt


+ 23 - 0
YQ_OCR/img/巧克力味牛奶饮品.json

@@ -0,0 +1,23 @@
+{
+    "productCategory": "产品种类:配制型含乳饮料",
+    "ingredients": "配料:生牛乳、饮用水、白砂糖、可可粉、食品添加剂(微晶纤维素、单,双甘油脂肪酸酯、蔗糖脂肪酸酯、柠檬酸钠、结冷胶、安赛蜜、三氯蔗糖、食品用香精)",
+    "proStanCode": "产品标准代号:GB/T21732",
+    "productionDate": "生产日期:见盒顶部",
+    "shelfLife": "保质期:常温密闭条件下6个月",
+    "storageConditions": "贮存条件:未开启前,无需冷藏,开启之后,立即饮用。",
+    "conSerHotline": "消费者服务热线:4008169999",
+    "tips": "友情提示:喝前摇一摇",
+    "welcome": "欢迎访问:www.yili.com",
+    "noKeyList": [
+        "牛奶饮品",
+        "产品名称:巧克力味牛奶饮品",
+        "生产日期:见箱体",
+        "切勿带包装置于微波炉中加热",
+        "清真",
+        "保持环境清洁请勿乱抛空包",
+        "伊利",
+        "(具体生产商/产地见生产日期末端代码)",
+        "净含量:250mL",
+        "6907992500102"
+    ]
+}

Filskillnaden har hållts tillbaka eftersom den är för stor
+ 0 - 0
YQ_OCR/img/巧克力味牛奶饮品.txt


+ 23 - 0
YQ_OCR/img/餐饮纯牛奶 内包.json

@@ -0,0 +1,23 @@
+{
+    "productCategory": "产品种类:全脂灭菌纯牛乳",
+    "ingredients": "配料:生牛乳",
+    "proStanCode": "产品标准代号:GB25190",
+    "productionDate": "生产日期:见盒顶部",
+    "shelfLife": "保质期:常温密闭条件下6个月",
+    "storageConditions": "贮存条件:未开启前无需冷藏开启之后请贮存于2-6℃并于2日内饮用完毕",
+    "conSerHotline": "消费者服务热线:4008169999",
+    "welcome": "欢迎访问:www.yili.com",
+    "noKeyList": [
+        "纯牛奶",
+        "餐饮之选",
+        "非脂乳固体≥8.5%",
+        "保持环境清洁请勿乱抛空包",
+        "切勿带包装置于微波炉中加热。",
+        "净含量:1L",
+        "6907992513621",
+        "内蒙古伊利实业集团股份有限公司出品 地址:内蒙古自治区呼和浩特市金山开发区金山大街1号",
+        "宁夏伊利乳业有限责任公司(A12) 产地及地址:宁夏吴忠市利通区金积工业园区 食品生产许可证编号:SC10564030200130",
+        "阜新伊利乳品有限责任公司(B6) 产地及地址:辽宁省阜新市阜蒙县园区路2号 食品生产许可证编号:SC10521090000011",
+        "定州伊利乳业有限责任公司(C1) 产地及地址:河北省定州市伊利工业园区 食品生产许可证编号:SC10613068200020"
+    ]
+}

Filskillnaden har hållts tillbaka eftersom den är för stor
+ 0 - 0
YQ_OCR/img/餐饮纯牛奶 内包.txt


+ 43 - 0
YQ_OCR/main.py

@@ -0,0 +1,43 @@
+import numpy as np
+from itertools import chain
+from mdutils.mdutils import MdUtils
+from YQ_OCR.utils.datasets import Dataset
+from YQ_OCR.utils.text2md import TableMD
+from YQ_OCR.utils.utils import *
+
+# 1. xlsx -> 正确json文件(写入厂家信息)
+# 2. 发送图片(带正确json文件)
+# 3. 把返回的json 和正确的json 进行对比(有key--用返回结果与正确结果比对,无key--用正确结果与返回结果比对)
+
+
if __name__ == '__main__':
    # Evaluate every image under imgs_path against its sibling ground-truth
    # files (<image>.json for key/value text, <image>.txt for the table) and
    # write one Markdown report per image.
    # NOTE(review): Path, imgs_path, open_true_json, send_request and
    # parse_result are not defined in this file; presumably they are provided
    # by the star import from YQ_OCR.utils.utils -- confirm.
    img_paths = chain(*[Path(imgs_path).rglob(f'*.{ext}') for ext in ['jpg', 'png', 'jpeg']])
    all_rate = []
    table_mean_acc = []
    for img_path in img_paths:
        print(img_path)

        # The ground-truth JSON is sent along with the image in the request.
        true_d, true_json = open_true_json(img_path.with_suffix('.json'))
        result = send_request(img_path, true_json)
        res_d = parse_result(result)

        markdown = TableMD(img_path.name)
        markdown.write_header(title='推理结果', level=2)
        # json result
        rate, statistics = markdown.evaluate_one(true_d, res_d)
        all_rate.append(rate)
        print(f'文字识别正确率:{rate:.2f}%')

        # table gt result
        dataset = Dataset(gt_file=img_path.with_suffix('.txt'), img_name=img_path.name, results=res_d)
        markdown.write_table_accuracy(ds=dataset, key='new')
        table_acc = markdown.get_table_accuracy()
        table_mean_acc.append(table_acc)
        print(f'表格识别正确率:{table_acc:.2f}%')
        markdown.f.create_md_file()

    print('----------------------------------------')
    if not all_rate:
        # np.mean([]) would emit a RuntimeWarning and print "nan%"; report the
        # empty run explicitly instead of a meaningless average.
        print(f'未找到任何图片:{imgs_path}')
    else:
        all_rate = "{:.2f}%".format(np.mean(all_rate))
        all_table_rate = "{:.2f}%".format(np.mean(table_mean_acc))
        print(f'文字识别总体正确率:{all_rate}')
        print(f'表格识别总体正确率:{all_table_rate}')

+ 69 - 0
YQ_OCR/output/03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版-表格识别结果.md

@@ -0,0 +1,69 @@
+
+
+
+
+# 测试结果报告
+
+## 推理结果
+
+## 文字识别正确率:80.95238095238095
+
+### 共21个字段,正确17个,错误4个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:调制豆乳|产品种类:调制豆乳|✅|
+|ingredients|配料:饮用水、大豆(非转基因)、白砂糖|配料:饮用水、大豆(非转基因)白砂糖大豆添加量:44g/瓶营美成分表|❌|
+|proStanCode|产品标准代号:GB/T30885|产品标准代号:GB/T30885|✅|
+|productionDate|生产日期:见瓶盖|生产日期:见瓶盖|✅|
+|shelfLife|保质期:常温密闭条件下9个月|保质期:常温密闭条件下9个月|✅|
+|storageConditions|贮存条件:请保存于阴凉干燥处,避免阳光直晒、高温|贮存条件:请保存十阴凉干燥处避免阳光直晒、高温。|❌|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|tips|温馨提示:请勿带包装置于微波炉中加热。|温馨提示:请勿带包装置于微波炉中加热。|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|植选|植选|✅|
+|无key值|浓香豆乳畅饮系列|浓香豆乳畅饮系列|✅|
+|无key值|大豆添加量:44g/瓶|大豆添加量:44g/瓶|✅|
+|无key值|原味|原味|✅|
+|无key值|全程非转基因可追溯大豆|全程非转基因可追溯大豆|✅|
+|无key值|3.0g/100mL|3.0g/100mL|✅|
+|无key值|优质植物蛋白|优质植物蛋白|✅|
+|无key值|保持环境清洁请勿乱抛空瓶|保持环境清洁请勿乱抛空瓶|✅|
+|无key值|为保证产品风味,开启后需冷藏并尽快饮用完毕。|为保证产品风味,开启后需冷藏并尽快饮用完毕。|✅|
+|无key值|可能会有少量蛋白沉淀和脂肪上浮,属正常现象,请放心饮用。如发现涨瓶,请勿开启。|可能会有少量蛋日沉淀和脂肪上浮属正常现象,请放心饮用。如发现胀瓶,请勿开启。|❌|
+|无key值|净含量:315mL|净含量:315mL|❌|
+|无key值|6907992515007|6907992515007|✅|
+
+## 表格识别正确率:88.89%
+
+### 共检测27处,正确24,错误3
+
+|位置|标注结果|新模型推理|是否一致|
+| :---: | :---: | :---: | :---: |
+|1行|项目|项目|✅|
+|1行|每100ml|每100ml|✅|
+|1行|NRV%|NRV%|✅|
+|2行|能量|能量|✅|
+|2行|207kJ|207kJ|✅|
+|2行|2%|2%|✅|
+|3行|蛋白质|蛋白质|✅|
+|3行|3.0g|3.0g|✅|
+|3行|5%|5%|✅|
+|4行|脂肪|脂肪|✅|
+|4行|2.0g|2.0g|✅|
+|4行|3%|3%|✅|
+|5行|一饱和脂肪|-饱和脂肪|❌|
+|5行|0.4g|0.4g|✅|
+|5行|2%|2%|✅|
+|6行|一反式脂肪|-反式脂肪|❌|
+|6行|0g|0g|✅|
+|6行|||✅|
+|7行|胆固醇|胆固醇|✅|
+|7行|0mg|Omg|❌|
+|7行|0%|0%|✅|
+|8行|碳水化合物|碳水化合物|✅|
+|8行|4.8g|4.8g|✅|
+|8行|2%|2%|✅|
+|9行|钠|钠|✅|
+|9行|35mg|35mg|✅|
+|9行|2%|2%|✅|

+ 57 - 0
YQ_OCR/output/巧克力味牛奶饮品-表格识别结果.md

@@ -0,0 +1,57 @@
+
+
+
+
+# 测试结果报告
+
+## 推理结果
+
+## 文字识别正确率:72.22222222222221
+
+### 共18个字段,正确13个,错误5个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:配制型含乳饮料|产品种类:配制型含乳饮料|✅|
+|proStanCode|产品标准代号:GB/T21732|产品标准代号:GB/T21732|✅|
+|productionDate|生产日期:见盒顶部|生产日期:见盒顶部|✅|
+|shelfLife|保质期:常温密闭条件下6个月|保质期:常温密闭条件下6个月|✅|
+|storageConditions|贮存条件:未开启前,无需冷藏,开启之后,立即饮用。|贮存条件:未开启前无需冷藏开启之后立即饮用|❌|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|tips|友情提示:喝前摇一摇|友情提示:喝前摇一摇|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|牛奶饮品|牛奶饮品|✅|
+|无key值|产品名称:巧克力味牛奶饮品|产品名称:巧克力味牛奶饮品|✅|
+|无key值|生产日期:见箱体|生产日期:见盒顶部|❌|
+|无key值|切勿带包装置于微波炉中加热|勿带包装置子微波炉中加热|❌|
+|无key值|清真|清真|✅|
+|无key值|保持环境清洁请勿乱抛空包|保持环境清洁请勿乱抛空包|✅|
+|无key值|伊利|伊利|✅|
+|无key值|(具体生产商/产地见生产日期末端代码)|(具体生产商/产地见生产日期末端代码)|❌|
+|无key值|净含量:250mL|净含量:250mL|❌|
+|无key值|6907992500102|6907992500102|✅|
+
+## 表格识别正确率:100.00%
+
+### 共检测18处,正确18,错误0
+
+|位置|标注结果|新模型推理|是否一致|
+| :---: | :---: | :---: | :---: |
+|1行|项目|项目|✅|
+|1行|每100mL|每100mL|✅|
+|1行|NRV%|NRV%|✅|
+|2行|能量|能量|✅|
+|2行|244kJ|244kJ|✅|
+|2行|3%|3%|✅|
+|3行|蛋白质|蛋白质|✅|
+|3行|1.3g|1.3g|✅|
+|3行|2%|2%|✅|
+|4行|脂肪|脂肪|✅|
+|4行|2.1g|2.1g|✅|
+|4行|4%|4%|✅|
+|5行|碳水化合物|碳水化合物|✅|
+|5行|8.5g|8.5g|✅|
+|5行|3%|3%|✅|
+|6行|钠|钠|✅|
+|6行|40mg|40mg|✅|
+|6行|2%|2%|✅|

+ 60 - 0
YQ_OCR/output/餐饮纯牛奶 内包-表格识别结果.md

@@ -0,0 +1,60 @@
+
+
+
+
+# 测试结果报告
+
+## 推理结果
+
+## 文字识别正确率:88.88888888888889
+
+### 共18个字段,正确16个,错误2个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:全脂灭菌纯牛乳|产品种类:全脂灭菌纯牛乳|✅|
+|ingredients|配料:生牛乳|配料:生牛乳|✅|
+|proStanCode|产品标准代号:GB25190|产品标准代号:GB25190|✅|
+|productionDate|生产日期:见盒顶部|生产日期:见盒顶部|✅|
+|shelfLife|保质期:常温密闭条件下6个月|保质期:常温密闭条件下6个月|✅|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|纯牛奶|纯牛奶|✅|
+|无key值|餐饮之选|餐饮之选|✅|
+|无key值|非脂乳固体≥8.5%|非脂乳固体≥8.5%|✅|
+|无key值|保持环境清洁请勿乱抛空包|保持环境清洁请勿乱抛空包|✅|
+|无key值|切勿带包装置于微波炉中加热。|切勿带包装置于微波炉中加热|❌|
+|无key值|净含量:1L|净含量:1L|❌|
+|无key值|6907992513621|6907992513621|✅|
+|无key值|内蒙古伊利实业集团股份有限公司出品 地址:内蒙古自治区呼和浩特市金山开发区金山大街1号|内蒙古伊利实业集团股份有限公司出品地址:内蒙古自治区呼和浩特市金山开发区金山大街1号|✅|
+|无key值|宁夏伊利乳业有限责任公司(A12) 产地及地址:宁夏吴忠市利通区金积工业园区 食品生产许可证编号:SC10564030200130|宁夏伊利乳业有限责任公司(A12)产地及地址:宁夏吴忠市利通区金积工业园区食品生产许可证编号:SC10564030200130|✅|
+|无key值|阜新伊利乳品有限责任公司(B6) 产地及地址:辽宁省阜新市阜蒙县园区路2号 食品生产许可证编号:SC10521090000011|阜新伊利乳品有限责任公司(B6)产地及地址:辽宁省阜新市阜蒙县园区路2号食品生产许可证编号:SC10521090000011|✅|
+|无key值|定州伊利乳业有限责任公司(C1) 产地及地址:河北省定州市伊利工业园区 食品生产许可证编号:SC10613068200020|定州伊利乳业有限责任公司(C1)产地及地址:河北省定州市伊利工业园区食品生产许可证编号:SC10613068200020|✅|
+
+## 表格识别正确率:100.00%
+
+### 共检测21处,正确21,错误0
+
+|位置|标注结果|新模型推理|是否一致|
+| :---: | :---: | :---: | :---: |
+|1行|项目|项目|✅|
+|1行|每100mL|每100mL|✅|
+|1行|NRV%|NRV% |✅|
+|2行|能量|能量|✅|
+|2行|280kJ|280kJ|✅|
+|2行|3%|3% |✅|
+|3行|蛋白质|蛋白质|✅|
+|3行|3.2g|3.2g|✅|
+|3行|5%|5% |✅|
+|4行|脂肪|脂肪|✅|
+|4行|3.8g|3.8g|✅|
+|4行|6%|6% |✅|
+|5行|碳水化合物|碳水化合物|✅|
+|5行|5.0g|5.0g|✅|
+|5行|2%|2% |✅|
+|6行|钠|钠|✅|
+|6行|53mg|53mg|✅|
+|6行|3%|3% |✅|
+|7行|钙|钙|✅|
+|7行|100mg|100mg|✅|
+|7行|13%|13%|✅|

+ 0 - 159
YQ_OCR/to_md/convert_MD.py

@@ -1,159 +0,0 @@
-import copy
-import re
-from itertools import chain
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-import json
-from mdutils.mdutils import MdUtils
-import requests
-
-from YQ_OCR.config import keyDict
-
-url = 'http://192.168.199.107:18087'
-url_path = '/ocr_system/identify'
-imgs_path = '/Users/sxkj/to_md/YQ_OCR/img'
-
-
-# 1. xlsx -> 正确json文件(写入厂家信息)
-# 2. 发送图片(带正确json文件)
-# 3. 把返回的json 和正确的json 进行对比(有key--用返回结果与正确结果比对,无key--用正确结果与返回结果比对)
-
-# 编辑距离
-def Levenshtein_Distance(str1, str2):
-    matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
-    for i in range(1, len(str1) + 1):
-        for j in range(1, len(str2) + 1):
-            d = 0 if (str1[i - 1] == str2[j - 1]) else 1
-            matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d)
-    return matrix[len(str1)][len(str2)]
-
-
-# 发送请求 带正确答案参数
-def send_request(img_path: Path, img_json: str):
-    file = {'file': (img_path.name, open(img_path, 'rb'), img_path)}
-    payload = {'docDataStr': img_json}
-    r = requests.post(url + url_path, files=file, data=payload)
-    return r.json()
-
-
-# 处理返回结果
-def _parse_result(r):  # sourcery skip: dict-comprehension
-    if r['status'] == '000':
-        result = r['result']
-        res = {}
-        for field in keyDict:
-            if field in result:
-                res[field] = result[field]
-        res['noKeyList'] = result['noKeyList']
-        res['logoList'] = result['logoList']
-        logoFileName = [log['logoFileName'] for log in res['logoList']]
-        res['logoList'] = logoFileName
-        return res
-    elif r['status'] == '101':
-        return "101"
-
-
-# 比较两个json文件 并在md文件中写入对比结果
-def evaluate_one(xlsx_dict, res_dict):
-    true_num = 0
-    xlsx_dict_no_space: dict = copy.deepcopy(xlsx_dict)
-    for index, text in xlsx_dict_no_space.items():
-        if type(xlsx_dict_no_space[index]) is str:
-            xlsx_dict_no_space[index] = text.replace(' ', '')
-        elif type(xlsx_dict_no_space[index]) is list:
-            for k, v in enumerate(xlsx_dict_no_space[index]):
-                xlsx_dict_no_space[index][k] = v.replace(' ', '')
-    # 有key值的比较
-    for key_yes in res_dict:
-        if type(res_dict[key_yes]) is str:
-            if Levenshtein_Distance(res_dict[key_yes], xlsx_dict_no_space[key_yes]) == 0:
-                table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
-                true_num += 1
-            else:
-                table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
-    # 无key值的比较
-    key_no_dict = {}
-    for key_no_xlsx_no_space, key_no_xlsx in zip(xlsx_dict_no_space['noKeyList'], xlsx_dict['noKeyList']):
-        key_no_dict[key_no_xlsx_no_space] = []
-        for key_no_res in res_dict['noKeyList']:
-            key_no_dict[key_no_xlsx_no_space].append((Levenshtein_Distance(key_no_xlsx_no_space, key_no_res), key_no_res))
-        sort_NoKey = sorted(key_no_dict[key_no_xlsx_no_space], key=lambda x: x[0])
-        NoKey_min_distance = sort_NoKey[0][0]
-        if NoKey_min_distance == 0:
-            table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
-            true_num += 1
-        else:
-            table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
-
-    # 算正确率
-    all_num = len(table_result) // 4 - 1
-    rate = true_num / all_num
-    all_rate.append(rate)
-    statistics = f'共{all_num}个字段,正确{true_num}个,错误{all_num - true_num}个'
-    return "{:.2f}%".format(rate * 100), statistics
-
-
-# def evaluate_one(xlsx_dict, res_dict):
-#     true_num = 0
-#     # 有key值的比较
-#     for key_yes in res_dict:
-#         if type(res_dict[key_yes]) is str:
-#             if Levenshtein_Distance(res_dict[key_yes], xlsx_dict[key_yes]) == 0:
-#                 table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
-#                 true_num += 1
-#             else:
-#                 table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
-#     # 无key值的比较
-#     key_no_dict = {}
-#     for key_no_xlsx in xlsx_dict['noKeyList']:
-#         key_no_dict[key_no_xlsx] = []
-#         for key_no_res in res_dict['noKeyList']:
-#             key_no_dict[key_no_xlsx].append((Levenshtein_Distance(key_no_xlsx, key_no_res), key_no_res))
-#         sort_NoKey = sorted(key_no_dict[key_no_xlsx], key=lambda x: x[0])
-#         NoKey_min_distance = sort_NoKey[0][0]
-#         if NoKey_min_distance == 0:
-#             table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
-#             true_num += 1
-#         else:
-#             table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
-#     # 算正确率
-#     rate = true_num / (len(table_result) / 4)
-#     all_rate.append(rate)
-#     statistics = f'共{len(table_result) // 4}个字段,正确{true_num}个,错误{len(table_result) // 4 - true_num}个'
-#     return "{:.2f}%".format(rate * 100), statistics
-
-
-# 打开正确的json文件
-def open_true_json(j_path):
-    with j_path.open('r') as f:
-        j_dict = json.load(f)
-        j_json_str = json.dumps(j_dict, ensure_ascii=False)
-        return j_dict, j_json_str
-
-
-if __name__ == '__main__':
-    img_paths = chain(*[Path(imgs_path).rglob(f'*.{ext}') for ext in ['jpg', 'png', 'jpeg', 'PNG', 'JPG', 'JPEG']])
-    all_rate = []
-    for img_path in img_paths:
-        print(img_path)
-        # json result
-        true_d, true_json = open_true_json(img_path.with_suffix('.json'))
-        result = send_request(img_path, true_json)
-        res_d = _parse_result(result)
-        # md
-        md_file_path = img_path.parent / (img_path.with_suffix('.md'))
-        MD = MdUtils(file_name=str(md_file_path))
-        table_result = ['key值', '正确答案', 'ocr返回结果', '是否正确']
-        rate, statistics = evaluate_one(true_d, res_d)
-        MD.new_header(level=1, title='测试结果')
-        MD.new_header(level=2, title=f'正确率:{rate}')
-        MD.new_header(level=3, title=statistics)
-        print(f'正确率:{rate}')
-        MD.new_table(columns=4, rows=len(table_result) // 4, text=table_result, text_align='center')
-        MD.create_md_file()
-
-    print('-------------------------------')
-    all_rate = "{:.2f}%".format(np.mean(all_rate) * 100)
-    print(f'总体正确率:{all_rate}')

+ 80 - 0
YQ_OCR/utils/datasets.py

@@ -0,0 +1,80 @@
+import html2text
+import jsonlines
+
+
class Dataset(object):
    """Pair one image's predicted table with its ground-truth annotation.

    Attributes:
        gt_file: path of the ground-truth file; it is opened with
            ``jsonlines`` and each record is read through its ``'gt'`` key.
        img_name: name of the image the prediction belongs to.
        results: parsed service response; the predicted table markup is read
            from ``results['tableList'][0]``.
        pre_list: predicted table rows, filled by :meth:`get_pre_list`.
        gt_list: ground-truth table rows, filled by :meth:`get_gt_list`.
    """

    def __init__(self, gt_file, img_name, results):
        self.gt_file = gt_file
        self.img_name = img_name
        self.results = results
        self.pre_list = []
        self.gt_list = []

    def __len__(self):
        """Return the number of rows in the longer of the two row lists.

        ``__len__`` must return an ``int``; the previous implementation
        returned ``[len(pre_list), len(gt_list)]``, which made ``len(ds)``
        raise ``TypeError``. Read ``pre_list`` / ``gt_list`` directly when the
        individual sizes are needed.
        """
        return max(len(self.pre_list), len(self.gt_list))

    def get_pre_list(self):
        """Parse the first predicted table into rows, cache and return them."""
        pre_xml = self.results['tableList'][0]
        self.pre_list = parse_pre_str(pre_xml)
        return self.pre_list

    def get_pre_structure(self):
        """Return the first predicted table rendered as text by html2text."""
        pre_xml = self.results['tableList'][0]
        return html2text.html2text(pre_xml)

    def get_gt_list(self):
        """Parse the ground-truth table into rows, cache and return them.

        Every record of ``gt_file`` is parsed in turn, so when the file holds
        several records the rows of the last one are kept. An empty file
        leaves ``gt_list`` unchanged.
        """
        with jsonlines.open(self.gt_file, 'r') as rfd:
            for data in rfd:
                self.gt_list = parse_gt_str(data['gt'])
        return self.gt_list

    def get_gt_structure(self):
        """Return the first ground-truth record rendered as text by html2text.

        Falls back to an error message string when ``gt_file`` has no record.
        """
        with jsonlines.open(self.gt_file, 'r') as rfd:
            for data in rfd:
                # Only the first record is used for the structure view.
                return html2text.html2text(data['gt'])
        return 'Error:并未找到需要该图片的标注信息!'
+
+
def parse_gt_str(text):
    """Flatten ground-truth table markup into a list of row strings.

    Wrapper tags and cell-opening tags (plain ``<td>`` and the three
    span variants listed below) are removed, every ``</td>`` becomes ``*``
    so each cell ends with ``*``, and the text is split on ``</tr>``.

    Args:
        text: table markup as a string.

    Returns:
        One string per row, e.g. ``['a*b*', 'c*d*']``. Empty input yields
        ``['']``.
    """
    # Removed in the same order as before so the result is unchanged for any
    # input the old code handled correctly.
    for tag in ('<td colspan="3">', '<td colspan="2">', '<td rowspan="2">',
                '<html>', '</html>', '<body>', '</body>',
                '<table>', '</table>', '<tbody>', '</tbody>', '<td>'):
        text = text.replace(tag, '')
    text = text.replace('</td>', '*')
    text = text.replace('<tr>', '')

    # Trim whole '</tr>' tags from both ends. str.strip('</tr>') treats its
    # argument as a set of characters and would also eat a leading 't', 'r',
    # '/', '<' or '>' that belongs to the first cell's text.
    row_end = '</tr>'
    while text.startswith(row_end):
        text = text[len(row_end):]
    while text.endswith(row_end):
        text = text[:-len(row_end)]
    return text.split(row_end)
+
+
def parse_pre_str(text):
    """Flatten predicted table markup into a list of row strings.

    Wrapper tags and cell-opening tags (plain ``<td>`` and the three
    span variants listed below) are removed, every ``</td>`` becomes ``*``
    so each cell ends with ``*``, and the text is split on ``</tr>``.

    Args:
        text: table markup as a string.

    Returns:
        One string per row, e.g. ``['a*b*', 'c*d*']``. Empty input yields
        ``['']``.
    """
    # Removed in the same order as before so the result is unchanged for any
    # input the old code handled correctly.
    for tag in ('<td colspan="3">', '<td colspan="2">', '<td rowspan="2">',
                '<html>', '</html>', '<body>', '</body>',
                '<table>', '</table>', '<tbody>', '</tbody>', '<td>'):
        text = text.replace(tag, '')
    text = text.replace('</td>', '*')
    text = text.replace('<tr>', '')

    # Trim whole '</tr>' tags from both ends. str.strip('</tr>') treats its
    # argument as a set of characters and would also eat a leading 't', 'r',
    # '/', '<' or '>' that belongs to the first cell's text.
    row_end = '</tr>'
    while text.startswith(row_end):
        text = text[len(row_end):]
    while text.endswith(row_end):
        text = text[:-len(row_end)]
    return text.split(row_end)

+ 185 - 0
YQ_OCR/utils/text2md.py

@@ -0,0 +1,185 @@
+import copy
+from typing import List
+from mdutils.mdutils import MdUtils
+from YQ_OCR.utils.datasets import Dataset
+from YQ_OCR.utils.utils import Levenshtein_Distance
+
+
+class TableMD(object):
+    def __init__(self, img_name):
+        self.img_name = img_name
+        self.acc = 0
+        self.f = MdUtils(file_name='./output/' + self.img_name.split('.')[0] + '-表格识别结果')
+
+        self.table_structure: List = ['原模型表格正确率', '新模型表格准确率']
+        self.table_result: List = ['key值', '正确答案', 'ocr返回结果', '是否正确']
+        self.new_table_text: List = ['位置', '标注结果', '新模型推理', '是否一致']
+        self.old_table_text: List = ['位置', '标注结果', '原模型推理', '是否一致']
+        self.write_header(f'测试结果报告')
+
+    def write_header(self, title, level=1):
+        self.f.new_header(level=level, title=title)
+
+    def write_table_accuracy(self, ds: Dataset, key, columns=4, text_align='center'):
+        def get_format_table_accuracy(str1, str2):
+            n1 = len(str1)
+            n2 = len(str2)
+            if n1 == 0 or n2 == 0:
+                return ''
+            dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
+            Max = 0
+            pos = 0
+            for i in range(1, n1 + 1):
+                for j in range(1, n2 + 1):
+                    if str1[i - 1] == str2[j - 1]:
+                        dp[i][j] = dp[i - 1][j - 1] + 1
+                    else:
+                        dp[i][j] = 0
+                    if dp[i][j] > Max:
+                        Max = dp[i][j]
+                        pos = i - 1
+            return str1[pos - Max + 1:pos + 1]
+
+        pre_list = ds.get_pre_list()
+        gt_list = ds.get_gt_list()
+        # print(pre_list)
+        # print(gt_list)
+        correct = 0
+        count = 0
+        n = len(pre_list)
+        m = len(gt_list)
+        if n < m:
+            pre_list.extend(['' for _ in range(m - n)])
+        else:
+            gt_list.extend(['' for _ in range(n - m)])
+
+        for x in range(len(gt_list)):
+            gt_parse_list = gt_list[x].split('*')
+            gt_parse_list.pop()
+            pre_parse_list = pre_list[x].split('*')
+            pre_parse_list.pop()
+            # print(gt_parse_list)
+            # print(pre_parse_list)
+            n1 = len(pre_parse_list)
+            m1 = len(gt_parse_list)
+            # print(n1, m1)
+            if n1 < m1:
+                pre_parse_list.extend(['' for _ in range(m1 - n1)])
+            else:
+                gt_parse_list.extend(['' for _ in range(n1 - m1)])
+
+            for j in range(len(gt_parse_list)):
+                count += 1
+                # infer = get_format_table_accuracy(gt_list[x], pre_list[x])
+                if gt_parse_list[j] == pre_parse_list[j] or \
+                        gt_parse_list[j].replace(' ', '') == pre_parse_list[j].replace(' ', ''):
+                    correct += 1
+                if key == 'new':
+                    self.new_table_text.extend(
+                        [f'{x + 1}行',
+                         gt_parse_list[j],
+                         pre_parse_list[j],
+                         '✅' if gt_parse_list[j] == pre_parse_list[j] or gt_parse_list[j].replace(' ', '') ==
+                                pre_parse_list[j].replace(' ', '') else '❌'])
+                elif key == 'old':
+                    self.old_table_text.extend(
+                        [f'{x + 1}行',
+                         gt_parse_list[j],
+                         pre_parse_list[j],
+                         '✅' if gt_parse_list[j] == pre_parse_list[j] or gt_parse_list[j].replace(' ', '') ==
+                                pre_parse_list[j].replace(' ', '') else '❌'])
+
+        acc = correct / count * 100
+        self.acc = acc
+        if key == 'new':
+            rows = len(self.new_table_text) // columns
+            self.write_header(level=2, title=f'表格识别正确率:{acc:.2f}%')
+            self.write_header(level=3, title=f'共检测{count}处,'
+                                             f'正确{correct},'
+                                             f'错误{count - correct}')
+            self.f.new_table(columns=columns, rows=rows, text=self.new_table_text, text_align=text_align)
+        elif key == 'old':
+            rows = len(self.old_table_text) // columns
+            self.write_header(level=2, title=f'表格识别正确率:{acc:.2f}%')
+            self.write_header(level=3, title=f'共检测{count}处,'
+                                             f'正确{correct},'
+                                             f'错误{count - correct}')
+            self.f.new_table(columns=columns, rows=rows, text=self.old_table_text, text_align=text_align)
+
+    def get_table_accuracy(self):
+        if self.acc < 0.6:
+            with open('../output/worst.txt', 'a') as f:
+                f.write(self.img_name + '\n')
+        return self.acc
+
+    # 比较两个json文件 并在md文件中写入对比结果
+    def evaluate_one(self, xlsx_dict, res_dict):
+        true_num = 0
+        xlsx_dict_no_space: dict = copy.deepcopy(xlsx_dict)
+        for index, text in xlsx_dict_no_space.items():
+            if type(xlsx_dict_no_space[index]) is str:
+                xlsx_dict_no_space[index] = text.replace(' ', '')
+            elif type(xlsx_dict_no_space[index]) is list:
+                for k, v in enumerate(xlsx_dict_no_space[index]):
+                    xlsx_dict_no_space[index][k] = v.replace(' ', '')
+        # 有key值的比较
+        for key_yes in res_dict:
+            if type(res_dict[key_yes]) is str:
+                if Levenshtein_Distance(res_dict[key_yes], xlsx_dict_no_space[key_yes]) == 0:
+                    self.table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
+                    true_num += 1
+                else:
+                    self.table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
+        # 无key值的比较
+        key_no_dict = {}
+        for key_no_xlsx_no_space, key_no_xlsx in zip(xlsx_dict_no_space['noKeyList'], xlsx_dict['noKeyList']):
+            key_no_dict[key_no_xlsx_no_space] = []
+            for key_no_res in res_dict['noKeyList']:
+                key_no_dict[key_no_xlsx_no_space].append(
+                    (Levenshtein_Distance(key_no_xlsx_no_space, key_no_res), key_no_res))
+            sort_NoKey = sorted(key_no_dict[key_no_xlsx_no_space], key=lambda x: x[0])
+            NoKey_min_distance = sort_NoKey[0][0]
+            if NoKey_min_distance == 0:
+                self.table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
+                true_num += 1
+            else:
+                self.table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
+
+        # 算正确率
+        all_num = len(self.table_result) // 4 - 1
+        rate = true_num / all_num * 100
+        # all_rate.append(rate)
+        statistics = f'共{all_num}个字段,正确{true_num}个,错误{all_num - true_num}个'
+        self.write_header(level=2, title=f'文字识别正确率:{rate}')
+        self.write_header(level=3, title=statistics)
+        self.f.new_table(columns=4, rows=len(self.table_result) // 4, text=self.table_result, text_align='center')
+        return rate, statistics
+
+    # def evaluate_one(xlsx_dict, res_dict):
+    #     true_num = 0
+    #     # 有key值的比较
+    #     for key_yes in res_dict:
+    #         if type(res_dict[key_yes]) is str:
+    #             if Levenshtein_Distance(res_dict[key_yes], xlsx_dict[key_yes]) == 0:
+    #                 table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '✅'])
+    #                 true_num += 1
+    #             else:
+    #                 table_result.extend([key_yes, xlsx_dict[key_yes], res_dict[key_yes], '❌'])
+    #     # 无key值的比较
+    #     key_no_dict = {}
+    #     for key_no_xlsx in xlsx_dict['noKeyList']:
+    #         key_no_dict[key_no_xlsx] = []
+    #         for key_no_res in res_dict['noKeyList']:
+    #             key_no_dict[key_no_xlsx].append((Levenshtein_Distance(key_no_xlsx, key_no_res), key_no_res))
+    #         sort_NoKey = sorted(key_no_dict[key_no_xlsx], key=lambda x: x[0])
+    #         NoKey_min_distance = sort_NoKey[0][0]
+    #         if NoKey_min_distance == 0:
+    #             table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '✅'])
+    #             true_num += 1
+    #         else:
+    #             table_result.extend(['无key值', key_no_xlsx, sort_NoKey[0][1], '❌'])
+    #     # 算正确率
+    #     rate = true_num / (len(table_result) / 4)
+    #     all_rate.append(rate)
+    #     statistics = f'共{len(table_result) // 4}个字段,正确{true_num}个,错误{len(table_result) // 4 - true_num}个'
+    #     return "{:.2f}%".format(rate * 100), statistics

+ 48 - 0
YQ_OCR/utils/utils.py

@@ -0,0 +1,48 @@
+import json
+import requests
+from pathlib import Path
+from YQ_OCR.configs.config import *
+
+
+# 发送请求 带正确答案参数
+def send_request(img_path: Path, img_json: str):
+    file = {'file': (img_path.name, open(img_path, 'rb'), img_path)}
+    payload = {'docDataStr': img_json}
+    r = requests.post(url + url_path, files=file, data=payload)
+    return r.json()
+
+
+# 编辑距离
+def Levenshtein_Distance(str1, str2):
+    matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
+    for i in range(1, len(str1) + 1):
+        for j in range(1, len(str2) + 1):
+            d = 0 if (str1[i - 1] == str2[j - 1]) else 1
+            matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d)
+    return matrix[len(str1)][len(str2)]
+
+
+# 处理返回结果
+def parse_result(r):  # sourcery skip: dict-comprehension
+    if r['status'] == '000':
+        result = r['result']
+        res = {}
+        for field in keyDict:
+            if field in result:
+                res[field] = result[field]
+        res['noKeyList'] = result['noKeyList']
+        res['logoList'] = result['logoList']
+        res['tableList'] = result['tableList']
+        logoFileName = [log['logoFileName'] for log in res['logoList']]
+        res['logoList'] = logoFileName
+        return res
+    elif r['status'] == '101':
+        return "101"
+
+
+# 打开正确的json文件
+def open_true_json(j_path):
+    with j_path.open('r', encoding='utf-8') as f:
+        j_dict = json.load(f)
+        j_json_str = json.dumps(j_dict, ensure_ascii=False)
+        return j_dict, j_json_str

+ 2 - 3
YQ_OCR/to_md/xlsx_convert_json.py → YQ_OCR/utils/xlsx_convert_json.py

@@ -2,14 +2,13 @@ import json
 import re
 from itertools import chain
 from pathlib import Path
-
 import pandas as pd
-from YQ_OCR.config import keyDict
+from YQ_OCR.configs.config import keyDict
 
 # 把xlsx转成json
 
 
-excels_path = '/Users/sxkj/to_md/YQ_OCR/img'
+excels_path = '/Users/sxkj/utils/YQ_OCR/img'
 
 
 # 返回文档里所以所需识别字符串

Vissa filer visades inte eftersom för många filer har ändrats