Explorar el Código

add max_profit

yan chuanli hace 2 años
padre
commit
6571a67e62

+ 3 - 2
.gitignore

@@ -1,5 +1,6 @@
 .DS_Store
 .idea
+.vscode
 __pycache__/
-*.json
-*.md
+#*.json
+#*.md

+ 8 - 0
HR_OCR/TestAllOcr/config.py

@@ -1,3 +1,11 @@
+'''
+Author: zeke-chin zeke-chin@icloud.com
+Date: 2022-09-26 14:58:10
+LastEditors: zeke-chin zeke-chin@icloud.com
+LastEditTime: 2022-09-30 09:59:43
+FilePath: /to_md/HR_OCR/TestAllOcr/config.py
+Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
+'''
 import base64
 import requests
 

+ 9 - 1
HR_OCR/TestAllOcr/test_interface.py

@@ -1,3 +1,11 @@
+'''
+Author: zeke-chin zeke-chin@icloud.com
+Date: 2022-09-28 20:28:41
+LastEditors: zeke-chin zeke-chin@icloud.com
+LastEditTime: 2022-09-30 10:06:35
+FilePath: /to_md/HR_OCR/TestAllOcr/test_interface.py
+Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
+'''
 from pathlib import Path
 import unittest
 import config
@@ -6,7 +14,7 @@ from config import send_request
 image_path = 'image'
 
 # DX_test, DX_sb, DX_test, DX_sb
-envl = 'DX_test'
+envl = 'TX_sb'
 url = config.URL[envl]
 token = config.TOKEN[envl]
 

+ 46 - 0
HR_OCR/test_script/to_md/README.md

@@ -0,0 +1,46 @@
+# 人力OCR
+
+## 生成markdown测试报告脚本
+
+1. 生成**目的文件夹**下图片的**json文件**
+
+- **目的文件夹**: 
+  - 存放所需测试**图片文件夹**
+  - 对**文件夹**内图片进行标准化
+    - 运行`suffix.py`脚本
+    - 产生*.jpg
+  - 生成<u>算法推理json文件</u>
+    - 运行`convert_json.py`脚本
+    - 产生对应jpg 文件的json文件
+  - 修改<u>算法推理json文件</u>成**正确的json文件**
+
+2. 跑生成md报告脚本
+
+- 修改use.py 并运行
+
+  ```python
+  # config
+  
+  # 目的文件夹
+  image_path = Path('/Users/zeke/work/sx/OCR/image_data/户口本9.30/0/img/')
+  # 图片type(如果接口不存在传0不影响结果)
+  image_type = 0
+  # 是否旋转
+  image_rotate = False
+  
+  # ocr地址选择
+  # 本地环境、腾讯云测试环境、腾讯云生产环境、电信云测试环境、电信云生产环境
+  ocr_address = 'local'  # 'local' 'TXtest' 'TXsb' 'DXtest' 'DXsb'
+  # ocr能力选择
+  ocr_name = 'regbook'  # 'cet' 'idcard' 'bankcard' 'regbook' 'schoolcert' 'business_license'
+  # 生成的MD文件名
+  md_name = 'RegBook'
+  # ocr能力对应的字段列表(ocr_config.py 中 Filed 的键)
+  filed = 'regbook'
+  
+  # 若md_path为None 则默认使用图片父路径为markdown保存路径
+  # md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+  md_path = None or image_path.parent
+  ```
+
+  

BIN
HR_OCR/test_script/to_md/example/img/1_img.jpg


+ 9 - 0
HR_OCR/test_script/to_md/example/img/1_img.json

@@ -0,0 +1,9 @@
+{
+    "orientation": 0,
+    "name": "鉴康",
+    "id": "152801200003178527",
+    "language": "英语",
+    "level": "CET4",
+    "exam_time": "2021年6月",
+    "score": "451"
+}

BIN
HR_OCR/test_script/to_md/example/img/2_img.jpg


+ 9 - 0
HR_OCR/test_script/to_md/example/img/2_img.json

@@ -0,0 +1,9 @@
+{
+    "orientation": 0,
+    "name": "张鑫",
+    "id": "140227199809282317",
+    "language": "英语",
+    "level": "CET4",
+    "exam_time": "2021年6月",
+    "score": "445"
+}

+ 270 - 0
HR_OCR/test_script/to_md/new.py

@@ -0,0 +1,270 @@
+from pathlib import Path
+from typing import List, Optional
+import cv2
+import requests
+from mdutils.mdutils import MdUtils
+from dataclasses import dataclass
+import json
+import time
+import base64
+from itertools import chain
+from tqdm import tqdm
+from ocr_config import OCR_CONFIGS, Filed
+
+
+class Image:
+    """A test image paired with its ground-truth JSON annotation.
+
+    The annotation is expected next to the image as ``<stem>.json`` and is
+    loaded eagerly by ``__init__``.
+
+    Attributes:
+        rotate: Rotation code handed to ``cv2.rotate`` before upload, or
+            ``None`` to upload the file unchanged.
+        is_rotate: Whether the run is a rotation test; decides how the
+            expected ``orientation`` is derived in ``get_gt_result``.
+        category: Starts as ``True``; the evaluation code sets it to ``False``
+            when the image is considered failed.
+        gt_result: Parsed content of the annotation file.
+    """
+
+    def __init__(self, path: Path, rotate, is_rotate):
+        self._path = path
+        self.rotate = rotate
+        self._ocr_result = None
+        self.category = True
+        self.is_rotate = is_rotate
+        try:
+            self.gt_result = self.get_json()
+        except Exception:
+            # Show which annotation file is missing or malformed, then propagate.
+            print(self.json_path)
+            raise
+
+    def __repr__(self):
+        return f'path: {self.path}, rotate: {self.rotate}, gt_result: {self.gt_result}, cate: {self.category}'
+
+    # Read/write property: ``save_image`` re-points it at the rotated copy.
+    @property
+    def path(self):
+        return self._path
+
+    @path.setter
+    def path(self, path):
+        self._path = path
+
+    @property
+    def fn(self):
+        """File name of the current path without its extension."""
+        return self._path.stem
+
+    @property
+    def ocr_result(self):
+        return self._ocr_result
+
+    @ocr_result.setter
+    def ocr_result(self, value):
+        self._ocr_result = value
+
+    def get_gt_result(self, key):# sourcery skip: merge-duplicate-blocks, remove-redundant-if
+        """Return the expected value for ``key``, or ``None`` if it is not annotated.
+
+        ``orientation`` is special: in a rotation run it is derived from the
+        rotation code (``rotate + 1``, or ``0`` for the unrotated sample)
+        instead of being read from the annotation.
+        """
+        if key == 'orientation':
+            if self.is_rotate:
+                return self.rotate + 1 if self.rotate is not None else 0
+            return self.gt_result[key]
+        if key in self.gt_result:
+            return self.gt_result[key]
+        return None
+
+    @property
+    def json_path(self):
+        """Path of the annotation file that sits next to the current image path."""
+        return self.path.parent / f'{self.path.stem}.json'
+
+    def save_image(self, img, rotate):
+        """Write ``img`` as ``<stem>-<rotate + 1>.jpg`` into the sibling ``.ro_dst`` folder.
+
+        ``self.path`` is updated to the new file, which is also returned.
+        """
+        dst = self.path.parent.parent / (".ro_dst")
+        # exist_ok avoids the check-then-create race of exists() + mkdir().
+        dst.mkdir(exist_ok=True)
+        self.path = dst / f'{self.path.stem}-{rotate + 1}.jpg'
+        # get_base64 hands over RGB data; cv2.imwrite expects BGR.
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+        cv2.imwrite(str(self.path), img)
+        return self.path
+
+    def get_base64(self, rotate=None):
+        """Return the image file content as a base64 string.
+
+        When ``rotate`` is given, the image is rotated with ``cv2.rotate``,
+        saved through ``save_image`` (which changes ``self.path``) and the
+        saved file is what gets encoded.
+
+        Raises:
+            FileNotFoundError: if OpenCV cannot read the image at ``self.path``.
+        """
+        img = cv2.imread(str(self.path))
+        if img is None:
+            # cv2.imread signals failure by returning None instead of raising.
+            raise FileNotFoundError(f'cannot read image: {self.path}')
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        path = self.path
+        if rotate is not None:
+            img = cv2.rotate(img, rotate)
+            # The rotated image is written to disk and re-read below.
+            path = self.save_image(img, rotate)
+        with open(path, 'rb') as f:
+            return base64.encodebytes(f.read()).decode('utf-8')
+
+    def get_json(self):
+        """Load and return the annotation stored at ``json_path``."""
+        # The annotation files contain non-ASCII text and are written as UTF-8,
+        # so do not rely on the platform default encoding.
+        with open(self.json_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+
+
+def send_request(image: Image, ocr_name, ocr_address, image_type=None, timeout=60):
+    """POST one image to the configured OCR endpoint and return the decoded JSON reply.
+
+    Args:
+        image: Image to upload; it is encoded via ``image.get_base64(image.rotate)``.
+        ocr_name: First-level key into ``OCR_CONFIGS``.
+        ocr_address: Second-level key into ``OCR_CONFIGS``.
+        image_type: Added to the request body as ``image_type`` unless ``None``.
+        timeout: Seconds passed to ``requests.post``; without a timeout a
+            stalled endpoint would block the whole evaluation indefinitely.
+
+    Returns:
+        The response body parsed as JSON.
+    """
+    base64_str = image.get_base64(image.rotate)
+    config = OCR_CONFIGS[ocr_name][ocr_address]
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': config.token
+    }
+    data = {
+        'image': base64_str,
+    }
+    if image_type is not None:
+        data['image_type'] = image_type
+    response = requests.post(config.url, headers=headers, json=data, timeout=timeout)
+    return response.json()
+
+
+def parser_path(path: Path, rotate: bool):
+    """Return ``path`` with its file name prefixed by today's ``MM-DD_`` date.
+
+    For rotation runs ``_R.md`` is appended to the name; otherwise no
+    extension is added here.
+    """
+    date_prefix = time.strftime("%m-%d_", time.localtime())
+    rotation_suffix = '_R.md' if rotate else ''
+    return path.parent / f'{date_prefix}{path.name}{rotation_suffix}'
+
+
+class Dataset(object):
+    """All ``*.jpg`` images under a folder plus per-field OCR accuracy counters.
+
+    Args:
+        images_path: Folder searched recursively for ``*.jpg`` files.
+        image_type: Forwarded to ``send_request`` for every image.
+        ocr_name: Forwarded to ``send_request`` for every image.
+        ocr_address: Forwarded to ``send_request`` for every image.
+        field: Key into ``Filed`` selecting the list of fields to compare.
+        rotate: When true, every image yields four samples: the untouched
+            file plus one per rotation code 0, 1 and 2.
+
+    Raises:
+        ValueError: if ``field`` is not a key of ``Filed``.
+    """
+
+    def __init__(self, images_path, image_type, ocr_name, ocr_address, field, rotate=False):
+        self.image_type = image_type
+        self.ocr_name = ocr_name
+        self.ocr_address = ocr_address
+        self.images_path = images_path
+        self.image_list = []
+
+        for p in Path(self.images_path).rglob('*.jpg'):
+            if rotate:
+                self.image_list.extend(Image(p, r, rotate) for r in [None, 0, 1, 2])
+            else:
+                self.image_list.append(Image(p, None, rotate))
+
+        self.field = Filed.get(field)
+        if self.field is None:
+            # Fail with a readable message instead of "'NoneType' is not iterable".
+            raise ValueError(f'unknown field set {field!r}; expected one of {list(Filed)}')
+
+        # Per-field counters of matching / non-matching predictions.
+        self.correct = {k: 0 for k in self.field}
+        self.error = {k: 0 for k in self.field}
+
+    def __len__(self):
+        return len(self.image_list)
+
+    def _evaluate_one(self, image: Image):
+        """Send one image to the OCR service and update the counters.
+
+        On status ``'000'`` every configured field that is present in the
+        result is compared with the ground truth; fields missing from the
+        result are counted neither as correct nor as error. On any other
+        status every field is counted as an error and the service message is
+        stored as the image's ``ocr_result``.
+        """
+        def _get_predict(r, key):
+            # A field is either a plain value or a dict carrying it under 'text'.
+            if isinstance(r[key], dict):
+                return r[key]['text']
+            else:
+                return r[key]
+
+        if image.rotate is not None: image.gt_result['orientation'] = image.rotate + 1
+        r = send_request(image, self.ocr_name, self.ocr_address, self.image_type)
+        err_str = ''
+        if r['status'] == '000':
+            res = r['result']
+            for key in self.field:
+                if key in res:
+                    gt = image.get_gt_result(key)
+                    predict = _get_predict(res, key)
+                    if predict == gt:
+                        self.correct[key] += 1
+                    else:
+                        image.category = False
+                        self.error[key] += 1
+                        err_str += f'-------{key}-------<br>正确:{gt}<br>返回:{predict}<br>'
+            if image.category:
+                image.ocr_result = image.gt_result
+            else:
+                image.ocr_result = err_str
+        else:
+            image.ocr_result = r['msg']
+            image.category = False
+            for key in self.field:
+                self.error[key] += 1
+
+    def __call__(self):
+        """Return a generator over the images."""
+        yield from self.image_list
+
+    def evaluate(self):
+        """Evaluate every image, showing a progress bar."""
+        for image in tqdm(self.image_list):
+            self._evaluate_one(image)
+
+    @property
+    def accuracy(self):
+        """Overall share of correct comparisons; ``0.0`` when nothing was compared."""
+        correct = sum(self.correct.values())
+        total = correct + sum(self.error.values())
+        return correct / total if total else 0.0
+
+    @property
+    def attrs_accuracy(self):
+        """Per-field share of correct comparisons; ``0.0`` for fields never compared."""
+        result = {}
+        for k in self.field:
+            total = self.correct[k] + self.error[k]
+            result[k] = self.correct[k] / total if total else 0.0
+        return result
+
+
+class MD(object):
+    """Builds the markdown test report through an ``MdUtils`` document.
+
+    Each ``*_table`` attribute is the flat cell list of one table and starts
+    with that table's two header cells.
+    """
+
+    def __init__(self, file_path: Path):
+        self.name = file_path.name
+        self.f = MdUtils(file_name=str(file_path))
+        self.field_table: List = ['字段', '正确率']
+        self.true_table: List = ['图片', '识别结果']
+        self.false_table: List = ['图片', '识别结果']
+        self.write_header(f'{self.name}测试报告')
+
+    def write_header(self, title, level=1):
+        """Add a header of the given level to the document."""
+        self.f.new_header(level=level, title=title)
+
+    def write_total_accuracy(self, ds: Dataset):
+        """Add a paragraph with the overall accuracy as a two-decimal percentage."""
+        self.f.new_paragraph(f'{ds.accuracy * 100:.2f}%')
+
+    def write_table_accuracy(self, ds: Dataset, columns=2, text_align='center'):
+        """Add the per-field accuracy table (field name, percentage)."""
+        for field_name, ratio in ds.attrs_accuracy.items():
+            self.field_table += [field_name, f'{ratio * 100:.2f}%']
+
+        row_count = len(self.field_table) // columns
+        self.f.new_table(columns=columns, rows=row_count, text=self.field_table, text_align=text_align)
+
+    def write_table_result(self, ds: Dataset, columns=2, text_align='center'):
+        """Add the 'True' and 'False' tables listing every image with its result."""
+        for image in ds.image_list:
+            # The link is relative: "<image folder name>/<image file name>".
+            md_image = self.f.new_inline_image(text='', path=f'{image.path.parent.name}/{image.path.name}')
+            target = self.true_table if image.category else self.false_table
+            target.extend([md_image, image.ocr_result])
+
+        self.write_header('True')
+        self.f.new_table(columns=columns, rows=len(self.true_table) // columns,
+                         text=self.true_table, text_align=text_align)
+        self.write_header('False')
+        # The failure table is always left-aligned, regardless of text_align.
+        self.f.new_table(columns=columns, rows=len(self.false_table) // columns,
+                         text=self.false_table, text_align='left')
+
+# if __name__ == '__main__':
+#     markdown = MD('英语等级证书')
+#
+#     dataset = Dataset(Path(''), 'cet', 'local', False)
+#     print(len(dataset))
+#     for d in dataset():
+#         print(d)
+#
+#     dataset.evaluate()
+#     print(dataset.accuracy)
+#
+#     markdown.write_total_accuracy(dataset)
+#     markdown.write_table_accuracy(dataset)
+#     markdown.write_table_result(dataset)
+#
+#     markdown.f.create_md_file()

+ 109 - 0
HR_OCR/test_script/to_md/ocr_config.py

@@ -0,0 +1,109 @@
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class Type:
+    """Pairs an image type number with a list of fields.
+
+    NOTE(review): not referenced by any other code visible in this commit.
+    """
+    image_type: int
+    image_field: List
+
+
+@dataclass
+class RequestConfig:
+    """Where and how to reach one OCR endpoint."""
+    # Full URL the image is POSTed to.
+    url: str
+    # Value for the Authorization header; empty for the local endpoints below.
+    token: str
+
+
+@dataclass
+class Configs:
+    """Bundles a ``RequestConfig`` with a ``Type``.
+
+    NOTE(review): not referenced by any other code visible in this commit.
+    """
+    request: RequestConfig
+    type: Type
+
+
+# Endpoint tables: one RequestConfig per OCR capability and environment.
+# Environment keys, per the README: 'local' = local, 'TXtest'/'TXsb' = Tencent
+# Cloud test/production, 'DXtest'/'DXsb' = Telecom Cloud test/production.
+# NOTE(review): access tokens are committed in plain text here; consider
+# loading them from the environment or an untracked file.
+
+# cet
+cet_local_config = RequestConfig(url='http://192.168.199.27:18050/ocr_system/cet', token='')
+# NOTE(review): this URL contains '//' before 'cettest'; the blfe TXtest URL
+# below does not. Confirm whether that is intended.
+cet_TXtest_config = RequestConfig(
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm//cettest/cet',
+    token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
+cet_TXsb_config = RequestConfig(
+    url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/cet/cet',
+    token='dcae8cc6-0e49-4db8-a2d2-94ef84da3636')
+cet_DXtest_config = RequestConfig(
+    url='http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/cettest/cet',
+    token='4e00c444-620b-4d3c-85f4-777e64276f0e')
+cet_DXsb_config = RequestConfig(
+    url='http://aihub-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/cet/cet',
+    token='e045de0a-e97f-4f23-b4d5-6a032c39a81e')
+
+CET_CONFIGS = {
+    'local': cet_local_config,
+    'TXtest': cet_TXtest_config,
+    'TXsb': cet_TXsb_config,
+    'DXtest': cet_DXtest_config,
+    'DXsb': cet_DXsb_config
+}
+
+# regbook
+regbook_local_config = RequestConfig(url='http://192.168.199.27:18040/ocr_system/regbook', token='')
+# NOTE(review): this URL also contains '//' (before 'hkbsbtest').
+regbook_TXtest_config = RequestConfig(
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm//hkbsbtest/regbook',
+    token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
+regbook_TXsb_config = RequestConfig(
+    url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/hkbsb/regbook',
+    token='dcae8cc6-0e49-4db8-a2d2-94ef84da3636')
+regbook_DXtest_config = RequestConfig(
+    url='http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/hkbsbtest/regbook',
+    token='4e00c444-620b-4d3c-85f4-777e64276f0e')
+regbook_DXsb_config = RequestConfig(
+    url='http://aihub-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/hkbsb/regbook',
+    token='e045de0a-e97f-4f23-b4d5-6a032c39a81e')
+
+REGBOOK_CONFIGS = {
+    'local': regbook_local_config,
+    'TXtest': regbook_TXtest_config,
+    'TXsb': regbook_TXsb_config,
+    'DXtest': regbook_DXtest_config,
+    'DXsb': regbook_DXsb_config
+}
+
+# business_license
+blfe_local_config = RequestConfig(url='http://192.168.199.27:18060/ocr_system/business_license', token='')
+blfe_TXtest_config = RequestConfig(
+    url='http://aihubtest.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/blfetest/blfe',
+    token='8ae1e5f1-1337-4f22-8d46-ff4c110d68fd')
+blfe_TXsb_config = RequestConfig(
+    url='http://aihub.digitalyili.com/aiSquare/openApi/reasoning-services/rlocrxm/blfe/blfe',
+    token='dcae8cc6-0e49-4db8-a2d2-94ef84da3636')
+blfe_DXtest_config = RequestConfig(
+    url='http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/blfetest/blfe',
+    token='4e00c444-620b-4d3c-85f4-777e64276f0e')
+blfe_DXsb_config = RequestConfig(
+    url='http://aihub-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr/blfe/blfe',
+    token='e045de0a-e97f-4f23-b4d5-6a032c39a81e')
+
+BLFE_CONFIGS = {
+    'local': blfe_local_config,
+    'TXtest': blfe_TXtest_config,
+    'TXsb': blfe_TXsb_config,
+    'DXtest': blfe_DXtest_config,
+    'DXsb': blfe_DXsb_config
+}
+
+# Lookup used by callers: OCR_CONFIGS[ocr_name][ocr_address] -> RequestConfig.
+OCR_CONFIGS = {
+    'cet': CET_CONFIGS,
+    'regbook': REGBOOK_CONFIGS,
+    'business_license': BLFE_CONFIGS
+}
+
+# Fields: for each OCR capability, the result keys that are compared against
+# the ground-truth annotation.
+cet_field = ['orientation', 'name', 'id', 'language', 'level', 'exam_time', 'score']
+regbook_field = ['orientation', 'name', 'id', 'gender', 'birthplace', 'birthplace_province', 'birthplace_city',
+                 'birthplace_region', 'native_place', 'native_place_province', 'native_place_city',
+                 'native_place_region', 'blood_type', 'religion']
+business_license = ['orientation', 'social_code', 'company_name', 'legal_person', 'registered_capital', 'type',
+                    'start_date', 'business_scope', 'expire_date', 'address', 'stamp']
+# NOTE(review): "Filed" looks like a misspelling of "Field"; the name is
+# imported by new.py, so renaming it requires updating that import as well.
+Filed = {
+    'cet': cet_field,
+    'regbook': regbook_field,
+    'business_license': business_license
+}

+ 49 - 0
HR_OCR/test_script/to_md/use.py

@@ -0,0 +1,49 @@
+'''
+Author: zeke-chin zeke-chin@icloud.com
+Date: 2022-09-28 20:28:41
+LastEditors: zeke-chin zeke-chin@icloud.com
+LastEditTime: 2022-09-30 15:08:48
+FilePath: /to_md/HR_OCR/to_md/use.py
+Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
+'''
+
+from pathlib import Path
+
+from new import MD, Image, Dataset, parser_path
+
+# config
+# Folder that is searched recursively for the *.jpg test images.
+image_path = Path('/Users/zeke/work/sx/OCR/image_data/户口本9.30/0/img/')
+# Sent to the service as ``image_type``.
+image_type = 0
+# Whether to additionally test rotated copies of every image.
+image_rotate = False
+# Environment key (second-level key of OCR_CONFIGS).
+ocr_address = 'DXtest'  # 'local' 'TXtest' 'TXsb' 'DXtest' 'DXsb'
+
+# OCR capability key (first-level key of OCR_CONFIGS).
+# NOTE(review): ocr_config.py in this commit only defines 'cet', 'regbook' and
+# 'business_license'; the other names listed below would raise KeyError.
+ocr_name = 'cet'  # 'cet' 'idcard' 'bankcard' 'regbook' 'schoolcert' 'business_license'
+# Prefix of the generated markdown file name.
+md_name = 'CET'
+# Key into ocr_config.Filed selecting the fields to compare.
+filed = 'cet'
+
+# Where the report is written. With None on the left, the ``or`` falls back to
+# the parent folder of the image folder.
+# md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+md_path = None or image_path.parent
+
+# Report path: <md_path>/<MM-DD_><md_name><image folder name>, as built by parser_path.
+md_file = parser_path(Path(md_path) / Path(md_name + image_path.stem), image_rotate)
+
+
+if __name__ == '__main__':
+    markdown = MD(md_file)
+
+    dataset = Dataset(image_path, image_type, ocr_name, ocr_address, filed, image_rotate)
+    # Print the number of samples and each sample before calling the service.
+    print(len(dataset))
+    for d in dataset():
+        print(d)
+
+    # Send every sample to the OCR service and collect the counters.
+    dataset.evaluate()
+    print(dataset.accuracy)
+
+    # Report layout: overall accuracy, per-field table, then per-image tables.
+    markdown.write_total_accuracy(dataset)
+    markdown.write_table_accuracy(dataset)
+    markdown.write_table_result(dataset)
+
+    print(md_file)
+    markdown.f.create_md_file()

+ 34 - 0
HR_OCR/test_script/tools/README.md

@@ -0,0 +1,34 @@
+# 人力OCR
+
+## convert_json.py
+
+```python
+# 项目url
+url = 'http://192.168.199.27:18040'
+# 目标文件夹
+imgs_path = './HR_OCR/to_md/example/img'
+
+def send_request(img_path, image_type = 0):
+    with open(img_path, 'rb') as f:
+        img_str: str = base64.encodebytes(f.read()).decode('utf-8')
+        data = {
+            'image': img_str,
+            'image_type': image_type
+        }
+        idc_header = {
+            'Content-Type': 'application/json',
+            'Authorization': 'Bearer 4e00c444-620b-4d3c-85f4-777e64276f0e'
+        }
+        r = requests.post(f'{url}/cettest/cet', json=data, headers=idc_header)
+        # r = requests.post(f'{url}/hkbsbtest/regbook', json=data)
+        print(r.json())
+        return r.json()
+```
+
+## suffix.py
+
+```python
+# 需要格式化的目的文件夹路径
+target_path = './HR_OCR/to_md/example/img'
+```
+

+ 53 - 0
HR_OCR/test_script/tools/convert_json.py

@@ -0,0 +1,53 @@
+from pathlib import Path
+
+import requests
+import json
+import base64
+from itertools import chain
+
+# Base URL of the OCR service; send_request appends the endpoint path to it.
+url = 'http://aihubpre-idc.digitalyili.com/aiSquare/openApi/reasoning-services/hrocr'
+# Folder searched recursively for images (relative to the working directory).
+imgs_path = './HR_OCR/to_md/example/img'
+
+def send_request(img_path, image_type = 0, timeout=60):
+    """POST one image to ``{url}/cettest/cet`` and return the decoded JSON reply.
+
+    Args:
+        img_path: Path of the image file to upload.
+        image_type: Sent in the request body as ``image_type``.
+        timeout: Seconds passed to ``requests.post`` so a stalled endpoint
+            cannot block the script forever.
+
+    Returns:
+        The response body parsed as JSON (it is also printed).
+    """
+    # Only the read needs the file; close it before the network call.
+    with open(img_path, 'rb') as f:
+        img_str: str = base64.encodebytes(f.read()).decode('utf-8')
+    data = {
+        'image': img_str,
+        'image_type': image_type
+    }
+    idc_header = {
+        'Content-Type': 'application/json',
+        'Authorization': 'Bearer 4e00c444-620b-4d3c-85f4-777e64276f0e'
+    }
+    r = requests.post(f'{url}/cettest/cet', json=data, headers=idc_header, timeout=timeout)
+    # r = requests.post(f'{url}/hkbsbtest/regbook', json=data)
+    # Decode the body once and reuse it for both the log line and the return value.
+    result = r.json()
+    print(result)
+    return result
+
+
+def _parse_result(r):
+    """Flatten an OCR response into ``{field: value}``.
+
+    For status ``'000'`` the ``result`` mapping is returned without its
+    ``confidence`` entry, and every field given as a dict is replaced by its
+    ``'text'`` value; a missing/empty result yields ``{}``. For status
+    ``'101'`` the string ``"101"`` is returned, for any other status ``None``.
+    The response passed in is not modified.
+    """
+    status = r['status']
+    if status == '000':
+        # ``or {}`` covers a null result; skipping the key (instead of ``del``)
+        # tolerates responses without 'confidence' and leaves the input intact.
+        result = r['result'] or {}
+        return {k: v['text'] if isinstance(v, dict) else v
+                for k, v in result.items() if k != 'confidence'}
+    if status == '101':
+        return "101"
+    return None
+
+if __name__ == '__main__':
+
+    # For every *.jpg under imgs_path, query the OCR service and store the
+    # flattened result next to the image as <stem>.json.
+    # img_paths = chain(*[Path(root / imgs_path).rglob(f'*.{ext}') for ext in ['jpeg', 'jpg', 'png', 'JPG', 'PNG']])
+    for img_path in Path(imgs_path).rglob('*.jpg'):
+        print(img_path)
+        res = _parse_result(send_request(img_path))
+        print(res)
+
+        json_file = img_path.parent / f'{img_path.stem}.json'
+        # ensure_ascii=False keeps non-ASCII text readable in the file.
+        with json_file.open('w', encoding='utf-8') as f:
+            json.dump(res, f, ensure_ascii=False, indent=4)

+ 28 - 0
HR_OCR/test_script/tools/suffix.py

@@ -0,0 +1,28 @@
+import os
+from pathlib import Path
+from itertools import chain
+import sys
+
+# conf
+# Folder whose images (searched recursively) are renamed.
+target_path = './HR_OCR/to_md/example/img'
+#suffix = sys.argv[2]
+# Extension given to the renamed files: 'jpg' unless the script is started with
+# exactly two command-line arguments, in which case the second one is used.
+# NOTE(review): the first argument (sys.argv[1]) is never read.
+suffix = 'jpg' if len(sys.argv) != 3 else sys.argv[2]
+
+def get_range(n):
+    """Yield "1" .. "n" as strings, zero-padded to the number of digits of ``n``."""
+    width = len(str(n))
+    for value in range(1, n + 1):
+        yield str(value).zfill(width)
+
+
+# Collect every image under target_path. dict.fromkeys drops duplicates while
+# keeping discovery order (the same file can match both 'jpg' and 'JPG' on
+# platforms where glob matching is case-insensitive).
+file_paths = list(dict.fromkeys(
+    chain(*[Path(target_path).rglob(f'*.{ext}') for ext in ['jpeg', 'jpg', 'png', 'JPG', 'PNG']])))
+print(len(file_paths))
+num = len(file_paths)
+file_name_list = list(get_range(num))
+
+# Rename in two passes. Path.rename() silently replaces an existing target on
+# Unix, so renaming straight to "<n>_img.<suffix>" could overwrite an image
+# that is still waiting for its own rename (e.g. when the script is re-run on
+# an already numbered folder). Pass 1 parks every file under a temporary name.
+staged = []
+for file, number in zip(file_paths, file_name_list):
+    print(file)
+    parked = file.parent / f'.{number}_img.{suffix}.renaming'
+    file.rename(parked)
+    staged.append(parked)
+
+# Pass 2: every source has been moved away, so the final names are free.
+for parked, number in zip(staged, file_name_list):
+    new = parked.parent / f'{number}_img.{suffix}'
+    parked.rename(new)

+ 46 - 0
HR_OCR/to_md/README.md

@@ -0,0 +1,46 @@
+# 人力OCR
+
+## 生成markdown测试报告脚本
+
+1. 生成**目的文件夹**下图片的**json文件**
+
+- **目的文件夹**: 
+  - 存放所需测试**图片文件夹**
+  - 对**文件夹**内图片进行标准化
+    - 运行`suffix.py`脚本
+    - 产生*.jpg
+  - 生成<u>算法推理json文件</u>
+    - 运行`convert_json.py`脚本
+    - 产生对应jpg 文件的json文件
+  - 修改<u>算法推理json文件</u>成**正确的json文件**
+
+2. 跑生成md报告脚本
+
+- 修改use.py 并运行
+
+  ```python
+  # config
+  
+  # 目的文件夹
+  image_path = Path('/Users/zeke/work/sx/OCR/image_data/户口本9.30/0/img/')
+  # 图片type(如果接口不存在传0不影响结果)
+  image_type = 0
+  # 是否旋转
+  image_rotate = False
+  
+  # ocr地址选择
+  # 本地环境、腾讯云测试环境、腾讯云生产环境、电信云测试环境、电信云生产环境
+  ocr_address = 'local'  # 'local' 'TXtest' 'TXsb' 'DXtest' 'DXsb'
+  # ocr能力选择
+  ocr_name = 'regbook'  # 'cet' 'idcard' 'bankcard' 'regbook' 'schoolcert' 'business_license'
+  # 生成的MD文件名
+  md_name = 'RegBook'
+  # ocr能力对应的字段列表(ocr_config.py 中 Filed 的键)
+  filed = 'regbook'
+  
+  # 若md_path为None 则默认使用图片父路径为markdown保存路径
+  # md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent
+  md_path = None or image_path.parent
+  ```
+
+  

BIN
HR_OCR/to_md/example/img/1_img.jpg


+ 9 - 0
HR_OCR/to_md/example/img/1_img.json

@@ -0,0 +1,9 @@
+{
+    "orientation": 0,
+    "name": "鉴康",
+    "id": "152801200003178527",
+    "language": "英语",
+    "level": "CET4",
+    "exam_time": "2021年6月",
+    "score": "451"
+}

BIN
HR_OCR/to_md/example/img/2_img.jpg


+ 9 - 0
HR_OCR/to_md/example/img/2_img.json

@@ -0,0 +1,9 @@
+{
+    "orientation": 0,
+    "name": "张鑫",
+    "id": "140227199809282317",
+    "language": "英语",
+    "level": "CET4",
+    "exam_time": "2021年6月",
+    "score": "445"
+}

+ 6 - 5
HR_OCR/to_md/ocr_config.py

@@ -1,4 +1,3 @@
-
 from dataclasses import dataclass
 from typing import List
 
@@ -14,6 +13,7 @@ class RequestConfig:
     url: str
     token: str
 
+
 @dataclass
 class Configs:
     request: RequestConfig
@@ -98,11 +98,12 @@ OCR_CONFIGS = {
 # 字段
 cet_field = ['orientation', 'name', 'id', 'language', 'level', 'exam_time', 'score']
 regbook_field = ['orientation', 'name', 'id', 'gender', 'birthplace', 'birthplace_province', 'birthplace_city',
-                          'birthplace_region', 'native_place', 'native_place_province', 'native_place_city',
-                          'native_place_region', 'blood_type', 'religion']
-business_license = ['orientation', 'social_code', 'company_name', 'legal_person', 'registered_capital', 'type', 'start_date', 'business_scope', 'expire_date', 'address', 'stamp']
+                 'birthplace_region', 'native_place', 'native_place_province', 'native_place_city',
+                 'native_place_region', 'blood_type', 'religion']
+business_license = ['orientation', 'social_code', 'company_name', 'legal_person', 'registered_capital', 'type',
+                    'start_date', 'business_scope', 'expire_date', 'address', 'stamp']
 Filed = {
     'cet': cet_field,
     'regbook': regbook_field,
     'business_license': business_license
-}
+}

+ 14 - 4
HR_OCR/to_md/use.py

@@ -1,3 +1,11 @@
+'''
+Author: zeke-chin zeke-chin@icloud.com
+Date: 2022-09-28 20:28:41
+LastEditors: zeke-chin zeke-chin@icloud.com
+LastEditTime: 2022-09-30 15:08:48
+FilePath: /to_md/HR_OCR/to_md/use.py
+Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
+'''
 
 from pathlib import Path
 
@@ -5,15 +13,17 @@ from new import MD, Image, Dataset, parser_path
 
 # config
 # 图片路径
+
 image_path = Path('/Users/sxkj/Desktop/9.29/1/img')
 image_type = 1
+
 # 是否旋转
 image_rotate = False
-ocr_address = 'local'  # 'local' 'TXtest' 'TXsb''DXtest' 'DXsb'
+ocr_address = 'DXtest'  # 'local' 'TXtest' 'TXsb' 'DXtest' 'DXsb'
 
-ocr_name = 'regbook'  # 'cet' 'idcard' 'bankcard' 'regbook' 'schoolcert''business_license'
-md_name = 'RegBook'
-filed = 'regbook'
+ocr_name = 'cet'  # 'cet' 'idcard' 'bankcard' 'regbook' 'schoolcert' 'business_license'
+md_name = 'CET'
+filed = 'cet'
 
 # 若md_path为None 则默认使用图片父路径为markdown保存路径
 # md_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/example' or image_path.parent

+ 34 - 0
HR_OCR/tools/README.md

@@ -0,0 +1,34 @@
+# 人力OCR
+
+## convert_json.py
+
+```python
+# 项目url
+url = 'http://192.168.199.27:18040'
+# 目标文件夹
+imgs_path = './HR_OCR/to_md/example/img'
+
+def send_request(img_path, image_type = 0):
+    with open(img_path, 'rb') as f:
+        img_str: str = base64.encodebytes(f.read()).decode('utf-8')
+        data = {
+            'image': img_str,
+            'image_type': image_type
+        }
+        idc_header = {
+            'Content-Type': 'application/json',
+            'Authorization': 'Bearer 4e00c444-620b-4d3c-85f4-777e64276f0e'
+        }
+        r = requests.post(f'{url}/cettest/cet', json=data, headers=idc_header)
+        # r = requests.post(f'{url}/hkbsbtest/regbook', json=data)
+        print(r.json())
+        return r.json()
+```
+
+## suffix.py
+
+```python
+# 需要格式化的目的文件夹路径
+target_path = './HR_OCR/to_md/example/img'
+```
+

+ 7 - 8
HR_OCR/tools/convert_json.py

@@ -14,11 +14,12 @@ def send_request(img_path):
         data = {
             'image': img_str
         }
-        # idc_header = {
-        #     'Content-Type': 'application/json',
-        #     'Authorization': 'Bearer 4e00c444-620b-4d3c-85f4-777e64276f0e'
-        # }
-        r = requests.post(f'{url}/ocr_system/regbook', json=data)
+        idc_header = {
+            'Content-Type': 'application/json',
+            'Authorization': 'Bearer 4e00c444-620b-4d3c-85f4-777e64276f0e'
+        }
+        r = requests.post(f'{url}/cettest/cet', json=data, headers=idc_header)
+        # r = requests.post(f'{url}/hkbsbtest/regbook', json=data)
         print(r.json())
         return r.json()
 
@@ -36,10 +37,8 @@ def _parse_result(r):
 if __name__ == '__main__':
 
     # 0
-    root = Path(__file__).parent
-    print(root)
     # img_paths = chain(*[Path(root / imgs_path).rglob(f'*.{ext}') for ext in ['jpeg', 'jpg', 'png', 'JPG', 'PNG']])
-    img_paths = chain(*[Path(root / imgs_path).rglob(f'*.{ext}') for ext in ['jpg']])
+    img_paths = chain(*[Path(imgs_path).rglob(f'*.{ext}') for ext in ['jpg']])
     for img_path in img_paths:
         print(img_path)
         r = send_request(img_path)

+ 5 - 0
HR_OCR/tools/suffix.py

@@ -4,7 +4,11 @@ from itertools import chain
 import sys
 
 # conf
+<<<<<<< HEAD
 target_path = '/Users/sxkj/to_md/9.29/1'
+=======
+target_path = './HR_OCR/to_md/example/img'
+>>>>>>> a0b1c051142a2687d8ae2c63f543f021d3239f1f
 #suffix = sys.argv[2]
 suffix = 'jpg' if len(sys.argv) != 3 else sys.argv[2]
 
@@ -18,6 +22,7 @@ def get_range(n):
 
 
 file_paths = list(chain(*[Path(target_path).rglob(f'*.{ext}') for ext in ['jpeg', 'jpg', 'png', 'JPG', 'PNG']]))
+print(len(file_paths))
 num = len(file_paths)
 file_name_list = list(get_range(num))
 for i in range(num):

+ 25 - 0
YQ_OCR/img/03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版.json

@@ -0,0 +1,25 @@
+{
+    "productCategory": "产品种类:调制豆乳",
+    "ingredients": "配料:饮用水、大豆(非转基因)、白砂糖",
+    "proStanCode": "产品标准代号:GB/T30885",
+    "productionDate": "生产日期:见瓶盖",
+    "shelfLife": "保质期:常温密闭条件下9个月",
+    "storageConditions": "贮存条件:请保存于阴凉干燥处,避免阳光直晒、高温",
+    "conSerHotline": "消费者服务热线:4008169999",
+    "tips": "温馨提示:请勿带包装置于微波炉中加热。",
+    "welcome": "欢迎访问:www.yili.com",
+    "noKeyList": [
+        "植选",
+        "浓香豆乳畅饮系列",
+        "大豆添加量:44g/瓶",
+        "原味",
+        "全程非转基因可追溯大豆",
+        "3.0g/100mL",
+        "优质植物蛋白",
+        "保持环境清洁请勿乱抛空瓶",
+        "为保证产品风味,开启后需冷藏并尽快饮用完毕。",
+        "可能会有少量蛋白沉淀和脂肪上浮,属正常现象,请放心饮用。如发现涨瓶,请勿开启。",
+        "净含量:315mL",
+        "6907992515007"
+    ]
+}

+ 33 - 0
YQ_OCR/img/03-植选PET 内包—植选豆乳以团之名形象定制包装周艺轩版.md

@@ -0,0 +1,33 @@
+
+
+
+
+# 测试结果
+
+## 正确率:71.43%
+
+### 共21个字段,正确15个,错误6个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:调制豆乳|产品种类:调制豆乳|✅|
+|ingredients|配料:饮用水、大豆(非转基因)、白砂糖|配料:饮用水、大豆(非转基因)白砂糖大豆添加量:44g/瓶|❌|
+|proStanCode|产品标准代号:GB/T30885|产品标准代号:GB/T30885|✅|
+|productionDate|生产日期:见瓶盖|生产日期:见瓶盖|✅|
+|shelfLife|保质期:常温密闭条件下9个月|保质期:常温密闭条件下9个月|✅|
+|storageConditions|贮存条件:请保存于阴凉干燥处,避免阳光直晒、高温|贮存条件:请保存于阴凉干燥处,避免阳光直晒、高温。|❌|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|tips|温馨提示:请勿带包装置于微波炉中加热。|温馨提示:请勿带包装置于微波炉中加热。|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.ili.com|❌|
+|无key值|植选|植选|✅|
+|无key值|浓香豆乳畅饮系列|浓香豆乳畅饮系列|✅|
+|无key值|大豆添加量:44g/瓶|生产日期:见瓶盖|❌|
+|无key值|原味|原味|✅|
+|无key值|全程非转基因可追溯大豆|全程非转基因可追溯大豆|✅|
+|无key值|3.0g/100mL|3.0g|❌|
+|无key值|优质植物蛋白|优质植物蛋白|✅|
+|无key值|保持环境清洁请勿乱抛空瓶|保持环境清洁请勿乱抛空瓶|✅|
+|无key值|为保证产品风味,开启后需冷藏并尽快饮用完毕。|为保证产品风味,开启后需冷藏并尽快饮用完毕。|✅|
+|无key值|可能会有少量蛋白沉淀和脂肪上浮,属正常现象,请放心饮用。如发现涨瓶,请勿开启。|可能会有少量蛋白沉淀和脂肪上浮属正常现象,请放心饮用。如发现胀瓶,请勿开启。|❌|
+|无key值|净含量:315mL|净含量:315mL|✅|
+|无key值|6907992515007|6907992515007|✅|

+ 23 - 0
YQ_OCR/img/巧克力味牛奶饮品.json

@@ -0,0 +1,23 @@
+{
+    "productCategory": "产品种类:配制型含乳饮料",
+    "ingredients": "配料:生牛乳、饮用水、白砂糖、可可粉、食品添加剂(微晶纤维素、单,双甘油脂肪酸酯、蔗糖脂肪酸酯、柠檬酸钠、结冷胶、安赛蜜、三氯蔗糖、食品用香精)",
+    "proStanCode": "产品标准代号:GB/T21732",
+    "productionDate": "生产日期:见盒顶部",
+    "shelfLife": "保质期:常温密闭条件下6个月",
+    "storageConditions": "贮存条件:未开启前,无需冷藏,开启之后,立即饮用。",
+    "conSerHotline": "消费者服务热线:4008169999",
+    "tips": "友情提示:喝前摇一摇",
+    "welcome": "欢迎访问:www.yili.com",
+    "noKeyList": [
+        "牛奶饮品",
+        "产品名称:巧克力味牛奶饮品",
+        "生产日期:见箱体",
+        "切勿带包装置于微波炉中加热",
+        "清真",
+        "保持环境清洁请勿乱抛空包",
+        "伊利",
+        "(具体生产商/产地见生产日期末端代码)",
+        "净含量:250mL",
+        "6907992500102"
+    ]
+}

+ 30 - 0
YQ_OCR/img/巧克力味牛奶饮品.md

@@ -0,0 +1,30 @@
+
+
+
+
+# 测试结果
+
+## 正确率:66.67%
+
+### 共18个字段,正确12个,错误6个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:配制型含乳饮料|产品种类:配制型含乳饮料|✅|
+|ingredients|配料:生牛乳、饮用水、白砂糖、可可粉、食品添加剂(微晶纤维素、单,双甘油脂肪酸酯、蔗糖脂肪酸酯、柠檬酸钠、结冷胶、安赛蜜、三氯蔗糖、食品用香精)|配料:生牛乳、饮用水、白砂糖可可粉、食品添加剂(微晶纤维素、单,双甘油脂肪酸酯、蔗糖脂肪酸酯柠檬酸钠、结冷胶、安赛蜜、三氯蔗糖、食品用香精)|❌|
+|proStanCode|产品标准代号:GB/T21732|产品标准代号:GB/T21732|✅|
+|productionDate|生产日期:见盒顶部|生产日期:见盒顶部|✅|
+|storageConditions|贮存条件:未开启前,无需冷藏,开启之后,立即饮用。|贮存条件:未开启前无需冷藏开启之后 立即饮用|❌|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|tips|友情提示:喝前摇一摇|友情提示:喝前摇一摇|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yli.com|❌|
+|无key值|牛奶饮品|牛奶饮品|✅|
+|无key值|产品名称:巧克力味牛奶饮品|产品名称:巧克力味牛奶饮品|✅|
+|无key值|生产日期:见箱体|生产日期:见盒顶部|❌|
+|无key值|切勿带包装置于微波炉中加热|切勿带包装置于微波炉中加热|✅|
+|无key值|清真|清真|✅|
+|无key值|保持环境清洁请勿乱抛空包|保持环境清洁请勿乱抛空包|✅|
+|无key值|伊利|伊利|✅|
+|无key值|(具体生产商/产地见生产日期末端代码)|(具体生产商/产地见生产日期末端代码)|❌|
+|无key值|净含量:250mL|净含量:250mL|❌|
+|无key值|6907992500102|6907992500102|✅|

+ 23 - 0
YQ_OCR/img/餐饮纯牛奶 内包.json

@@ -0,0 +1,23 @@
+{
+    "productCategory": "产品种类:全脂灭菌纯牛乳",
+    "ingredients": "配料:生牛乳",
+    "proStanCode": "产品标准代号:GB25190",
+    "productionDate": "生产日期:见盒顶部",
+    "shelfLife": "保质期:常温密闭条件下6个月",
+    "storageConditions": "贮存条件:未开启前无需冷藏开启之后请贮存于2-6℃并于2日内饮用完毕",
+    "conSerHotline": "消费者服务热线:4008169999",
+    "welcome": "欢迎访问:www.yili.com",
+    "noKeyList": [
+        "纯牛奶",
+        "餐饮之选",
+        "非脂乳固体≥8.5%",
+        "保持环境清洁请勿乱抛空包",
+        "切勿带包装置于微波炉中加热。",
+        "净含量:1L",
+        "6907992513621",
+        "内蒙古伊利实业集团股份有限公司出品 地址:内蒙古自治区呼和浩特市金山开发区金山大街1号",
+        "宁夏伊利乳业有限责任公司(A12) 产地及地址:宁夏吴忠市利通区金积工业园区 食品生产许可证编号:SC10564030200130",
+        "阜新伊利乳品有限责任公司(B6) 产地及地址:辽宁省阜新市阜蒙县园区路2号 食品生产许可证编号:SC10521090000011",
+        "定州伊利乳业有限责任公司(C1) 产地及地址:河北省定州市伊利工业园区 食品生产许可证编号:SC10613068200020"
+    ]
+}

+ 31 - 0
YQ_OCR/img/餐饮纯牛奶 内包.md

@@ -0,0 +1,31 @@
+
+
+
+
+# 测试结果
+
+## 正确率:94.74%
+
+### 共19个字段,正确18个,错误1个
+
+|key值|正确答案|ocr返回结果|是否正确|
+| :---: | :---: | :---: | :---: |
+|productCategory|产品种类:全脂灭菌纯牛乳|产品种类:全脂灭菌纯牛乳|✅|
+|ingredients|配料:生牛乳|配料:生牛乳|✅|
+|proStanCode|产品标准代号:GB25190|产品标准代号:GB25190|✅|
+|productionDate|生产日期:见盒顶部|生产日期:见盒顶部|✅|
+|shelfLife|保质期:常温密闭条件下6个月|保质期:常温密闭条件下6个月|✅|
+|storageConditions|贮存条件:未开启前无需冷藏开启之后请贮存于2-6℃并于2日内饮用完毕|贮存条件:未开启前无需冷藏开启之后请贮存于2-6℃并于2日内饮用完毕|✅|
+|conSerHotline|消费者服务热线:4008169999|消费者服务热线:4008169999|✅|
+|welcome|欢迎访问:www.yili.com|欢迎访问:www.yili.com|✅|
+|无key值|纯牛奶|纯牛奶|✅|
+|无key值|餐饮之选|餐饮之选|✅|
+|无key值|非脂乳固体≥8.5%|非脂乳固体≥8.5%|✅|
+|无key值|保持环境清洁请勿乱抛空包|保持环境清洁请勿乱抛空包|✅|
+|无key值|切勿带包装置于微波炉中加热。|切勿带包装置于微波炉中加热|❌|
+|无key值|净含量:1L|净含量:1L|✅|
+|无key值|6907992513621|6907992513621|✅|
+|无key值|内蒙古伊利实业集团股份有限公司出品 地址:内蒙古自治区呼和浩特市金山开发区金山大街1号|内蒙古伊利实业集团股份有限公司出品地址:内蒙古自治区呼和浩特市金山开发区金山大街1号|✅|
+|无key值|宁夏伊利乳业有限责任公司(A12) 产地及地址:宁夏吴忠市利通区金积工业园区 食品生产许可证编号:SC10564030200130|宁夏伊利乳业有限责任公司(A12)产地及地址:宁夏吴忠市利通区金积工业园区食品生产许可证编号:SC10564030200130|✅|
+|无key值|阜新伊利乳品有限责任公司(B6) 产地及地址:辽宁省阜新市阜蒙县园区路2号 食品生产许可证编号:SC10521090000011|阜新伊利乳品有限责任公司(B6)产地及地址:辽宁省阜新市阜蒙县园区路2号食品生产许可证编号:SC10521090000011|✅|
+|无key值|定州伊利乳业有限责任公司(C1) 产地及地址:河北省定州市伊利工业园区 食品生产许可证编号:SC10613068200020|定州伊利乳业有限责任公司(C1)产地及地址:河北省定州市伊利工业园区食品生产许可证编号:SC10613068200020|✅|