chenguilong
/
test_script


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
							import json
import re
from itertools import chain
from pathlib import Path

import pandas as pd
from YQ_OCR.config import keyDict

# Convert xlsx workbooks to JSON.


# Root directory that the __main__ block searches recursively for *.xlsx files.
excels_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/YQ_OCR/img'


# Return all the strings in the workbook that need to be recognized
def get_xlsx_str_list(xlsx_path):
    """Collect the strings of interest from one workbook.

    Reads the "Common" and "Packing" sheets of ``xlsx_path``. Every cell of
    the Common sheet's '内容' column is taken with newlines and spaces
    stripped out; the values at positions 0 and 2 of the first parsed row of
    the Packing sheet are then appended, converted with ``str``.

    Args:
        xlsx_path: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        A new list: the cleaned '内容' strings followed by the two Packing
        values.
    """
    common_sheet = pd.read_excel(xlsx_path, sheet_name="Common")
    packing_sheet = pd.read_excel(xlsx_path, sheet_name="Packing")

    # Strip embedded line breaks and spaces from each content cell.
    collected = [
        cell.replace('\n', '').replace(' ', '')
        for cell in list(common_sheet['内容'].values)
    ]

    # Only positions 0 and 2 of the first Packing row are used.
    first_packing_row = packing_sheet.values.tolist()[0]
    collected.extend(str(first_packing_row[position]) for position in (0, 2))
    return collected


# Convert the list into a dict
def xlsx_list_2_dict(xlsx_list, key_patterns=None):
    """Assign strings to keys by regex and collect the leftovers.

    For each ``(key, pattern)`` pair, in mapping order, the first string still
    unassigned for which ``re.match(pattern, string)`` succeeds is stored
    under ``key`` and taken out of the pool, so a string is consumed by at
    most one key. Strings that no key claims are returned, in their original
    order, under ``'noKeyList'``.

    The previous implementation removed items from ``xlsx_list`` while
    iterating over it, which skipped the element following every match and
    made the outcome for several matching strings depend on whether they were
    adjacent (earlier matches could be dropped entirely). Here the work is
    done on a copy and stops at the first match, so every input string ends
    up either under a key or in ``'noKeyList'``, and the caller's list is
    left untouched.

    Args:
        xlsx_list: Strings to classify. Not modified.
        key_patterns: Mapping of output key to regex pattern. Defaults to the
            module-level ``keyDict`` when omitted or ``None``.

    Returns:
        A dict with one entry per key that matched, plus ``'noKeyList'``
        holding the unmatched strings.
    """
    if key_patterns is None:
        key_patterns = keyDict

    # Work on a private copy so the caller's list is never mutated.
    unassigned = list(xlsx_list)
    xlsx_dict = {}
    for key, pattern in key_patterns.items():
        for position, candidate in enumerate(unassigned):
            if re.match(pattern, candidate):
                xlsx_dict[key] = candidate
                # Safe to delete here: we leave the inner loop immediately.
                del unassigned[position]
                break
    xlsx_dict['noKeyList'] = unassigned
    return xlsx_dict


# Parse the workbook and return its JSON-serializable dict
def get_true_json(xlsx_path):
    """Parse one workbook into the dict that is later dumped as JSON.

    Args:
        xlsx_path: Path of the workbook to read.

    Returns:
        The dict produced by ``xlsx_list_2_dict`` from the strings that
        ``get_xlsx_str_list`` extracts out of ``xlsx_path``.
    """
    return xlsx_list_2_dict(get_xlsx_str_list(xlsx_path))


if __name__ == '__main__':
    # Walk the directory tree under excels_path; for every .xlsx found, print
    # its path and write the parsed result beside it with a .json suffix.
    for workbook_path in Path(excels_path).rglob('*.xlsx'):
        print(workbook_path)
        parsed = get_true_json(workbook_path)
        output_path = workbook_path.with_suffix('.json')
        with open(output_path, 'w', encoding='utf-8') as out_file:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            json.dump(parsed, out_file, ensure_ascii=False, indent=4)