"""Convert the ground-truth xlsx files under ``excels_path`` to JSON files."""
import json
import re
from itertools import chain
from pathlib import Path

import pandas as pd

from YQ_OCR.config import keyDict

# Root directory that is searched recursively for ``*.xlsx`` files.
excels_path = '/Users/sxkj/to_md/YQ_OCR/img'


def get_xlsx_str_list(xlsx_path):
    """Return every string of the workbook that has to be recognised.

    Reads the ``Common`` and ``Packing`` sheets of the workbook.

    Args:
        xlsx_path: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        A list holding each value of the ``内容`` column of the ``Common``
        sheet with newlines and spaces stripped, followed by the first and
        third cells of the first data row of the ``Packing`` sheet, both
        converted with ``str``.

    Raises:
        AttributeError: If a ``内容`` cell is not a string (for example an
            empty cell), because ``.replace`` is called on it directly.
        IndexError: If the ``Packing`` sheet has no data row or fewer than
            three columns.
    """
    common_sheet = pd.read_excel(xlsx_path, sheet_name="Common")
    packing_sheet = pd.read_excel(xlsx_path, sheet_name="Packing")

    str_list = [
        text.replace('\n', '').replace(' ', '')
        for text in common_sheet['内容'].values
    ]

    # Only the first data row of the Packing sheet is used.
    first_packing_row = packing_sheet.values.tolist()[0]
    str_list.append(str(first_packing_row[0]))
    str_list.append(str(first_packing_row[2]))
    return str_list


def xlsx_list_2_dict(xlsx_list):
    """Turn the list of strings into a dict keyed by the ``keyDict`` names.

    For every ``(key, pattern)`` pair of ``keyDict`` each remaining string is
    tested with ``re.match(pattern, string)``. A matching string is stored
    under ``key`` (the last match wins when several strings match) and is
    taken out of the pool, so it cannot be claimed by a later key.

    Args:
        xlsx_list: Strings to classify. The list is left unmodified.

    Returns:
        A dict with one entry per key that matched at least one string, plus
        ``'noKeyList'`` holding the strings that matched no pattern, in their
        original order.
    """
    # Work on a private copy: removing items from a list while iterating over
    # it skips the element that follows each removal, and mutating the
    # caller's argument is a surprising side effect.
    remaining = list(xlsx_list)
    xlsx_dict = {}
    for key, pattern in keyDict.items():
        unmatched = []
        for text in remaining:
            if re.match(pattern, text):
                xlsx_dict[key] = text
            else:
                unmatched.append(text)
        remaining = unmatched
    xlsx_dict['noKeyList'] = remaining
    return xlsx_dict


def get_true_json(xlsx_path):
    """Parse the workbook at ``xlsx_path`` and return its JSON-ready dict.

    Args:
        xlsx_path: Path of the workbook to parse.

    Returns:
        The dict built by ``xlsx_list_2_dict`` from the strings that
        ``get_xlsx_str_list`` extracts.
    """
    return xlsx_list_2_dict(get_xlsx_str_list(xlsx_path))


def main():
    """Write a ``.json`` file next to every ``.xlsx`` below ``excels_path``."""
    for excel_path in Path(excels_path).rglob('*.xlsx'):
        print(excel_path)
        true_json = get_true_json(excel_path)
        json_path = excel_path.with_suffix('.json')
        with json_path.open('w', encoding='utf-8') as f:
            json.dump(true_json, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()