1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- import json
- import re
- from itertools import chain
- from pathlib import Path
- import pandas as pd
- from YQ_OCR.config import keyDict
- # Convert xlsx workbooks to JSON.
- excels_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/YQ_OCR/img'  # root directory searched recursively for *.xlsx files by the __main__ block
# Return every string in the workbook that has to be recognised.
def get_xlsx_str_list(xlsx_path):
    """Collect the reference strings stored in one workbook.

    The strings come from two places: the ``内容`` column of the "Common"
    sheet, and the first and third cells of the first data row of the
    "Packing" sheet.

    Args:
        xlsx_path: Location of the workbook; handed unchanged to
            ``pandas.read_excel``.

    Returns:
        list[str]: The "Common" strings with every newline and space
        removed, followed by ``str()`` of the two "Packing" cells.

    Raises:
        KeyError: If the "Common" sheet has no ``内容`` column.
        IndexError: If the "Packing" sheet has no data row, or that row has
            fewer than three cells.
    """
    # One call with a list of sheet names parses the file once and returns
    # a {sheet name: DataFrame} dict.
    sheets = pd.read_excel(xlsx_path, sheet_name=["Common", "Packing"])

    str_list = []
    for cell in sheets["Common"]["内容"].tolist():
        # An empty cell is read as NaN (a float), which has no .replace();
        # it carries no text, so it is skipped instead of crashing.
        if pd.isna(cell):
            continue
        # str() lets numeric cells through the same clean-up as text cells.
        str_list.append(str(cell).replace('\n', '').replace(' ', ''))

    first_packing_row = sheets["Packing"].values.tolist()[0]
    str_list.append(str(first_packing_row[0]))
    str_list.append(str(first_packing_row[2]))
    return str_list
# Convert the list of strings into a dict.
def xlsx_list_2_dict(xlsx_list):
    """Assign strings to the keys of ``keyDict`` by pattern.

    ``keyDict`` maps each output key to a pattern. For every key, the first
    string still present in ``xlsx_list`` that ``re.match`` accepts (the
    match is anchored at the start of the string) is stored under that key
    and removed from the list, so a string is claimed by at most one key.
    Strings claimed by no key are returned under ``'noKeyList'``.

    Args:
        xlsx_list: List of strings to classify. It is modified in place:
            every claimed string is removed from it.

    Returns:
        dict: ``{key: matched string, ..., 'noKeyList': xlsx_list}``. Keys
        whose pattern matched nothing are absent. ``'noKeyList'`` is the
        same list object that was passed in.
    """
    xlsx_dict = {}
    for key, pattern in keyDict.items():
        for index, text in enumerate(xlsx_list):
            if re.match(pattern, text):
                xlsx_dict[key] = text
                # Deleting from the list being iterated shifts the later
                # items left, so the scan must stop here. Carrying on would
                # skip the next item and could overwrite this match,
                # dropping it from the result entirely.
                del xlsx_list[index]
                break
    xlsx_dict['noKeyList'] = xlsx_list
    return xlsx_dict
# Parse a workbook and return its content as a JSON-ready dict.
def get_true_json(xlsx_path):
    """Build the dict for one workbook.

    Args:
        xlsx_path: Location of the workbook, forwarded to
            ``get_xlsx_str_list``.

    Returns:
        dict: Whatever ``xlsx_list_2_dict`` produces for the strings that
        ``get_xlsx_str_list`` extracts from the workbook.
    """
    return xlsx_list_2_dict(get_xlsx_str_list(xlsx_path))
if __name__ == '__main__':
    # Walk the tree under excels_path and write a .json file next to every
    # .xlsx workbook found (same name, different suffix).
    for workbook in Path(excels_path).rglob('*.xlsx'):
        print(workbook)
        payload = get_true_json(workbook)
        target = workbook.with_suffix('.json')
        with target.open('w', encoding='utf-8') as out:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(payload, out, ensure_ascii=False, indent=4)
|