xlsx_convert_json.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. import json
  2. import re
  3. from itertools import chain
  4. from pathlib import Path
  5. import pandas as pd
  6. from YQ_OCR.config import keyDict
# Convert xlsx workbooks to JSON.
# Root directory that the script entry point searches recursively for .xlsx files.
excels_path = '/Users/zeke/work/sx/OCR/HROCR/to_md/YQ_OCR/img'
  9. # 返回文档里所以所需识别字符串
  10. def get_xlsx_str_list(xlsx_path):
  11. workbook_pd_Common = pd.read_excel(xlsx_path, sheet_name="Common")
  12. workbook_pd_Packing = pd.read_excel(xlsx_path, sheet_name="Packing")
  13. Common_list = list(workbook_pd_Common['内容'].values)
  14. for index, text in enumerate(Common_list):
  15. Common_list[index] = text.replace('\n', '').replace(' ', '')
  16. Packing_list = workbook_pd_Packing.values.tolist()[0]
  17. str_list = Common_list
  18. str_list.append(str(Packing_list[0]))
  19. str_list.append(str(Packing_list[2]))
  20. return str_list
  21. # 将list 转化为 dict
  22. def xlsx_list_2_dict(xlsx_list):
  23. xlsx_dict = {}
  24. for k, v in keyDict.items():
  25. for x_str in xlsx_list:
  26. if bool(re.match(v, x_str)):
  27. xlsx_dict[k] = x_str
  28. xlsx_list.remove(x_str)
  29. xlsx_dict['noKeyList'] = xlsx_list
  30. return xlsx_dict
  31. # 解析文档 返回json
  32. def get_true_json(xlsx_path): # sourcery skip: inline-immediately-returned-variable
  33. xlsx_list = get_xlsx_str_list(xlsx_path)
  34. xlsx_dict = xlsx_list_2_dict(xlsx_list)
  35. return xlsx_dict
  36. if __name__ == '__main__':
  37. excel_paths = chain(*[Path(excels_path).rglob('*.xlsx')])
  38. for excel_path in excel_paths:
  39. print(excel_path)
  40. true_json = get_true_json(excel_path)
  41. json_path = Path(excel_path).with_suffix('.json')
  42. with json_path.open('w', encoding='utf-8') as f:
  43. json.dump(true_json, f, ensure_ascii=False, indent=4)