class Dataset(object):
    """Pair one prediction result with a ground-truth jsonlines file.

    Attributes are plain and public:
        gt_file:  path handed to ``jsonlines.open`` when ground truth is read.
        img_name: stored as given; no method of this class reads it.
        results:  prediction mapping; ``results['tableList'][0]`` is used.
        pre_list: cache filled by ``get_pre_list`` (starts empty).
        gt_list:  cache filled by ``get_gt_list`` (starts empty).
    """

    def __init__(self, gt_file, img_name, results):
        self.gt_file = gt_file
        self.img_name = img_name
        self.results = results
        # Both caches stay empty until the matching getter is called.
        self.pre_list = []
        self.gt_list = []

    def __len__(self):
        """Return ``[len(pre_list), len(gt_list)]``.

        The result is a list, not an int, so the built-in ``len()`` raises
        ``TypeError`` on an instance; call ``__len__()`` directly instead.
        """
        sizes = [len(self.pre_list), len(self.gt_list)]
        return sizes

    def get_pre_list(self):
        """Parse the first predicted table, cache it in ``pre_list``, return it."""
        self.pre_list = parse_pre_str(self.results['tableList'][0])
        return self.pre_list

    def get_pre_structure(self):
        """Return the first predicted table converted by ``html2text``."""
        first_table = self.results['tableList'][0]
        return html2text.html2text(first_table)

    def get_gt_list(self):
        """Parse the ``'gt'`` field of the ground-truth records into ``gt_list``.

        Every record in the file is parsed in turn and each result overwrites
        the previous one, so the value left in ``gt_list`` comes from the last
        record. An empty file leaves ``gt_list`` unchanged.
        """
        # NOTE(review): the original formatting of this method was lost; the
        # return is assumed to follow the loop rather than sit inside it --
        # confirm against the upstream file.
        with jsonlines.open(self.gt_file, 'r') as reader:
            for record in reader:
                self.gt_list = parse_gt_str(record['gt'])
        return self.gt_list

    def get_gt_structure(self):
        """Return the first record's ``'gt'`` field converted by ``html2text``.

        When the file yields no record at all, a fixed error string is
        returned instead of raising.
        """
        with jsonlines.open(self.gt_file, 'r') as reader:
            for record in reader:
                # Only the first record is ever used.
                return html2text.html2text(record['gt'])
        return 'Error:并未找到需要该图片的标注信息!'