1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
import re

import html2text
import jsonlines
class Dataset(object):
    """Pair one prediction result with a ground-truth jsonlines file.

    The prediction markup is read from ``results['tableList'][0]`` and the
    ground-truth markup from the ``'gt'`` field of the records stored in
    ``gt_file``.
    """

    def __init__(self, gt_file, img_name, results):
        """Store the inputs and start with empty parsed row lists.

        Args:
            gt_file: Path of the jsonlines file opened by the ``get_gt_*``
                methods.
            img_name: Stored as-is; no method of this class reads it.
            results: Mapping whose ``'tableList'`` entry is indexed at 0 to
                obtain the predicted table markup.
        """
        self.gt_file = gt_file
        self.img_name = img_name
        self.results = results
        self.pre_list = []
        self.gt_list = []

    def __len__(self):
        """Return ``[len(self.pre_list), len(self.gt_list)]``.

        NOTE: because this is a list and not an int, the built-in ``len()``
        raises ``TypeError`` on instances. The value is kept unchanged for
        callers that invoke ``__len__()`` directly.
        """
        return [len(self.pre_list), len(self.gt_list)]

    def get_pre_list(self):
        """Parse the first predicted table into row strings and cache it.

        Returns:
            The result of ``parse_pre_str`` on ``results['tableList'][0]``,
            also stored on ``self.pre_list``.
        """
        pre_xml = self.results['tableList'][0]
        self.pre_list = parse_pre_str(pre_xml)
        return self.pre_list

    def get_pre_structure(self):
        """Return the first predicted table converted by ``html2text``."""
        pre_xml = self.results['tableList'][0]
        return html2text.html2text(pre_xml)

    def get_gt_list(self):
        """Parse the ground-truth markup into row strings and cache it.

        Returns:
            ``self.gt_list``; it keeps its previous value when the file
            contains no records.
        """
        # NOTE(review): the original indentation was lost, so the nesting here
        # is reconstructed. Every record overwrites the previous one, meaning
        # the last record wins - presumably the file holds a single record;
        # confirm against the data.
        with jsonlines.open(self.gt_file, 'r') as rfd:
            for data in rfd:
                self.gt_list = parse_gt_str(data['gt'])
        return self.gt_list

    def get_gt_structure(self):
        """Return the first ground-truth record converted by ``html2text``.

        Returns:
            The converted text of the first record, or a fixed error message
            string when the file contains no records.
        """
        # NOTE(review): nesting reconstructed so that the fallback below stays
        # reachable; the early return hands back the first record only.
        with jsonlines.open(self.gt_file, 'r') as rfd:
            for data in rfd:
                return html2text.html2text(data['gt'])
        gt_html = 'Error:并未找到需要该图片的标注信息!'
        return gt_html
# Opening <td> tag with or without attributes (colspan, rowspan, ...).
_TD_OPEN_RE = re.compile(r'<td(?:\s[^>]*)?>')

# Structural wrapper tags that carry no cell content and are simply dropped.
_WRAPPER_TAGS = (
    '<html>', '</html>',
    '<body>', '</body>',
    '<table>', '</table>',
    '<tbody>', '</tbody>',
)

_ROW_END = '</tr>'


def _split_table_rows(text):
    """Flatten table markup into a list of row strings.

    Every cell is terminated with ``'*'`` and rows are separated on
    ``'</tr>'``; wrapper tags and opening ``<td>``/``<tr>`` tags are removed.
    """
    text = _TD_OPEN_RE.sub('', text)
    for tag in _WRAPPER_TAGS:
        text = text.replace(tag, '')
    text = text.replace('</td>', '*')
    text = text.replace('<tr>', '')
    # Trim whole '</tr>' tokens only. str.strip('</tr>') treats its argument
    # as a character set and would also eat leading 't', 'r', '/', '<', '>'
    # characters from the first cell (e.g. 'rate' -> 'ate').
    while text.startswith(_ROW_END):
        text = text[len(_ROW_END):]
    while text.endswith(_ROW_END):
        text = text[:-len(_ROW_END)]
    return text.split(_ROW_END)


def parse_gt_str(text):
    """Convert ground-truth table markup into a list of row strings.

    Args:
        text: Table markup string.

    Returns:
        One string per row, each cell followed by ``'*'``. An empty input
        yields ``['']``.
    """
    return _split_table_rows(text)


def parse_pre_str(text):
    """Convert predicted table markup into a list of row strings.

    Uses exactly the same rules as ``parse_gt_str`` so that both sides can be
    compared row by row.

    Args:
        text: Table markup string.

    Returns:
        One string per row, each cell followed by ``'*'``. An empty input
        yields ``['']``.
    """
    return _split_table_rows(text)
|