datasets.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import html2text
  2. import jsonlines
  3. class Dataset(object):
  4. def __init__(self, gt_file, img_name, results):
  5. self.gt_file = gt_file
  6. self.img_name = img_name
  7. self.results = results
  8. self.pre_list = []
  9. self.gt_list = []
  10. def __len__(self):
  11. return [len(self.pre_list), len(self.gt_list)]
  12. def get_pre_list(self):
  13. pre_xml = self.results['tableList'][0]
  14. self.pre_list = parse_pre_str(pre_xml)
  15. return self.pre_list
  16. def get_pre_structure(self):
  17. pre_xml = self.results['tableList'][0]
  18. # print('gt', pre_xml)
  19. pre_html = html2text.html2text(pre_xml) # str
  20. return pre_html
  21. def get_gt_list(self):
  22. with jsonlines.open(self.gt_file, 'r') as rfd:
  23. for data in rfd:
  24. gt_xml = data['gt']
  25. # print(gt_xml)
  26. self.gt_list = parse_gt_str(gt_xml)
  27. return self.gt_list
  28. def get_gt_structure(self):
  29. with jsonlines.open(self.gt_file, 'r') as rfd:
  30. for data in rfd:
  31. gt_html = html2text.html2text(data['gt']) # str
  32. return gt_html
  33. gt_html = 'Error:并未找到需要该图片的标注信息!'
  34. return gt_html
  35. def parse_gt_str(text):
  36. text = text.replace('<td colspan="3">', '')
  37. text = text.replace('<td colspan="2">', '')
  38. text = text.replace('<td rowspan="2">', '')
  39. text = text.replace('<html>', '')
  40. text = text.replace('</html>', '')
  41. text = text.replace('<body>', '')
  42. text = text.replace('</body>', '')
  43. text = text.replace('<table>', '')
  44. text = text.replace('</table>', '')
  45. text = text.replace('<tbody>', '')
  46. text = text.replace('</tbody>', '')
  47. # print('gt', text)
  48. text = text.replace('<td>', '')
  49. text = text.replace('</td>', '*')
  50. text = text.replace('<tr>', '')
  51. return text.strip('</tr>').split('</tr>')
  52. def parse_pre_str(text):
  53. text = text.replace('<td colspan="3">', '')
  54. text = text.replace('<td colspan="2">', '')
  55. text = text.replace('<td rowspan="2">', '')
  56. text = text.replace('<html>', '')
  57. text = text.replace('</html>', '')
  58. text = text.replace('<body>', '')
  59. text = text.replace('</body>', '')
  60. text = text.replace('<table>', '')
  61. text = text.replace('</table>', '')
  62. text = text.replace('<tbody>', '')
  63. text = text.replace('</tbody>', '')
  64. # print('pre', text)
  65. text = text.replace('<td>', '')
  66. text = text.replace('</td>', '*')
  67. text = text.replace('<tr>', '')
  68. # return text.strip('</tr>').split('</tr>')
  69. return text.strip('</tr>').split('</tr>')