# post_hander.py (2.8 KB)
import logging
import re
from typing import List

from .post_decorators import combined_decorator
  4. class PostHandler:
  5. def __init__(self, predict_html):
  6. self.predict_html = predict_html
  7. self.format_lines = self._get_format_lines()
  8. # 将二维列表处理为想要的富文本格式
  9. @property
  10. def format_predict_html(self):
  11. if self.format_lines:
  12. header = '<html><body><table><tbody>'
  13. footer = '</tbody></table></body></html>'
  14. COLS = 3
  15. html = []
  16. for i, line in enumerate(self.format_lines):
  17. print('-=-=-=-=', line)
  18. html.append('<tr>')
  19. for j in range(COLS):
  20. col_html = []
  21. try:
  22. if i == 0 and '成分表' in line[j]:
  23. col_html.append('<td colspan="3">')
  24. col_html.append(line[j])
  25. col_html.append('</td>')
  26. html.extend(col_html)
  27. break
  28. else:
  29. col_html.append('<td>')
  30. col_html.append(line[j])
  31. col_html.append('</td>')
  32. except IndexError as e:
  33. print(e)
  34. col_html = ['<td>', '', '</td>']
  35. html.extend(col_html)
  36. html.append('</tr>')
  37. res = f'{header}{"".join(html)}{footer}'
  38. return res
  39. else:
  40. return self.predict_html
  41. # 对每一行进行处理
  42. @combined_decorator
  43. def _format_predict_line(self, predict_line):
  44. return predict_line
  45. # 对每一行进行处理
  46. def _get_format_lines(self):
  47. format_lines = []
  48. predict_lines = self._get_lines(self.predict_html)
  49. predict_words_list = [self._split_to_words(line) for line in predict_lines]
  50. for line in predict_words_list:
  51. line = self._format_predict_line(line)
  52. format_lines.append(line)
  53. return format_lines
  54. # 获取每一行
  55. def _get_lines(self, html) -> List[str]:
  56. '''
  57. res: ['<td>项目</td><td>每100克</td><td>营养素参考值%</td>',...]
  58. '''
  59. rows = re.split('<tr>', html)
  60. res = []
  61. for row in rows:
  62. m = re.findall('<td.*>.*</td>', row)
  63. if m:
  64. res.extend(m)
  65. return res
  66. # 切分每一个格子
  67. def _split_to_words(self, line):
  68. '''
  69. line: '<td>项目</td><td>每100克</td><td>营养素参考值%</td>'
  70. res: ['项目', '每100克', '营养素参考值%', '']
  71. '''
  72. res = [re.sub('<td.*>', '', word) for word in re.split('</td>', line)]
  73. return res