post_hander.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. import re
  2. from typing import List
  3. from .post_decorators import combined_decorator
  class PostHandler:
      """Re-render a predicted HTML table as a fixed-width table.

      The input HTML is split into rows and cells, each row is passed through
      ``_format_predict_line`` (wrapped by ``combined_decorator``), and the
      result is re-emitted as a table in which every row has ``_COLS`` cells.

      Attributes:
          predict_html: The raw HTML string handed to the constructor.
          format_lines: One entry per detected row, as returned by
              ``_format_predict_line``.
      """

      # Number of cells every rendered row is padded (or truncated) to.
      _COLS = 3
      # A first-row cell containing this marker (Chinese for "ingredient
      # table") is rendered as one cell spanning all columns.
      _TITLE_MARKER = '成分表'

      # Opening tags are matched with ``[^>]*`` instead of a greedy ``.*`` so
      # that a '>' inside the cell text is never swallowed with the tag.
      _TR_OPEN_RE = re.compile(r'<tr\b[^>]*>')
      _TD_OPEN_RE = re.compile(r'<td\b[^>]*>')
      # Greedy on purpose: spans from the first <td> to the last </td> of a row.
      _TD_SPAN_RE = re.compile(r'<td\b[^>]*>.*</td>')

      def __init__(self, predict_html: str):
          """Store the raw HTML and eagerly parse it into formatted lines.

          Args:
              predict_html: HTML text expected to contain ``<tr>``/``<td>``
                  markup.
          """
          self.predict_html = predict_html
          self.format_lines = self._get_format_lines()

      @property
      def format_predict_html(self):
          """Return the table rebuilt from ``format_lines``.

          Each row is rendered with exactly ``_COLS`` cells: missing cells are
          emitted empty and surplus cells are dropped.  In the first row, a
          cell containing ``_TITLE_MARKER`` is emitted with a ``colspan``
          covering all columns and ends that row.

          Returns:
              The rebuilt HTML document, or the original ``predict_html``
              unchanged when no rows were detected.
          """
          if not self.format_lines:
              return self.predict_html

          parts = ['<html><body><table><tbody>']
          for row_idx, line in enumerate(self.format_lines):
              parts.append('<tr>')
              for col_idx in range(self._COLS):
                  # Pad short rows explicitly rather than relying on IndexError.
                  cell = line[col_idx] if col_idx < len(line) else ''
                  if row_idx == 0 and self._TITLE_MARKER in cell:
                      parts.extend((f'<td colspan="{self._COLS}">', cell, '</td>'))
                      # The title cell spans the full width; nothing follows it.
                      break
                  parts.extend(('<td>', cell, '</td>'))
              parts.append('</tr>')
          parts.append('</tbody></table></body></html>')
          return ''.join(parts)

      @combined_decorator
      def _format_predict_line(self, predict_line):
          """Return ``predict_line`` as-is.

          The body is a pass-through; any actual formatting is applied by
          ``combined_decorator``, which is defined in ``.post_decorators``
          (not visible in this file).
          """
          return predict_line

      def _get_format_lines(self):
          """Parse ``predict_html`` into rows of cells and format each row.

          Returns:
              A list with one ``_format_predict_line`` result per detected row.
          """
          return [
              self._format_predict_line(self._split_to_words(line))
              for line in self._get_lines(self.predict_html)
          ]

      def _get_lines(self, html) -> List[str]:
          """Extract the ``<td>...</td>`` run of every table row.

          The HTML is split on opening ``<tr>`` tags (attributes allowed) and,
          within each piece, the span from the first ``<td>`` to the last
          ``</td>`` on a line is collected.

          Example result:
              ['<td>项目</td><td>每100克</td><td>营养素参考值%</td>', ...]

          Args:
              html: HTML text to scan.

          Returns:
              The matched cell runs in document order; empty when the input
              contains no ``<td>`` cells.
          """
          lines = []
          for row in self._TR_OPEN_RE.split(html):
              lines.extend(self._TD_SPAN_RE.findall(row))
          return lines

      def _split_to_words(self, line):
          """Split one row's cell run into the text of each cell.

          Example:
              line: '<td>项目</td><td>每100克</td><td>营养素参考值%</td>'
              result: ['项目', '每100克', '营养素参考值%', '']

          The trailing empty string is the remainder after the final
          ``</td>`` and is kept, so a row with N cells yields N + 1 items.

          Args:
              line: A string of consecutive ``<td>...</td>`` cells.

          Returns:
              The cell contents with their opening ``<td ...>`` tags removed.
          """
          return [self._TD_OPEN_RE.sub('', word) for word in line.split('</td>')]