# post_hander.py (2.6 KB)

  1. import re
  2. from typing import List
  3. from .post_decorators import *
  4. class PostHandler:
  5. def __init__(self, predict_html):
  6. self.predict_html = predict_html
  7. self.format_lines = self._get_format_lines()
  8. @property
  9. def format_predict_html(self):
  10. if self.format_lines:
  11. header = '<html><body><table><tbody>'
  12. footer = '</tbody></table></body></html>'
  13. COLS = 3
  14. html = []
  15. for i, line in enumerate(self.format_lines):
  16. html.append('<tr>')
  17. for j in range(COLS):
  18. try:
  19. if i == 0 and '成分表' in line[j]:
  20. html.append('<td colspan="3">')
  21. html.append(line[j])
  22. html.append('</td>')
  23. break
  24. else:
  25. html.append('<td>')
  26. html.append(line[j])
  27. html.append('</td>')
  28. except IndexError as e:
  29. print(e)
  30. html.append('<td>')
  31. html.append('')
  32. html.append('</td>')
  33. continue
  34. html.append('</tr>')
  35. res = f'{header}{"".join(html)}{footer}'
  36. return res
  37. else:
  38. return self.predict_html
  39. @rule5_decorator
  40. @rule4_decorator
  41. @rule3_decorator
  42. @rule2_decorator
  43. @rule1_decorator
  44. def _format_predict_line(self, predict_line):
  45. return predict_line
  46. def _get_format_lines(self):
  47. format_lines = []
  48. predict_lines = self._get_lines(self.predict_html)
  49. predict_words_list = [self._split_to_words(line) for line in predict_lines]
  50. for line in predict_words_list:
  51. line = self._format_predict_line(line)
  52. format_lines.append(line)
  53. return format_lines
  54. def _get_lines(self, html) -> List[str]:
  55. '''
  56. res: ['<td>项目</td><td>每100克</td><td>营养素参考值%</td>',...]
  57. '''
  58. rows = re.split('<tr>', html)
  59. res = []
  60. for row in rows:
  61. m = re.findall('<td.*>.*</td>', row)
  62. if m:
  63. res.extend(m)
  64. return res
  65. def _split_to_words(self, line):
  66. '''
  67. line: '<td>项目</td><td>每100克</td><td>营养素参考值%</td>'
  68. res: ['项目', '每100克', '营养素参考值%', '']
  69. '''
  70. res = [re.sub('<td.*>', '', word) for word in re.split('</td>', line)]
  71. return res