check_table.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
import re

import cv2
import numpy as np
  3. class Table:
  4. def __init__(self, html, img=[]):
  5. self.img = img
  6. self.html = html
  7. self.html_arr = []
  8. self.total = 0
  9. self.empty = 0
  10. # def get_body(self):
  11. # try:
  12. # res = self.html.split('<tbody>')[1]
  13. # except Exception as r:
  14. # print('<tbody> 识别失败')
  15. # print(r)
  16. # try:
  17. # res = res.split('</tbody>')[0]
  18. # except Exception as r:
  19. # print('</tbody> 识别失败')
  20. # print(r)
  21. # return res
  22. def get_tr(self):
  23. # str = self.get_body()
  24. str = self.html
  25. if len(str.split('<tr>')) > 1:
  26. return str.split('<tr>')[1:]
  27. else:
  28. return []
  29. def get_td(self):
  30. if self.html_arr != []:
  31. return
  32. tr_list = self.get_tr()
  33. for i in range(len(tr_list)):
  34. if tr_list[i] == '':
  35. continue
  36. tr = tr_list[i].split('</td>')[:-1]
  37. temp_list = []
  38. for cell in tr:
  39. if '<td colspan=\\"3\\">' in cell:
  40. temp_list.append(cell.split('<td colspan=\\"3\\">')[1])
  41. if '<td colspan="3">' in cell:
  42. temp_list.append(cell.split('<td colspan="3">')[1])
  43. if '<td>' in cell:
  44. temp_list.append(cell.split('<td>')[1])
  45. print(temp_list)
  46. self.html_arr.append(temp_list)
  47. def get_empty(self):
  48. self.get_td()
  49. if self.total != 0:
  50. return
  51. for tr in self.html_arr:
  52. for cell in tr:
  53. self.total += 1
  54. if cell == '':
  55. self.empty += 1
  56. def change_green2white(self):
  57. hsv = cv2.cvtColor(self.img, cv2.COLOR_BGR2HSV)
  58. lower_green = np.array([35, 43, 46])
  59. upper_green = np.array([77, 220, 255])
  60. mask_green = cv2.inRange(hsv, lower_green, upper_green)
  61. color = [248, 248, 255]
  62. self.img[mask_green != 0] = color
  63. def get_str(self):
  64. str = ''
  65. for tr in self.html_arr:
  66. for cell in tr:
  67. str += cell
  68. return str
  69. def check_html(self):
  70. self.get_empty()
  71. html_str = self.get_str()
  72. print(self.html)
  73. print(self.html_arr)
  74. print(self.empty)
  75. if (self.empty > 4 and self.empty > self.total // 4) or (
  76. '项目' in html_str and '每份' in html_str and '营养素参考值' in html_str and np.max(
  77. [len(a) for a in self.html_arr]) < 3):
  78. print('识别效果不佳,改变图片颜色!')
  79. self.change_green2white()
  80. return 1
  81. return 0