check_table.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. import cv2
  2. import numpy as np
  3. class Table:
  4. def __init__(self, html, img=[]):
  5. self.img = img
  6. self.html = html
  7. self.html_arr = []
  8. self.total = 0
  9. self.empty = 0
  10. def get_body(self):
  11. print(self.html)
  12. try:
  13. res = self.html.split('<tbody>')[1]
  14. except Exception as r:
  15. print('<tbody> 识别失败')
  16. print(r)
  17. try:
  18. res = res.split('</tbody>')[0]
  19. except Exception as r:
  20. print('</tbody> 识别失败')
  21. print(r)
  22. return res
  23. def get_tr(self):
  24. # str = self.get_body()
  25. str = self.html
  26. if len(str.split('<tr>')) > 1:
  27. return str.split('<tr>')[1:]
  28. else:
  29. return []
  30. def get_td(self):
  31. if self.html_arr != []:
  32. return
  33. tr_list = self.get_tr()
  34. for i in range(len(tr_list)):
  35. if tr_list[i] == '':
  36. continue
  37. tr = tr_list[i].split('</td>')[:-1]
  38. temp_list = []
  39. for cell in tr:
  40. if '<td colspan=\\"3\\">' in cell:
  41. temp_list.append(cell.split('<td colspan=\\"3\\">')[1])
  42. if '<td colspan="3">' in cell:
  43. temp_list.append(cell.split('<td colspan="3">')[1])
  44. if '<td>' in cell:
  45. temp_list.append(cell.split('<td>')[1])
  46. print(temp_list)
  47. self.html_arr.append(temp_list)
  48. def get_empty(self):
  49. self.get_td()
  50. if self.total != 0:
  51. return
  52. for tr in self.html_arr:
  53. for cell in tr:
  54. self.total += 1
  55. if cell == '':
  56. self.empty += 1
  57. def change_green2white(self):
  58. hsv = cv2.cvtColor(self.img, cv2.COLOR_BGR2HSV)
  59. lower_green = np.array([35, 43, 46])
  60. upper_green = np.array([77, 220, 255])
  61. mask_green = cv2.inRange(hsv, lower_green, upper_green)
  62. color = [248, 248, 255]
  63. self.img[mask_green != 0] = color
  64. def get_str(self):
  65. str = ''
  66. for tr in self.html_arr:
  67. for cell in tr:
  68. str+=cell
  69. return str
  70. def check_html(self):
  71. self.get_empty()
  72. html_str = self.get_str()
  73. print(self.html)
  74. print(self.html_arr)
  75. print(self.empty)
  76. if (self.empty > 4 and self.empty > self.total // 4) or ('项目' in html_str and '每份' in html_str and '营养素参考值' in html_str and np.max([len(a) for a in self.html_arr])<3):
  77. print('识别效果不佳,改变图片颜色!')
  78. self.change_green2white()
  79. return 1
  80. return 0