1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- import cv2
- import numpy as np
class Table:
    """Heuristic checker for a table recognised as an HTML string plus its image.

    The HTML is parsed with plain string splitting (no HTML parser): the first
    ``<tbody>`` section is cut into ``<tr>`` rows and ``<td>`` cells. The parsed
    cells are cached in ``html_arr`` and summarised in ``total`` / ``empty``.
    """

    # Opening tag of a cell spanning three columns, with the quotes
    # backslash-escaped exactly as they appear in the incoming HTML string.
    _TD_COLSPAN3 = '<td colspan=\\"3\\">'
    _TD_PLAIN = '<td>'

    def __init__(self, html, img=None):
        """Store the table HTML and its image.

        Args:
            html: HTML text of the table.
            img: Image the table was recognised from. It is handed to
                ``cv2.cvtColor(..., cv2.COLOR_BGR2HSV)`` in
                ``change_green2white``, so presumably a BGR ``numpy`` array
                (confirm against callers). Defaults to a fresh empty list.
        """
        # A new list per instance: a literal ``[]`` default would be shared by
        # every Table created without an image.
        self.img = [] if img is None else img
        self.html = html
        self.html_arr = []  # parsed rows, each a list of cell strings
        self.total = 0      # number of parsed cells
        self.empty = 0      # number of parsed cells whose text is ''

    def get_body(self):
        """Return the text between the first ``<tbody>`` and ``</tbody>``.

        Returns:
            The body text. An empty string when ``<tbody>`` is missing (a
            message is printed); everything after ``<tbody>`` when the closing
            tag is missing (a message is printed as well).
        """
        parts = self.html.split('<tbody>')
        if len(parts) < 2:
            # No opening tag: report it and return an empty body instead of
            # failing on an unbound result.
            print('<tbody> 识别失败')
            return ''
        body, closing, _ = parts[1].partition('</tbody>')
        if not closing:
            print('</tbody> 识别失败')
        return body

    def get_tr(self):
        """Split the table body on ``<tr>``.

        Returns:
            The pieces of the body split on ``'<tr>'`` (the piece before the
            first row is included, usually ``''``), or ``[]`` when the body
            contains no ``<tr>`` at all.
        """
        rows = self.get_body().split('<tr>')
        return rows if len(rows) > 1 else []

    def get_td(self):
        """Parse the rows into ``self.html_arr`` (list of lists of cell text).

        Does nothing when ``html_arr`` is already populated. Only cells opened
        with a bare ``<td>`` or with the escaped ``colspan="3"`` tag are
        collected; cells with any other opening tag are ignored.
        """
        if self.html_arr:
            return
        for row in self.get_tr():
            if row == '':
                continue
            cells = []
            # The piece after the last '</td>' is the row's tail ('</tr>'...),
            # not a cell, so it is dropped.
            for chunk in row.split('</td>')[:-1]:
                if self._TD_COLSPAN3 in chunk:
                    cells.append(chunk.split(self._TD_COLSPAN3)[1])
                if self._TD_PLAIN in chunk:
                    cells.append(chunk.split(self._TD_PLAIN)[1])
            self.html_arr.append(cells)

    def get_empty(self):
        """Count all cells (``self.total``) and empty cells (``self.empty``).

        Parses the HTML first if needed. Counting is skipped when ``total`` is
        already non-zero, so repeated calls do not double count.
        """
        self.get_td()
        if self.total != 0:
            return
        cells = [cell for row in self.html_arr for cell in row]
        self.total = len(cells)
        self.empty = sum(1 for cell in cells if cell == '')

    def change_green2white(self):
        """Overwrite, in place, the pixels of ``self.img`` that fall in the green HSV range."""
        hsv = cv2.cvtColor(self.img, cv2.COLOR_BGR2HSV)
        lower_green = np.array([35, 43, 46])
        upper_green = np.array([77, 220, 255])
        mask_green = cv2.inRange(hsv, lower_green, upper_green)
        # Near-white replacement colour written to every masked pixel.
        color = [248, 248, 255]
        self.img[mask_green != 0] = color

    def get_str(self):
        """Return the text of all parsed cells concatenated in row order."""
        return ''.join(cell for row in self.html_arr for cell in row)

    def check_html(self):
        """Decide whether the table needs the green-to-white recolouring.

        The image is recoloured when either
        * more than 4 cells are empty and they exceed a quarter of all cells, or
        * the cell text contains '项目', '每份' and '营养素参考值' (the headers of a
          nutrition-facts table) and every row has fewer than 3 cells.

        Returns:
            1 if the image was recoloured, otherwise 0.
        """
        self.get_empty()
        text = self.get_str()
        mostly_empty = self.empty > 4 and self.empty > self.total // 4
        narrow_nutrition_table = (
            '项目' in text
            and '每份' in text
            and '营养素参考值' in text
            and max((len(row) for row in self.html_arr), default=0) < 3
        )
        if mostly_empty or narrow_nutrition_table:
            self.change_green2white()
            return 1
        return 0
|