# Source repository: chenguilong/ocr-table
import re

import cv2
import numpy as np

class Table:
    """An OCR-produced HTML table together with the image it came from.

    The HTML is parsed with plain string splitting (no HTML parser) into
    ``html_arr``, a list of rows where each row is a list of cell strings.
    ``check_html`` uses simple heuristics on that structure to decide whether
    recognition looks poor and, if so, recolors part of the image in place.
    """

    # Opening <td> tag with optional attributes.  Any attribute list is
    # accepted (colspan, rowspan, class, ...), including attribute values
    # whose quotes are backslash-escaped, e.g. <td colspan=\"3\">.
    _TD_OPEN = re.compile(r'<td(?:\s[^>]*)?>')

    def __init__(self, html, img=None):
        """Store the raw inputs and reset the parse/statistics caches.

        Parameters:
            html (str): HTML string containing the table markup.
            img: Image array for the table (passed to OpenCV by
                ``change_hard2white``).  When omitted, a fresh empty list is
                used per instance rather than a shared mutable default.
        """
        self.img = [] if img is None else img
        self.html = html
        self.html_arr = []  # parsed table content: list of rows of cell strings
        self.total = 0  # total number of cells
        self.empty = 0  # number of cells whose content is the empty string

    def get_tr(self):
        """Return the raw text of each table row.

        The HTML is split on the literal ``<tr>`` tag, so ``<tr ...>`` tags
        carrying attributes are not recognised.

        Returns:
            list: One string per ``<tr>`` found (the text following the tag),
            or an empty list when the HTML contains no ``<tr>``.
        """
        # Everything before the first <tr> is preamble and is discarded;
        # slicing yields [] automatically when no <tr> is present.
        return self.html.split('<tr>')[1:]

    def get_td(self):
        """Parse the HTML into ``self.html_arr`` (rows of cell strings).

        Does nothing if ``html_arr`` is already populated.  Each row is split
        on ``</td>``; the text after the opening ``<td ...>`` tag of each
        piece becomes one cell.  Pieces without an opening ``<td>`` tag are
        ignored.

        Returns:
            None
        """
        if self.html_arr:
            return
        for row in self.get_tr():
            if row == '':
                continue
            cells = []
            # The last piece is whatever follows the final </td> (e.g. the
            # closing </tr>), not a cell.
            for fragment in row.split('</td>')[:-1]:
                pieces = self._TD_OPEN.split(fragment)
                if len(pieces) > 1:
                    cells.append(pieces[1])
            # Diagnostic output retained from the original implementation.
            print(cells)
            self.html_arr.append(cells)

    def get_empty(self):
        """Count total and empty cells into ``self.total`` / ``self.empty``.

        Triggers parsing via ``get_td`` if needed.  The counts are computed
        only while ``self.total`` is still zero, so repeated calls do not
        double count.

        Returns:
            None
        """
        self.get_td()
        if self.total != 0:
            return
        for row in self.html_arr:
            self.total += len(row)
            self.empty += row.count('')

    def change_hard2white(self, hard_color):
        """Overwrite pixels inside an HSV range with a near-white color.

        ``self.img`` is converted with ``cv2.COLOR_BGR2HSV`` (i.e. it is
        treated as a BGR image) and every pixel whose HSV value lies within
        the given bounds is replaced in place by ``[248, 248, 255]``.

        Parameters:
            hard_color: Pair ``(lower, upper)`` of HSV bounds, each convertible
                by ``np.array`` and accepted by ``cv2.inRange``.

        Returns:
            None
        """
        # A Table built without an image has nothing to recolor; skip instead
        # of handing an empty array to OpenCV.
        if self.img is None or np.size(self.img) == 0:
            return
        fill_color = [248, 248, 255]
        hsv = cv2.cvtColor(self.img, cv2.COLOR_BGR2HSV)
        lower_bound = np.array(hard_color[0])
        upper_bound = np.array(hard_color[1])
        mask = cv2.inRange(hsv, lower_bound, upper_bound)
        self.img[mask != 0] = fill_color

    def get_str(self):
        """Concatenate all parsed cell texts.

        Returns:
            str: Cell contents of ``self.html_arr`` joined in row-major order
            with no separator.
        """
        return ''.join(cell for row in self.html_arr for cell in row)

    def check_html(self, hard_color):
        """Judge recognition quality and recolor the image if it looks poor.

        Recognition is considered poor when either:

        * more than four cells are empty and the number of empty cells also
          exceeds a quarter of all cells (``total // 4``, floor division); or
        * the table text contains all of '项目', '每份' and '营养素参考值' but
          the widest row has fewer than three cells.

        NOTE(review): the original comment described the first rule as
        "more than four OR more than a quarter", while the code requires
        both; the code's behavior is kept here - confirm the intent.

        Parameters:
            hard_color: HSV ``(lower, upper)`` bounds forwarded to
                ``change_hard2white`` when recognition is poor.

        Returns:
            int: 1 if recognition looks poor (image recoloring was requested),
            0 otherwise.
        """
        self.get_empty()
        html_str = self.get_str()

        too_many_blanks = self.empty > 4 and self.empty > self.total // 4

        keywords = ('项目', '每份', '营养素参考值')
        widest_row = max((len(row) for row in self.html_arr), default=0)
        columns_collapsed = (
            all(word in html_str for word in keywords) and widest_row < 3
        )

        if too_many_blanks or columns_collapsed:
            print('识别效果不佳，改变图片颜色！')
            self.change_hard2white(hard_color)
            return 1
        return 0