chenguilong
/
ocr-table


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
							import re
from typing import List
from .post_decorators import combined_decorator

class PostHandler:
    """Post-process a predicted HTML table into a fixed three-column table.

    On construction the predicted HTML is broken into runs of ``<td>`` cells,
    each run is split into its cell texts, and every list of cell texts is
    passed through ``_format_predict_line`` (wrapped by ``combined_decorator``,
    which is defined outside this class -- NOTE(review): its exact effect is
    not visible here). The result is stored in ``format_lines`` and rendered
    on demand by ``format_predict_html``.
    """

    # Opening ``<td ...>`` tag. ``[^>]*`` stops at the tag's own closing
    # ``>``; the previous greedy ``.*`` ran to the LAST ``>`` in the fragment
    # and therefore deleted cell content such as ``>=5`` or nested tags.
    _TD_OPEN = re.compile(r'<td[^>]*>')
    # Everything from the first ``<td ...>`` to the last ``</td>`` on one
    # physical line ('.' does not cross newlines).
    _TD_RUN = re.compile(r'<td[^>]*>.*</td>')

    def __init__(self, predict_html):
        """Store the raw predicted HTML and pre-compute the formatted lines.

        Args:
            predict_html: HTML string of the predicted table.
        """
        self.predict_html = predict_html
        self.format_lines = self._get_format_lines()

    @property
    def format_predict_html(self):
        """Render ``format_lines`` as a complete HTML table document.

        Every row is emitted with exactly three cells; rows with fewer
        entries are padded with empty ``<td></td>`` cells. In the first row,
        a cell containing '成分表' (roughly "ingredient/composition table")
        is emitted with ``colspan="3"`` and ends that row immediately.

        Returns:
            The rebuilt HTML string, or the original ``predict_html``
            unchanged when no lines could be extracted.
        """
        if not self.format_lines:
            return self.predict_html

        header = '<html><body><table><tbody>'
        footer = '</tbody></table></body></html>'
        COLS = 3
        html = []
        for i, line in enumerate(self.format_lines):
            html.append('<tr>')
            for j in range(COLS):
                # Only the index lookup can raise IndexError, so only it is
                # guarded. A short row is an expected input and is padded
                # quietly rather than printing the exception to stdout.
                try:
                    cell = line[j]
                except IndexError:
                    html.extend(('<td>', '', '</td>'))
                    continue
                if i == 0 and '成分表' in cell:
                    # Title cell spans the full width; nothing else belongs
                    # in this row after it.
                    html.extend((f'<td colspan="{COLS}">', cell, '</td>'))
                    break
                html.extend(('<td>', cell, '</td>'))
            html.append('</tr>')
        return f'{header}{"".join(html)}{footer}'

    @combined_decorator
    def _format_predict_line(self, predict_line):
        """Return ``predict_line`` as-is; the decorator supplies the processing.

        The body is an identity function. Any transformation of the line is
        performed by ``combined_decorator`` (defined elsewhere).
        """
        return predict_line

    def _get_format_lines(self):
        """Extract, split and format every cell run of ``predict_html``.

        Returns:
            A list with one entry per extracted cell run, each being the
            value returned by ``_format_predict_line`` for that run's words.
        """
        predict_lines = self._get_lines(self.predict_html)
        # Split everything first, then format, keeping the original ordering
        # of the two phases.
        words_per_line = [self._split_to_words(line) for line in predict_lines]
        return [self._format_predict_line(words) for words in words_per_line]

    def _get_lines(self, html) -> List[str]:
        """Collect the ``<td>...</td>`` runs found in ``html``.

        The HTML is cut at every literal ``<tr>``; within each piece, every
        physical line contributes the span from its first ``<td`` tag to its
        last ``</td>``.

        Example result:
            ['<td>Item</td><td>per 100g</td><td>NRV%</td>', ...]
        """
        res = []
        for row in re.split(r'<tr>', html):
            # findall yields [] when the piece has no cells, so no extra
            # emptiness check is needed.
            res.extend(self._TD_RUN.findall(row))
        return res

    def _split_to_words(self, line):
        """Split one run of cells into the text of each cell.

        The line is cut at every ``</td>`` and the opening ``<td ...>`` tag is
        removed from each fragment. Because the input normally ends with
        ``</td>``, the result carries a trailing empty string.

        Example:
            line: '<td>Item</td><td>per 100g</td><td>NRV%</td>'
            res:  ['Item', 'per 100g', 'NRV%', '']
        """
        return [self._TD_OPEN.sub('', word) for word in re.split(r'</td>', line)]