123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- from decorator import decorator
- import re
@decorator
def rule1_decorator(f, *args, **kwargs):
    """Split a header whose second cell was merged into the third one.

    Calls the wrapped callable and post-processes the row it returns
    (presumably a list of cell strings -- confirm against the caller).
    The rule only fires when the first empty cell of the row is at index 1,
    cell 0 contains '项目' and cell 2 contains '每100克'.  Cell 1 is then set
    to '每100克', and cell 2 is replaced by the text that followed '每100克'
    when that marker occurs exactly once and is followed by something.

    Example:
        ['项目 ', '', '每100克营养素参考值%', '']
        -> ['项目 ', '每100克', '营养素参考值%', '']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    predict_line = f(*args, **kwargs)

    # Only rows whose first empty cell is the second column are candidates.
    if '' not in predict_line or predict_line.index('') != 1:
        return predict_line

    try:
        # predict_line[2] raises IndexError on a two-cell row; that case is
        # reported and the row is returned untouched.
        if '项目' in predict_line[0] and '每100克' in predict_line[2]:
            predict_line[1] = '每100克'
            pieces = re.split('每100克', predict_line[2])
            if len(pieces) == 2 and pieces[1]:
                predict_line[2] = pieces[1]
    except IndexError as e:
        print('rule1_decorator', e)
    return predict_line
@decorator
def rule2_decorator(f, *args, **kwargs):
    """Split a carbohydrate row whose value was merged into the name cell.

    Calls the wrapped callable and post-processes the row it returns
    (presumably a list of cell strings -- confirm against the caller).
    The rule only fires when the first empty cell of the row is at index 1
    and cell 0 contains '化合物'.  Cell 0 is then normalised to '碳水化合物'
    (whatever preceded '化合物'), and cell 1 receives the text that followed
    '化合物' when that marker occurs exactly once and is followed by
    something.

    Example:
        ['碳水化合物18.2克', '', '6%', '']
        -> ['碳水化合物', '18.2克', '6%', '']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    predict_line = f(*args, **kwargs)

    # Only rows whose first empty cell is the second column are candidates.
    # That also guarantees cells 0 and 1 exist, so no IndexError can occur
    # below and no handler is needed.
    if '' not in predict_line or predict_line.index('') != 1:
        return predict_line

    if '化合物' in predict_line[0]:
        pieces = re.split('化合物', predict_line[0])
        predict_line[0] = '碳水化合物'
        if len(pieces) == 2 and pieces[1]:
            predict_line[1] = pieces[1]
    return predict_line
@decorator
def rule3_decorator(f, *args, **kwargs):
    """Correct characters that are commonly misrecognised in a row.

    Calls the wrapped callable and applies a fixed sequence of regular
    expression substitutions to every cell of the returned row (the cells
    must be strings, since they are passed to ``re.sub``).

    Examples of rows the substitutions target:
        ['患直质', '1.6克', '3%', '']        -> '患直质' becomes '蛋白质'
        ['脂扇', '1.1', '19%', '']           -> '脂扇' becomes '脂肪'
        ['碳水化合物', '勿18.2克', '6%', ''] -> a '勿' before '...克' is dropped
        ['——精', '2.9克']                    -> '——精' becomes '——糖'
    A space inside '毫 克' is also removed.

    NOTE(review): the original notes also listed ['能量.', '408千焦', '5%'],
    but no substitution here handles the trailing dot.

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        A new list holding the corrected cells.
    """
    predict_line = f(*args, **kwargs)

    # (pattern, replacement) pairs, applied in this order to every cell.
    substitutions = (
        ('患直质', '蛋白质'),
        ('脂扇', '脂肪'),
        ('勿(.*克)', '\\1'),
        ('毫 克', '毫克'),
        ('——精', '——糖'),
    )
    for pattern, replacement in substitutions:
        predict_line = [re.sub(pattern, replacement, s) for s in predict_line]
    return predict_line
@decorator
def rule4_decorator(f, *args, **kwargs):
    """Split a header whose first cell was merged into the second one.

    Calls the wrapped callable and post-processes the row it returns
    (presumably a list of cell strings -- confirm against the caller).
    When cell 1 contains '项目' and cell 0 is empty, cell 0 becomes '项目'
    and every occurrence of '项目' is removed from cell 1.

    Example:
        ['', '项目每100克', '营养素参考值']
        -> ['项目', '每100克', '营养素参考值']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    predict_line = f(*args, **kwargs)
    try:
        # Rows with fewer than two cells raise IndexError here; that case is
        # reported and the row is returned untouched.
        if '项目' in predict_line[1] and predict_line[0] == '':
            predict_line[0] = '项目'
            predict_line[1] = predict_line[1].replace('项目', '')
    except IndexError as e:
        print('rule4_decorator', e)
    return predict_line
@decorator
def rule5_decorator(f, *args, **kwargs):
    """Split a header whose third cell was merged into the second one.

    Calls the wrapped callable and post-processes the row it returns
    (presumably a list of cell strings -- confirm against the caller).
    The rule fires when cell 0 contains '项目', cell 1 contains
    '营养素参考值' and is longer than 7 characters, and cell 2 is empty.
    Cell 1 is split on '营养素参考值': the text before the marker stays in
    cell 1, and cell 2 becomes the marker followed by the text after it.

    Example:
        ['项目 ', '每份(70g)营养素参考值%', '']
        -> ['项目 ', '每份(70g)', '营养素参考值%']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    predict_line = f(*args, **kwargs)
    try:
        # Rows with fewer than three cells raise IndexError here; that case
        # is reported and the row is returned untouched.
        if ('项目' in predict_line[0]
                and '营养素参考值' in predict_line[1]
                and len(predict_line[1]) > 7
                and predict_line[2] == ''):
            # The marker is known to be present, so the split always yields
            # at least two pieces.
            pieces = predict_line[1].split('营养素参考值')
            predict_line[2] = '营养素参考值' + pieces[1]
            predict_line[1] = pieces[0]
    except IndexError as e:
        print('rule5_decorator', e)
    return predict_line
@decorator
def rule6_decorator(f, *args, **kwargs):
    """Split a per-serving header whose second cell was merged into the third.

    Calls the wrapped callable and post-processes the row it returns
    (presumably a list of cell strings -- confirm against the caller).
    The rule only fires when the first empty cell of the row is at index 1,
    cell 0 contains '项目', and cell 2 contains both '每份' and
    '营养素参考值'.  Cell 1 receives the text before the first
    '营养素参考值'; when the marker occurs exactly once, cell 2 becomes the
    marker followed by the text after it.

    Example:
        ['项目 ', '', '每份(70g)营养素参考值%', '']
        -> ['项目 ', '每份(70g)', '营养素参考值%', '']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    predict_line = f(*args, **kwargs)

    # Only rows whose first empty cell is the second column are candidates.
    if '' not in predict_line or predict_line.index('') != 1:
        return predict_line

    try:
        # predict_line[2] raises IndexError on a two-cell row; that case is
        # reported and the row is returned untouched.
        if ('项目' in predict_line[0]
                and '每份' in predict_line[2]
                and '营养素参考值' in predict_line[2]):
            pieces = predict_line[2].split('营养素参考值')
            predict_line[1] = pieces[0]
            if len(pieces) == 2:
                predict_line[2] = '营养素参考值' + pieces[1]
    except IndexError as e:
        print('rule6_decorator', e)
    return predict_line
@decorator
def rule7_decorator(f, *args, **kwargs):
    """Restore the header word '项目' when it was only partly recognised.

    Calls the wrapped callable and inspects the first cell of the returned
    row.  If that cell contains '项目', '项' or '目', the whole cell is
    replaced by '项目'.

    Example:
        ['项', '每份(70g)', '营养素参考值%', '']
        -> ['项目', '每份(70g)', '营养素参考值%', '']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    row = f(*args, **kwargs)
    try:
        # An empty row raises IndexError here; it is reported and returned
        # as-is.
        first_cell = row[0]
        if any(token in first_cell for token in ('项目', '项', '目')):
            row[0] = '项目'
    except IndexError as e:
        print('rule7_decorator', e)
    return row
@decorator
def rule8_decorator(f, *args, **kwargs):
    """Rebuild a header whose three cells all ended up in the third cell.

    Calls the wrapped callable and post-processes the row it returns
    (presumably a list of cell strings -- confirm against the caller).
    The rule fires when the row has at least three cells, cells 0 and 1 are
    both empty, and cell 2 contains ('项' or '目'), ('100' or '克') and
    '营养'.  The first three cells are then overwritten with the fixed
    header '项目', '每100克', '营养素参考值%'; the recognised text itself
    is discarded.

    Example:
        ['', '', '项目每100克营养素参考值%', '']
        -> ['项目', '每100克', '营养素参考值%', '']

    Args:
        f: the wrapped callable.
        *args, **kwargs: forwarded to ``f`` unchanged.

    Returns:
        The row returned by ``f``, modified in place when the rule applies.
    """
    predict_line = f(*args, **kwargs)

    # The length guard makes every index below valid, so no IndexError
    # handler is needed.
    if len(predict_line) < 3:
        return predict_line

    merged = predict_line[2]
    # Cell 1 must be *equal* to '' -- a substring test ("'' in cell") is
    # true for every string and would let populated rows through.
    if (predict_line[0] == ''
            and predict_line[1] == ''
            and ('项' in merged or '目' in merged)
            and ('100' in merged or '克' in merged)
            and '营养' in merged):
        predict_line[0] = '项目'
        predict_line[1] = '每100克'
        predict_line[2] = '营养素参考值%'
    return predict_line
# Rule decorators in the order they were registered.
decorators = []


def register_decorator(dtor):
    """Append ``dtor`` to the module-level ``decorators`` registry.

    Args:
        dtor: a decorator, i.e. a callable that takes a function and returns
            a replacement for it.

    Returns:
        ``dtor`` itself, so the function can also be used as
        ``@register_decorator`` without rebinding the decorated name to None.
    """
    decorators.append(dtor)
    return dtor


def combined_decorator(func):
    """Wrap ``func`` with every registered decorator.

    Decorators are applied in reverse registration order, so the decorator
    registered first ends up outermost and the one registered last is the
    innermost wrapper around ``func``.

    Args:
        func: the function to wrap.

    Returns:
        The fully wrapped function, or ``func`` itself when nothing has been
        registered.
    """
    for dtor in reversed(decorators):
        func = dtor(func)
    return func
# Register the rules in numeric order.  combined_decorator applies the
# registry in reverse, so rule8 becomes the innermost wrapper and rule1 the
# outermost one.
for _rule in (
    rule1_decorator,
    rule2_decorator,
    rule3_decorator,
    rule4_decorator,
    rule5_decorator,
    rule6_decorator,
    rule7_decorator,
    rule8_decorator,
):
    register_decorator(_rule)
del _rule  # do not leave the loop variable behind as a module attribute
|