post_decorators.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. from decorator import decorator
  2. import re
  3. @decorator
  4. def rule1_decorator(f, *args, **kwargs):
  5. """
  6. 处理表头第二格合并至第三格的情况
  7. predict_line = ['项目 ', '', '每100克营养素参考值%', '']
  8. """
  9. predict_line = args[1]
  10. predict_line = f(*args, **kwargs)
  11. idx = 0
  12. if '' in predict_line:
  13. idx = predict_line.index('')
  14. try:
  15. if idx == 1:
  16. if '项目' in predict_line[0] and '每100克' in predict_line[2]:
  17. predict_line[1] = '每100克'
  18. r = re.split('每100克', predict_line[2])
  19. if len(r) == 2 and r[1]:
  20. predict_line[2] = r[1]
  21. except IndexError as e:
  22. print('rule1_decorator', e)
  23. return predict_line
  24. @decorator
  25. def rule2_decorator(f, *args, **kwargs):
  26. """
  27. 处理碳水化合物这一行,第二格合并至第一格的问题
  28. predict_line = ['碳水化合物18.2克', '', '6%', '']
  29. """
  30. predict_line = args[1]
  31. predict_line = f(*args, **kwargs)
  32. idx = 0
  33. if '' in predict_line:
  34. idx = predict_line.index('')
  35. try:
  36. if idx == 1:
  37. if '化合物' in predict_line[0]:
  38. r = re.split('化合物', predict_line[0])
  39. predict_line[0] = '碳水化合物'
  40. if len(r) == 2 and r[1]:
  41. predict_line[1] = r[1]
  42. except IndexError as e:
  43. print('rule2_decorator', e)
  44. return predict_line
  45. @decorator
  46. def rule3_decorator(f, *args, **kwargs):
  47. """
  48. 处理易错字
  49. ['患直质', '1.6克', '3%', '']
  50. ['脂扇', '1.1', '19%', '']
  51. ['碳水化合物', '勿18.2克', '6%', '']
  52. ['能量.', '408千焦', '5%']
  53. ['——精', '2.9克']
  54. """
  55. predict_line = args[1]
  56. predict_line = f(*args, **kwargs)
  57. predict_line = [re.sub('患直质', '蛋白质', s) for s in predict_line]
  58. predict_line = [re.sub('脂扇', '脂肪', s) for s in predict_line]
  59. predict_line = [re.sub('勿(.*克)', '\\1', s) for s in predict_line]
  60. predict_line = [re.sub('毫 克', '毫克', s) for s in predict_line]
  61. predict_line = [re.sub('——精', '——糖', s) for s in predict_line]
  62. return predict_line
  63. @decorator
  64. def rule4_decorator(f, *args, **kwargs):
  65. """
  66. 处理表头第一格合并至第二格的问题
  67. ['', '项目每100克', '营养素参考值']
  68. """
  69. predict_line = args[1]
  70. predict_line = f(*args, **kwargs)
  71. try:
  72. if '项目' in predict_line[1] and predict_line[0] == '':
  73. predict_line[0] = '项目'
  74. predict_line[1] = predict_line[1].replace('项目', '')
  75. except IndexError as e:
  76. print('rule4_decorator', e)
  77. return predict_line
  78. @decorator
  79. def rule5_decorator(f, *args, **kwargs):
  80. """
  81. 处理表头第三格合并至第二格的问题
  82. predict_line = ['项目 ', '每份(70g)营养素参考值%', '']
  83. """
  84. predict_line = args[1]
  85. predict_line = f(*args, **kwargs)
  86. try:
  87. if '项目' in predict_line[0] and '营养素参考值' in predict_line[1] and len(predict_line[1]) > 7 and \
  88. predict_line[2] == '':
  89. predict_line[2] = '营养素参考值'
  90. if len(predict_line[1].split('营养素参考值')) > 1:
  91. predict_line[2] = predict_line[2] + predict_line[1].split('营养素参考值')[1]
  92. predict_line[1] = predict_line[1].split('营养素参考值')[0]
  93. except IndexError as e:
  94. print('rule5_decorator', e)
  95. return predict_line
  96. @decorator
  97. def rule6_decorator(f, *args, **kwargs):
  98. """
  99. 处理表头第二格合并至第三格的问题
  100. predict_line = ['项目 ', '', '每份(70g)营养素参考值%', '']
  101. """
  102. predict_line = args[1]
  103. predict_line = f(*args, **kwargs)
  104. idx = 0
  105. if '' in predict_line:
  106. idx = predict_line.index('')
  107. try:
  108. if idx == 1:
  109. if '项目' in predict_line[0] and '每份' in predict_line[2] and '营养素参考值' in predict_line[2]:
  110. predict_line[1] = predict_line[2].split('营养素参考值')[0]
  111. r = predict_line[2].split('营养素参考值')
  112. if len(r) == 2:
  113. predict_line[2] = '营养素参考值' + r[1]
  114. except IndexError as e:
  115. print('rule6_decorator', e)
  116. return predict_line
  117. @decorator
  118. def rule7_decorator(f, *args, **kwargs):
  119. """
  120. 处理项目缺一个字未识别出的问题
  121. predict_line = ['项', '每份(70g)', '营养素参考值%', '']
  122. """
  123. predict_line = f(*args, **kwargs)
  124. try:
  125. if '项目' in predict_line[0] or '项' in predict_line[0] or '目' in predict_line[0]:
  126. predict_line[0] = '项目'
  127. except IndexError as e:
  128. print('rule7_decorator', e)
  129. return predict_line
  130. @decorator
  131. def rule8_decorator(f, *args, **kwargs):
  132. """
  133. 处理表头数据集中在第三格的问题
  134. predict_line = ['', '', '项目每份(70g)营养素参考值%', '']
  135. """
  136. predict_line = f(*args, **kwargs)
  137. try:
  138. if len(predict_line) >= 3 \
  139. and '' == predict_line[0] \
  140. and '' in predict_line[1] \
  141. and ('项' in predict_line[2] or '目' in predict_line[2]) \
  142. and ('100' in predict_line[2] or '克' in predict_line[2]) \
  143. and '营养' in predict_line[2]:
  144. predict_line[0] = '项目'
  145. predict_line[1] = '每100克'
  146. predict_line[2] = '营养素参考值%'
  147. except IndexError as e:
  148. print('rule8_decorator', e)
  149. return predict_line
  150. decorators = []
  151. def register_decorator(dtor):
  152. decorators.append(dtor)
  153. def combined_decorator(func):
  154. for dtor in reversed(decorators):
  155. func = dtor(func)
  156. return func
  157. register_decorator(rule1_decorator)
  158. register_decorator(rule2_decorator)
  159. register_decorator(rule3_decorator)
  160. register_decorator(rule4_decorator)
  161. register_decorator(rule5_decorator)
  162. register_decorator(rule6_decorator)
  163. register_decorator(rule7_decorator)
  164. register_decorator(rule8_decorator)