xujiayue 2 years ago
parent
commit
84b12e718a
1 changed files with 52 additions and 47 deletions
  1. 52 47
      core/business_parse.py

+ 52 - 47
core/business_parse.py

@@ -174,63 +174,68 @@ class BussinessParse1(object):
         h, w, _ = image.shape
         down_list = []
         down_list2 = []
+        raw_txt = ''
+        down_txt = ''
+        raw_conf_list = []
+        down_conf_list = []
+        down_conf = 0.0
+        simple_key = False
         for i in raw_results:
-            if bool(re.match(r'注册资本', i.txt)):
-                [x0, _] = i.lt
-                [_, y1] = i.rb
-                down_list.append([x0, y1])
-            elif bool(re.search(r'日期', i.txt)):
+            if bool(re.search(r'日期', i.txt)):
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 down_list.append([x0, y1])
             elif bool(re.match(r'营业期限', i.txt)):
+                simple_key = True
                 [x0, _] = i.lt
                 [_, y1] = i.rb
                 down_list.append([x0, y1])
             elif bool(re.match(r'登记', i.txt)):
                 [_, y0] = i.lt
                 down_list2.append(y0)
-
-        t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
-        l2 = sorted(down_list, key=lambda x: x[0])[0][0]
-        d2 = int(down_list2[0]) if len(down_list2) else h
-        down_img = image[int(t2): d2, int(l2): w]
-
-        down_result = self.ocr.ocr(down_img)
-
-        down_conf_list = []
-        down_conf = 0.0
-        down_txt = ''
-        for idx, res in enumerate(down_result):
-            # print(res)
-            if len(down_result) - 1 != idx:
-                if bool(re.match(r'经营范围', res[1][0])):
-                    t = res[0][0][1]
-                    d = res[0][2][1]
-                    if len(down_result[idx - 1][1][0]) > 15 and abs(
-                            down_result[idx - 1][0][2][1] - res[0][0][1]) <= abs(
-                        d - t) * 1.8:
-                        down_txt += down_result[idx - 1][1][0]
-                    down_txt += res[1][0]
-                    down_conf_list.append(res[1][1])
-                    down_position = down_result[idx + 1][0][0][0]
-                    down_down_position = (down_result[idx + 1][0][2][1] + down_result[idx + 1][0][3][1]) // 2
-                    for x in down_result[idx + 1:]:
-                        print(abs(down_down_position - x[0][0][1]))
-                        print(abs(d - t) * 1.2)
-                        if abs(x[0][0][0] - down_position) < 130 and abs(down_down_position - x[0][0][1]) <= abs(
-                                d - t) * 1.8:
-                            down_down_position = (x[0][2][1] + x[0][3][1]) // 2
-                            if down_txt[-1] == x[1][0][0]:
-                                down_txt += x[1][0][1:]
-                            else:
-                                down_txt += x[1][0]
-                            down_conf_list.append(x[1][1])
-                        # print(down_txt)
+            elif bool(re.match(r'经营范围', i.txt)):
+                [x0, y0] = i.lt
+                [x1, _] = i.rb
+                # 第一方案:
+                for j in raw_results:
+                    [x, _] = j.lt
+                    [_, y] = j.rb
+                    if abs(x - x1) <= abs(x1 - x0) and y >= y0 and '登记' not in j.txt:
+                        raw_txt += j.txt
+                        raw_conf_list.append(j.conf)
+        if len(down_list) and len(down_list2):
+            t2 = sorted(down_list, key=lambda x: x[1], reverse=True)[0][1]
+            l2 = sorted(down_list, key=lambda x: x[0])[0][0]
+            d2 = int(down_list2[0]) if len(down_list2) else h
+            down_img = image[int(t2): d2, int(l2): w]
+            h1, w1, _ = down_img.shape
+            down_result = self.ocr.ocr(down_img)
+            # print('simple_key', simple_key)
+            # 第二方案(检索到‘营业期限’关键词)
+            if simple_key:
+                # print('111')
+                for res in down_result:
+                    l1 = res[0][0][0]
+                    if l1 < (7 * w1 // 24):
+                        down_txt += res[1][0]
+                        down_conf_list.append(res[1][1])
+                # print(down_txt)
+            # 第三套方案
+            else:
+                for idx, res in enumerate(down_result):
+                    if bool(re.match(r'经营范围', res[1][0])):
+                        t = res[0][0][1]
+                        for i in down_result:
+                            if i[0][2][1] < t and i[0][0][0] < (7 * w1 // 24):
+                                down_txt += res[1][0]
+                                down_conf_list.append(res[1][1])
         down_txt = down_txt.replace('经营范围', '')
+        raw_txt = raw_txt.replace('经营范围', '')
         if len(down_conf_list):
             down_conf = sum(down_conf_list) / len(down_conf_list)
-
+        if len(raw_txt) > len(down_txt):
+            down_txt = raw_txt
+            down_conf = sum(raw_conf_list) / len(raw_conf_list)
         # cv2.imshow('11', down_img)
         # cv2.waitKey(0)
         return down_txt, down_conf
@@ -239,6 +244,10 @@ class BussinessParse1(object):
         h, w, _ = image.shape
         top_list1 = []
         top_list2 = []
+        top_conf_list = []
+        top_conf = 0.0
+        top_txt = ''
+        last_key = ''
         type_key = False
         for i in raw_results:
             if bool(re.match(r'法定代表', i.txt)) or bool(re.match(r'负责人', i.txt)):
@@ -269,10 +278,6 @@ class BussinessParse1(object):
         top_img = image[int(t1): int(d1), int(l1): w]
         top_result = self.ocr.ocr(top_img)
 
-        top_conf_list = []
-        top_conf = 0.0
-        top_txt = ''
-        last_key = ''
         # 住所信息
         for idx, res in enumerate(top_result):
             # print(res)