fangjiasheng пре 1 година
родитељ
комит
c634c30776
1 измењених фајлова са 163 додато и 17 уклоњено
  1. 163 17
      BiddingKG/dl/interface/Preprocessing.py

+ 163 - 17
BiddingKG/dl/interface/Preprocessing.py

@@ -698,7 +698,7 @@ def tableToText(soup, docid=None):
                 inner_table[i][j] = col
                 inner_table[i][j] = col
 
 
         # 模型预测表头
         # 模型预测表头
-        predict_list = predict(inner_table)
+        # predict_list = predict(inner_table)
 
 
         start_time = time.time()
         start_time = time.time()
         predict_list = predict(inner_table)
         predict_list = predict(inner_table)
@@ -719,6 +719,11 @@ def tableToText(soup, docid=None):
         # repairTable(inner_table)
         # repairTable(inner_table)
         inner_table = table_head_repair_process(inner_table, docid)
         inner_table = table_head_repair_process(inner_table, docid)
 
 
+        # 组合结果
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                inner_table[i][j] = [origin_inner_table[i][j][0], int(inner_table[i][j][1])]
+
         if show:
         if show:
             print("="*80)
             print("="*80)
             print("table_head after repair")
             print("table_head after repair")
@@ -1420,8 +1425,9 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
         for i in range(len(inner_table)):
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
             for j in range(len(inner_table[i])):
                 # 删除前后逗号
                 # 删除前后逗号
-                inner_table[i][j][0] = re.sub('^[,,]+', '', inner_table[i][j][0])
-                inner_table[i][j][0] = re.sub('[,,]+$', '', inner_table[i][j][0])
+                inner_table[i][j][0] = re.sub('^[,, ]+', '', inner_table[i][j][0])
+                inner_table[i][j][0] = re.sub('[,, ]+$', '', inner_table[i][j][0])
+                inner_table[i][j][0] = re.sub('[, ]+', '', inner_table[i][j][0])
         return inner_table
         return inner_table
 
 
     def repair_by_colon(inner_table):
     def repair_by_colon(inner_table):
@@ -1442,7 +1448,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
                         if start_index == 0:
                         if start_index == 0:
                             continue
                             continue
                         if end_index == len(_text):
                         if end_index == len(_text):
-                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and inner_table[i][j+1][0] and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
                                 inner_table[i][j][1] = 0
                                 inner_table[i][j][1] = 0
                                 inner_table[i][j+1][1] = 0
                                 inner_table[i][j+1][1] = 0
                             else:
                             else:
@@ -1461,19 +1467,121 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
         """
         """
         根据列重复修复当前格子的表头值
         根据列重复修复当前格子的表头值
         """
         """
+        # 统计每个值的表头情况
+        col_head_dict = {}
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                col = inner_table[i][j]
+                if col[0] in col_head_dict.keys():
+                    col_head_dict[col[0]] += [col[1]]
+                else:
+                    col_head_dict[col[0]] = [col[1]]
+
         # 多个重复列的预测值不同,以第一个为准
         # 多个重复列的预测值不同,以第一个为准
         for i in range(len(inner_table)):
         for i in range(len(inner_table)):
             col = inner_table[i][0]
             col = inner_table[i][0]
-            for j in range(1, len(inner_table[i])):
+            key = col[0] + '\t' + str(0)
+            dup_dict = {}
+            for j in range(len(inner_table[i])):
                 if inner_table[i][j][0] == col[0]:
                 if inner_table[i][j][0] == col[0]:
-                    if inner_table[i][j][1] != col[1]:
-                        if col != inner_table[i][0]:
-                            inner_table[i][j][1] = col[1]
-                        else:
-                            inner_table[i][0][1] = inner_table[i][j][1]
-                            col = inner_table[i][0]
+                    if key in dup_dict.keys():
+                        dup_dict[key] += [j]
+                    else:
+                        dup_dict[key] = [j]
+
+                    # if inner_table[i][j][1] != col[1]:
+                    #     if col != inner_table[i][0]:
+                    #         inner_table[i][j][1] = col[1]
+                    #     else:
+                    #         inner_table[i][0][1] = inner_table[i][j][1]
+                    #         col = inner_table[i][0]
                 else:
                 else:
                     col = inner_table[i][j]
                     col = inner_table[i][j]
+                    key = col[0] + '\t' + str(j)
+                    dup_dict[key] = [j]
+
+            # print('dup_dict', dup_dict)
+            #
+            for key in dup_dict.keys():
+                index_list = dup_dict.get(key)
+                if len(index_list) <= 1:
+                    continue
+
+                # 需要表头不同
+                table_head_list = []
+                for index in index_list:
+                    table_head_list.append(inner_table[i][index][1])
+                table_head_list = list(set(table_head_list))
+                if len(table_head_list) <= 1:
+                    continue
+
+                # 若是职业特殊处理
+                col = key.split('\t')[0]
+                if re.search('([^人]员|工程师|建造师|经理|安全负责人|技术负责人|合同商务负责人)$', col):
+                    table_head_flag = 0
+                # 看是否包含表头
+                else:
+                    table_head_flag = 0
+                    for index in index_list:
+                        if inner_table[i][index][1] == 1:
+                            table_head_flag = 1
+                            break
+
+                table_head = None
+                if index_list[0] > 0 and index_list[-1] == len(inner_table[i]) - 1:
+                    table_head = inner_table[i][index_list[0]][1]
+                elif index_list[0] > 0 and index_list[-1] != len(inner_table[i]) - 1:
+                    # 查看前面是否有表头-非表头表达
+                    is_head_not_head = 0
+                    for k in range(index_list[0]):
+                        if k+1 < index_list[0] and inner_table[i][k][1] == 1 and inner_table[i][k+1][1] == 0:
+                            is_head_not_head = 1
+                            break
+                    # 查看前后有没有表头
+                    start_has_head = 0
+                    end_has_head = 0
+                    if not is_head_not_head:
+                        for k in range(index_list[0], len(inner_table[i])):
+                            if inner_table[i][k][0] == inner_table[i][index_list[0]][0]:
+                                continue
+                            if inner_table[i][k][1] == 1:
+                                end_has_head = 1
+                                break
+                        for k in range(index_list[0]):
+                            if inner_table[i][k][0] == inner_table[i][index_list[0]][0]:
+                                continue
+                            if inner_table[i][k][1] == 1:
+                                start_has_head = 1
+                                break
+
+                    head_list = col_head_dict.get(inner_table[i][index_list[0]][0])
+                    if is_head_not_head:
+                        table_head = table_head_flag
+                    elif len(head_list) >= 4:
+                        if head_list.count(0) > head_list.count(1):
+                            table_head = 0
+                        else:
+                            table_head = 1
+                    elif not start_has_head and not end_has_head:
+                        table_head = 0
+                    else:
+                        table_head = table_head_flag
+                elif index_list[0] == 0 and index_list[-1] == len(inner_table[i]) - 1:
+                    table_head = table_head_flag
+                elif index_list[0] == 0 and index_list[-1] != len(inner_table[i]) - 1:
+                    head_list = col_head_dict.get(inner_table[i][index_list[0]][0])
+                    if len(head_list) >= 4:
+                        if head_list.count(0) > head_list.count(1):
+                            table_head = 0
+                        else:
+                            table_head = 1
+                    else:
+                        table_head = table_head_flag
+
+                if table_head is not None:
+                    for index in index_list:
+                        inner_table[i][index][1] = table_head
+
         return inner_table
         return inner_table
 
 
     def repair_by_around(inner_table):
     def repair_by_around(inner_table):
@@ -1581,18 +1689,24 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
         根据关键词修复当前格子的表头值
         根据关键词修复当前格子的表头值
         """
         """
         # 修复表头关键词未作为表头
         # 修复表头关键词未作为表头
-        # 末尾匹配匹配关键词,直接作为表头
+        # 末尾匹配关键词且字数小于7,直接作为表头
         head_keyword = ['供应商', '总价', '总价(元)', '总价\(元\)', '品目一', '品目二', '品目三']
         head_keyword = ['供应商', '总价', '总价(元)', '总价\(元\)', '品目一', '品目二', '品目三']
         # 末尾匹配关键词且前一列为表头且与前一列文本不同,直接不做表头
         # 末尾匹配关键词且前一列为表头且与前一列文本不同,直接不做表头
-        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
+        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。', ]
         # 开头匹配关键词,直接不做表头
         # 开头匹配关键词,直接不做表头
-        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑', '低档', '高档',
+                         '更换配置']
         # 文本匹配关键词且前一列为表头,直接作为表头
         # 文本匹配关键词且前一列为表头,直接作为表头
         head_keyword4 = ['综合排名']
         head_keyword4 = ['综合排名']
         # 文本在关键词中,直接不做表头
         # 文本在关键词中,直接不做表头
-        head_keyword5 = ['殡葬用地']
+        head_keyword5 = ['殡葬用地', '电脑包', '电池']
         # 文本匹配关键词,直接不作表头
         # 文本匹配关键词,直接不作表头
-        head_keyword6 = ['市场行情', '有限公司']
+        head_keyword6 = ['市场行情', '有限公司', '能提供']
+        # 末尾匹配关键词,直接不做表头
+        head_keyword7 = ['基金', '结转', '结余', '税', '结余分配', '协议供货', '房屋',
+                         '纳税人', '自然人', '计算所得额']
+        # 文本匹配关键词且整行都是表头,直接做表头
+        head_keyword8 = ['备注']
 
 
         # n1 next one, n2 next two, l1 last one, l2 last two
         # n1 next one, n2 next two, l1 last one, l2 last two
         for i in range(len(inner_table)):
         for i in range(len(inner_table)):
@@ -1608,7 +1722,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
                 #     inner_table[i][j][1] = 1
                 #     inner_table[i][j][1] = 1
                 for key in head_keyword:
                 for key in head_keyword:
                     match = re.search(key+'$', row_col[0])
                     match = re.search(key+'$', row_col[0])
-                    if match:
+                    if match and len(inner_table[i][j][0]) <= 6:
                         inner_table[i][j][1] = 1
                         inner_table[i][j][1] = 1
                 for key in head_keyword2:
                 for key in head_keyword2:
                     match = re.search(key+'$', row_col[0])
                     match = re.search(key+'$', row_col[0])
@@ -1628,9 +1742,34 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
                     match = re.search(key, row_col[0])
                     match = re.search(key, row_col[0])
                     if match:
                     if match:
                         inner_table[i][j][1] = 0
                         inner_table[i][j][1] = 0
+                for key in head_keyword7:
+                    match = re.search(key+'$', row_col[0])
+                    if match and row_col[1] == 1:
+                        inner_table[i][j][1] = 0
+                if row_col[0] in head_keyword8 and row_col[1] == 0:
+                    print('head_keyword8 row_col0', row_col[0])
+                    all_head_flag = 1
+                    for k in range(len(row)):
+                        if row[k][0] in ['', row_col[0]]:
+                            continue
+                        if row[k][1] == 0:
+                            print('row[k]', row[k])
+                            all_head_flag = 0
+                            break
+                    print('all_head_flag', all_head_flag)
+                    if all_head_flag:
+                        inner_table[i][j][1] = 1
+
 
 
         return inner_table
         return inner_table
 
 
+    def repair_by_length(inner_table):
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                if len(inner_table[i][j][0]) >= 30:
+                    inner_table[i][j][1] = 0
+        return inner_table
+
     _inner_table = pre_process(_inner_table)
     _inner_table = pre_process(_inner_table)
     compare_inner_table = copy.deepcopy(_inner_table)
     compare_inner_table = copy.deepcopy(_inner_table)
     if show:
     if show:
@@ -1685,6 +1824,13 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
     if show:
     if show:
         print('table_head_repair_process8', show_row_index, _inner_table[show_row_index])
         print('table_head_repair_process8', show_row_index, _inner_table[show_row_index])
 
 
+    _inner_table = repair_by_length(_inner_table)
+    if _inner_table != compare_inner_table:
+        compare_inner_table = copy.deepcopy(_inner_table)
+        print('table_head repair9 ' + str(docid))
+    if show:
+        print('table_head_repair_process9', show_row_index, _inner_table[show_row_index])
+
     return _inner_table
     return _inner_table