1 year ago · c634c30776
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -698,7 +698,7 @@ def tableToText(soup, docid=None):
 
				                 inner_table[i][j] = col
			
 
				 
			
 
				         # 模型预测表头
			
 
				-        predict_list = predict(inner_table)
			
 
				+        # predict_list = predict(inner_table)
			
 
				 
			
 
				         start_time = time.time()
			
 
				         predict_list = predict(inner_table)
			
@@ -719,6 +719,11 @@ def tableToText(soup, docid=None):
 
				         # repairTable(inner_table)
			
 
				         inner_table = table_head_repair_process(inner_table, docid)
			
 
				 
			
 
				+        # 组合结果
			
 
				+        for i in range(len(inner_table)):
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                inner_table[i][j] = [origin_inner_table[i][j][0], int(inner_table[i][j][1])]
			
 
				+
			
 
				         if show:
			
 
				             print("="*80)
			
 
				             print("table_head after repair")
			
@@ -1420,8 +1425,9 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				         for i in range(len(inner_table)):
			
 
				             for j in range(len(inner_table[i])):
			
 
				                 # 删除前后逗号
			
 
				-                inner_table[i][j][0] = re.sub('^[，,]+', '', inner_table[i][j][0])
			
 
				-                inner_table[i][j][0] = re.sub('[，,]+$', '', inner_table[i][j][0])
			
 
				+                inner_table[i][j][0] = re.sub('^[，, ]+', '', inner_table[i][j][0])
			
 
				+                inner_table[i][j][0] = re.sub('[，, ]+$', '', inner_table[i][j][0])
			
 
				+                inner_table[i][j][0] = re.sub('[， ]+', '', inner_table[i][j][0])
			
 
				         return inner_table
			
 
				 
			
 
				     def repair_by_colon(inner_table):
			
@@ -1442,7 +1448,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				                         if start_index == 0:
			
 
				                             continue
			
 
				                         if end_index == len(_text):
			
 
				-                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
			
 
				+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and inner_table[i][j+1][0] and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
			
 
				                                 inner_table[i][j][1] = 0
			
 
				                                 inner_table[i][j+1][1] = 0
			
 
				                             else:
			
@@ -1461,19 +1467,121 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				         """
			
 
				         根据列重复修复当前格子的表头值
			
 
				         """
			
 
				+        # 统计每个值的表头情况
			
 
				+        col_head_dict = {}
			
 
				+        for i in range(len(inner_table)):
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                col = inner_table[i][j]
			
 
				+                if col[0] in col_head_dict.keys():
			
 
				+                    col_head_dict[col[0]] += [col[1]]
			
 
				+                else:
			
 
				+                    col_head_dict[col[0]] = [col[1]]
			
 
				+
			
 
				         # 多个重复列的预测值不同，以第一个为准
			
 
				         for i in range(len(inner_table)):
			
 
				             col = inner_table[i][0]
			
 
				-            for j in range(1, len(inner_table[i])):
			
 
				+            key = col[0] + '\t' + str(0)
			
 
				+            dup_dict = {}
			
 
				+            for j in range(len(inner_table[i])):
			
 
				                 if inner_table[i][j][0] == col[0]:
			
 
				-                    if inner_table[i][j][1] != col[1]:
			
 
				-                        if col != inner_table[i][0]:
			
 
				-                            inner_table[i][j][1] = col[1]
			
 
				-                        else:
			
 
				-                            inner_table[i][0][1] = inner_table[i][j][1]
			
 
				-                            col = inner_table[i][0]
			
 
				+                    if key in dup_dict.keys():
			
 
				+                        dup_dict[key] += [j]
			
 
				+                    else:
			
 
				+                        dup_dict[key] = [j]
			
 
				+
			
 
				+                    # if inner_table[i][j][1] != col[1]:
			
 
				+                    #     if col != inner_table[i][0]:
			
 
				+                    #         inner_table[i][j][1] = col[1]
			
 
				+                    #     else:
			
 
				+                    #         inner_table[i][0][1] = inner_table[i][j][1]
			
 
				+                    #         col = inner_table[i][0]
			
 
				                 else:
			
 
				                     col = inner_table[i][j]
			
 
				+                    key = col[0] + '\t' + str(j)
			
 
				+                    dup_dict[key] = [j]
			
 
				+
			
 
				+            # print('dup_dict', dup_dict)
			
 
				+            #
			
 
				+            for key in dup_dict.keys():
			
 
				+                index_list = dup_dict.get(key)
			
 
				+                if len(index_list) <= 1:
			
 
				+                    continue
			
 
				+
			
 
				+                # 需要表头不同
			
 
				+                table_head_list = []
			
 
				+                for index in index_list:
			
 
				+                    table_head_list.append(inner_table[i][index][1])
			
 
				+                table_head_list = list(set(table_head_list))
			
 
				+                if len(table_head_list) <= 1:
			
 
				+                    continue
			
 
				+
			
 
				+                # 若是职业特殊处理
			
 
				+                col = key.split('\t')[0]
			
 
				+                if re.search('([^人]员|工程师|建造师|经理|安全负责人|技术负责人|合同商务负责人)$', col):
			
 
				+                    table_head_flag = 0
			
 
				+                # 看是否包含表头
			
 
				+                else:
			
 
				+                    table_head_flag = 0
			
 
				+                    for index in index_list:
			
 
				+                        if inner_table[i][index][1] == 1:
			
 
				+                            table_head_flag = 1
			
 
				+                            break
			
 
				+
			
 
				+                table_head = None
			
 
				+                if index_list[0] > 0 and index_list[-1] == len(inner_table[i]) - 1:
			
 
				+                    table_head = inner_table[i][index_list[0]][1]
			
 
				+                elif index_list[0] > 0 and index_list[-1] != len(inner_table[i]) - 1:
			
 
				+                    # 查看前面是否有表头-非表头表达
			
 
				+                    is_head_not_head = 0
			
 
				+                    for k in range(index_list[0]):
			
 
				+                        if k+1 < index_list[0] and inner_table[i][k][1] == 1 and inner_table[i][k+1][1] == 0:
			
 
				+                            is_head_not_head = 1
			
 
				+                            break
			
 
				+                    # 查看前后有没有表头
			
 
				+                    start_has_head = 0
			
 
				+                    end_has_head = 0
			
 
				+                    if not is_head_not_head:
			
 
				+                        for k in range(index_list[0], len(inner_table[i])):
			
 
				+                            if inner_table[i][k][0] == inner_table[i][index_list[0]][0]:
			
 
				+                                continue
			
 
				+                            if inner_table[i][k][1] == 1:
			
 
				+                                end_has_head = 1
			
 
				+                                break
			
 
				+                        for k in range(index_list[0]):
			
 
				+                            if inner_table[i][k][0] == inner_table[i][index_list[0]][0]:
			
 
				+                                continue
			
 
				+                            if inner_table[i][k][1] == 1:
			
 
				+                                start_has_head = 1
			
 
				+                                break
			
 
				+
			
 
				+                    head_list = col_head_dict.get(inner_table[i][index_list[0]][0])
			
 
				+                    if is_head_not_head:
			
 
				+                        table_head = table_head_flag
			
 
				+                    elif len(head_list) >= 4:
			
 
				+                        if head_list.count(0) > head_list.count(1):
			
 
				+                            table_head = 0
			
 
				+                        else:
			
 
				+                            table_head = 1
			
 
				+                    elif not start_has_head and not end_has_head:
			
 
				+                        table_head = 0
			
 
				+                    else:
			
 
				+                        table_head = table_head_flag
			
 
				+                elif index_list[0] == 0 and index_list[-1] == len(inner_table[i]) - 1:
			
 
				+                    table_head = table_head_flag
			
 
				+                elif index_list[0] == 0 and index_list[-1] != len(inner_table[i]) - 1:
			
 
				+                    head_list = col_head_dict.get(inner_table[i][index_list[0]][0])
			
 
				+                    if len(head_list) >= 4:
			
 
				+                        if head_list.count(0) > head_list.count(1):
			
 
				+                            table_head = 0
			
 
				+                        else:
			
 
				+                            table_head = 1
			
 
				+                    else:
			
 
				+                        table_head = table_head_flag
			
 
				+
			
 
				+                if table_head is not None:
			
 
				+                    for index in index_list:
			
 
				+                        inner_table[i][index][1] = table_head
			
 
				+
			
 
				         return inner_table
			
 
				 
			
 
				     def repair_by_around(inner_table):
			
@@ -1581,18 +1689,24 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				         根据关键词修复当前格子的表头值
			
 
				         """
			
 
				         # 修复表头关键词未作为表头
			
 
				-        # 末尾匹配匹配关键词，直接作为表头
			
 
				+        # 末尾匹配匹配关键词且字数小于7，直接作为表头
			
 
				         head_keyword = ['供应商', '总价', '总价（元）', '总价\(元\)', '品目一', '品目二', '品目三']
			
 
				         # 末尾匹配关键词且前一列为表头且与前一列文本不同，直接不做表头
			
 
				-        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
			
 
				+        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。', ]
			
 
				         # 开头匹配关键词，直接不做表头
			
 
				-        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
			
 
				+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑', '低档', '高档',
			
 
				+                         '更换配置']
			
 
				         # 文本匹配关键词且前一列为表头，直接作为表头
			
 
				         head_keyword4 = ['综合排名']
			
 
				         # 文本在关键词中，直接不做表头
			
 
				-        head_keyword5 = ['殡葬用地']
			
 
				+        head_keyword5 = ['殡葬用地', '电脑包', '电池']
			
 
				         # 文本匹配关键词，直接不作表头
			
 
				-        head_keyword6 = ['市场行情', '有限公司']
			
 
				+        head_keyword6 = ['市场行情', '有限公司', '能提供']
			
 
				+        # 末尾匹配关键词，直接不做表头
			
 
				+        head_keyword7 = ['基金', '结转', '结余', '税', '结余分配', '协议供货', '房屋',
			
 
				+                         '纳税人', '自然人', '计算所得额']
			
 
				+        # 文本匹配关键词且整行都是表头，直接做表头
			
 
				+        head_keyword8 = ['备注']
			
 
				 
			
 
				         # n1 next one, n2 next two, l1 last one, l2 last two
			
 
				         for i in range(len(inner_table)):
			
@@ -1608,7 +1722,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				                 #     inner_table[i][j][1] = 1
			
 
				                 for key in head_keyword:
			
 
				                     match = re.search(key+'$', row_col[0])
			
 
				-                    if match:
			
 
				+                    if match and len(inner_table[i][j][0]) <= 6:
			
 
				                         inner_table[i][j][1] = 1
			
 
				                 for key in head_keyword2:
			
 
				                     match = re.search(key+'$', row_col[0])
			
@@ -1628,9 +1742,34 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				                     match = re.search(key, row_col[0])
			
 
				                     if match:
			
 
				                         inner_table[i][j][1] = 0
			
 
				+                for key in head_keyword7:
			
 
				+                    match = re.search(key+'$', row_col[0])
			
 
				+                    if match and row_col[1] == 1:
			
 
				+                        inner_table[i][j][1] = 0
			
 
				+                if row_col[0] in head_keyword8 and row_col[1] == 0:
			
 
				+                    print('head_keyword8 row_col0', row_col[0])
			
 
				+                    all_head_flag = 1
			
 
				+                    for k in range(len(row)):
			
 
				+                        if row[k][0] in ['', row_col[0]]:
			
 
				+                            continue
			
 
				+                        if row[k][1] == 0:
			
 
				+                            print('row[k]', row[k])
			
 
				+                            all_head_flag = 0
			
 
				+                            break
			
 
				+                    print('all_head_flag', all_head_flag)
			
 
				+                    if all_head_flag:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+
			
 
				 
			
 
				         return inner_table
			
 
				 
			
 
				+    def repair_by_length(inner_table):
			
 
				+        for i in range(len(inner_table)):
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                if len(inner_table[i][j][0]) >= 30:
			
 
				+                    inner_table[i][j][1] = 0
			
 
				+        return inner_table
			
 
				+
			
 
				     _inner_table = pre_process(_inner_table)
			
 
				     compare_inner_table = copy.deepcopy(_inner_table)
			
 
				     if show:
			
@@ -1685,6 +1824,13 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				     if show:
			
 
				         print('table_head_repair_process8', show_row_index, _inner_table[show_row_index])
			
 
				 
			
 
				+    _inner_table = repair_by_length(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        print('table_head repair9 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process9', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				     return _inner_table