1 жил өмнө · fe9e673dff
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
 
				 
			
 
				 
			
 
				 #
			
 
				-def tableToText(soup):
			
 
				+def tableToText(soup, docid=None):
			
 
				     '''
			
 
				     @param:
			
 
				         soup:网页html的soup
			
@@ -181,7 +181,7 @@ def tableToText(soup):
 
				             count_1 = 0
			
 
				             count_0 = 0
			
 
				             for i in range(len(line)):
			
 
				-                if line[i][0]==fix_value:
			
 
				+                if line[i][0] == fix_value:
			
 
				                     continue
			
 
				                 if line[i][1]==1:
			
 
				                     if first_1==-1:
			
@@ -211,12 +211,12 @@ def tableToText(soup):
 
				             """
			
 
				             @summary: 计算每个节点受到的挤压度来判断是否需要染色
			
 
				             """
			
 
				-            #print("B",inner_table[index])
			
 
				+            # print("B",inner_table[index])
			
 
				             min_presure = 3
			
 
				             list_dye = []
			
 
				             first = None
			
 
				             count = 0
			
 
				-            temp_set = set()
			
 
				+            temp_set = set(['~~'])
			
 
				             _index = 0
			
 
				             for item in inner_table[index]:
			
 
				                 if first is None:
			
@@ -272,7 +272,7 @@ def tableToText(soup):
 
				                             dye_set.add((inner_table[index][h][0],dye_type))
			
 
				                             key_set.add(inner_table[index][h][0])
			
 
				                     begin = end
			
 
				-                #print("E",inner_table[index])
			
 
				+                # print("E",inner_table[index])
			
 
				 
			
 
				         def otherrepair(inner_table,index,dye_set,key_set):
			
 
				             list_provide_repair = []
			
@@ -327,25 +327,254 @@ def tableToText(soup):
 
				             return
			
 
				         repairTable(inner_table, dye_set, key_set)
			
 
				 
			
 
				-    def repair_table2(inner_table):
			
 
				+    def repair_table2(inner_table, show=0, row_no=0):
			
 
				         """
			
 
				         @summary: 修复表头识别，将明显错误的进行修正
			
 
				         """
			
 
				-        # 修复第一第二第三中标候选人作为列表头
			
 
				-        if len(inner_table) >= 2 and len(inner_table[0]) >= 3:
			
 
				-            for i in range(len(inner_table[:3])):
			
 
				-                for j in range(len(inner_table[i])-2):
			
 
				-                    if inner_table[i][j][0] == '第一中标候选人' \
			
 
				-                            and inner_table[i][j+1][0] == '第二中标候选人' \
			
 
				-                            and inner_table[i][j+2][0] == '第三中标候选人' \
			
 
				-                            and i+1 < len(inner_table) \
			
 
				-                            and inner_table[i+1][j][1] == 0 \
			
 
				-                            and inner_table[i+1][j+1][1] == 0 \
			
 
				-                            and inner_table[i+1][j+2][1] == 0:
			
 
				+
			
 
				+        # 循环处理单元格，一次获取需要的
			
 
				+        one_head_index_list = []
			
 
				+        zero_head_index_list = []
			
 
				+        all_head_index_list = []
			
 
				+        for i in range(len(inner_table)):
			
 
				+            head_cnt = 0
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                # 删除前后逗号
			
 
				+                inner_table[i][j][0] = re.sub('^[，,]+', '', inner_table[i][j][0])
			
 
				+                inner_table[i][j][0] = re.sub('[，,]+$', '', inner_table[i][j][0])
			
 
				+
			
 
				+                # 统计表头数
			
 
				+                if inner_table[i][j][1] == 1:
			
 
				+                    head_cnt += 1
			
 
				+
			
 
				+            # 表头数list
			
 
				+            if head_cnt == 0:
			
 
				+                zero_head_index_list.append(i)
			
 
				+            elif head_cnt == 1:
			
 
				+                one_head_index_list.append(i)
			
 
				+            elif head_cnt == len(inner_table[i]):
			
 
				+                all_head_index_list.append(i)
			
 
				+
			
 
				+        # 修复冒号在文本中间的，不能作为表头；(冒号后面需多个字)
			
 
				+        # 冒号在括号中的除外
			
 
				+        # 冒号在最后的，判断后一个格子是否有重复的文字
			
 
				+        for i in range(len(inner_table)):
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                _text = inner_table[i][j][0]
			
 
				+                if len(_text) >= 3 and inner_table[i][j][1] == 1:
			
 
				+                    match = re.search('[:：]', _text)
			
 
				+                    if match:
			
 
				+                        start_index, end_index = match.span()
			
 
				+                        if start_index == 0:
			
 
				+                            continue
			
 
				+                        if end_index == len(_text):
			
 
				+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
			
 
				+                                inner_table[i][j][1] = 0
			
 
				+                                inner_table[i][j+1][1] = 0
			
 
				+                            else:
			
 
				+                                continue
			
 
				+                        if re.search('[(（]', _text[:start_index]) and re.search('[)）]', _text[end_index:]):
			
 
				+                            continue
			
 
				+
			
 
				+                        m1 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[:start_index])
			
 
				+                        m2 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[end_index:])
			
 
				+                        if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
			
 
				+                            inner_table[i][j][1] = 0
			
 
				+
			
 
				+        if show:
			
 
				+            print('inner_table[i]1', inner_table[row_no])
			
 
				+
			
 
				+        # 修复实际只有几列，但有一列由于重复占了太多行表头识别错误
			
 
				+        # for i in range(len(inner_table)):
			
 
				+        #     head_flag_dict = {}
			
 
				+        #     for j in range(len(inner_table[i])):
			
 
				+        #         if inner_table[i][j][0] in head_flag_dict.keys():
			
 
				+        #             head_flag_dict[inner_table[i][j][0]] += [inner_table[i][j][1]]
			
 
				+        #         else:
			
 
				+        #             head_flag_dict[inner_table[i][j][0]] = [inner_table[i][j][1]]
			
 
				+        #
			
 
				+        #     if len(head_flag_dict.keys()) == 2:
			
 
				+        #         col_flag = None
			
 
				+        #         col_value = None
			
 
				+        #         for key in head_flag_dict.keys():
			
 
				+        #             flag_list = head_flag_dict[key]
			
 
				+        #             if len(flag_list) >= 4 and len(set(flag_list)) == 2 and len(set(flag_list[1:])) == 1:
			
 
				+        #                 col_flag = flag_list[0]
			
 
				+        #                 col_value = key
			
 
				+        #                 break
			
 
				+        #
			
 
				+        #         if col_flag is not None:
			
 
				+        #             for j in range(len(inner_table[i])):
			
 
				+        #                 if inner_table[i][j][0] == col_value:
			
 
				+        #                     inner_table[i][j][1] = col_flag
			
 
				+
			
 
				+        # 多个重复列的预测值不同，以第一个为准
			
 
				+        for i in range(len(inner_table)):
			
 
				+            col = inner_table[i][0]
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                if inner_table[i][j][0] == col[0]:
			
 
				+                    if inner_table[i][j][1] != col[1]:
			
 
				+                        inner_table[i][j][1] = col[1]
			
 
				+                else:
			
 
				+                    col = inner_table[i][j]
			
 
				+
			
 
				+        if show:
			
 
				+            print('inner_table[i]2', inner_table[row_no])
			
 
				+
			
 
				+        # 修复多个重复的单元格表头不一致
			
 
				+        # for i in range(len(inner_table)):
			
 
				+        #     for j in range(len(inner_table[i])-1):
			
 
				+        #         only_chinese1 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j][0]))
			
 
				+        #         only_chinese2 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j+1][0]))
			
 
				+        #         if only_chinese1 == only_chinese2 and inner_table[i][j][1] != inner_table[i][j+1][1]:
			
 
				+        #             inner_table[i][j][1] = 1
			
 
				+        #             inner_table[i][j+1][1] = 1
			
 
				+
			
 
				+        # if show:
			
 
				+        #     print('inner_table[i]3', inner_table[row_no])
			
 
				+
			
 
				+        # # 修复一行几乎都是表头，个别不是；或者一行几乎都是非表头，个别是
			
 
				+        # for i in range(len(inner_table)):
			
 
				+        #     head_dict = {}
			
 
				+        #     not_head_dict = {}
			
 
				+        #     for j in range(len(inner_table[i])):
			
 
				+        #         if inner_table[i][j][1] == 1:
			
 
				+        #             if inner_table[i][j][0] not in head_dict:
			
 
				+        #                 head_dict[inner_table[i][j][0]] = 1
			
 
				+        #         else:
			
 
				+        #             if inner_table[i][j][0] not in not_head_dict:
			
 
				+        #                 not_head_dict[inner_table[i][j][0]] = 1
			
 
				+        #
			
 
				+        #     # 非表头:表头 <= 1:3
			
 
				+        #     # if len(head_dict.keys()) > 0 and len(not_head_dict.keys()) / len(head_dict.keys()) <= 1/3 and len(head_dict.keys()) >= 3:
			
 
				+        #     #     for j in range(len(inner_table[i])):
			
 
				+        #     #         if len(re.sub(' ', '', inner_table[i][j][0])) > 0:
			
 
				+        #     #             inner_table[i][j][1] = 1
			
 
				+        #
			
 
				+        #     # 表头数一个且非表头数大于2且上一行都是表头
			
 
				+        #     if i > 0 and len(head_dict.keys()) == 1 and len(not_head_dict.keys()) >= 2 and inner_table[i][0][1] == 0:
			
 
				+        #         last_row = inner_table[i-1]
			
 
				+        #         col_list = []
			
 
				+        #         for j in range(len(last_row)):
			
 
				+        #             if len(re.sub(' ', '', last_row[j][0])) > 0:
			
 
				+        #                 if last_row[j][1] == 0:
			
 
				+        #                     col_list = []
			
 
				+        #                     break
			
 
				+        #                 col_list.append(last_row[j][0])
			
 
				+        #         if col_list:
			
 
				+        #             col_list = list(set(col_list))
			
 
				+        #             if len(col_list) > 2:
			
 
				+        #                 for j in range(len(inner_table[i])):
			
 
				+        #                     if inner_table[i][j][1] == 1:
			
 
				+        #                         inner_table[i][j][1] = 0
			
 
				+
			
 
				+        # 一整个大表格，第一行为表头，下面行中有个别格子被识别为表头
			
 
				+        # 候选人后面修复
			
 
				+        for index in one_head_index_list:
			
 
				+            if (index - 1 in zero_head_index_list and index - 2 in zero_head_index_list) \
			
 
				+                    or (index - 1 in zero_head_index_list and index - 2 in all_head_index_list) \
			
 
				+                    or (index - 1 in all_head_index_list):
			
 
				+                for j in range(len(inner_table[index])):
			
 
				+                    inner_table[index][j][1] = 0
			
 
				+                zero_head_index_list.append(index)
			
 
				+
			
 
				+
			
 
				+
			
 
				+        if show:
			
 
				+            print('inner_table[i]4', inner_table[row_no])
			
 
				+
			
 
				+        # 修复第一第二第三中标候选人作为表头
			
 
				+        first_tenderer = ['第一中标候选人', '第一中标人', '第一中标（成交）人', '第一候选人']
			
 
				+        second_tenderer = ['第二中标候选人', '第二中标（成交）候选人', '第二候选人']
			
 
				+        third_tenderer = ['第三中标候选人', '第三中标（成交）候选人', '第三候选人']
			
 
				+        # n1 next one, n2 next two, l1 last one, l2 last two
			
 
				+        for i in range(len(inner_table)):
			
 
				+            row = inner_table[i]
			
 
				+            n1_row, n2_row = None, None
			
 
				+            if i+1 < len(inner_table):
			
 
				+                n1_row = inner_table[i+1]
			
 
				+            if i+2 < len(inner_table):
			
 
				+                n2_row = inner_table[i+2]
			
 
				+            for j in range(len(row)):
			
 
				+                row_col = row[j]
			
 
				+                n1_row_col, n2_row_col = None, None
			
 
				+                row_n1_col, row_n2_col = None, None
			
 
				+                n1_row_n1_col, n2_row_n1_col, n1_row_n2_col = None, None, None
			
 
				+                if n1_row:
			
 
				+                    n1_row_col = n1_row[j]
			
 
				+                if n2_row:
			
 
				+                    n2_row_col = n2_row[j]
			
 
				+                if j+1 < len(row):
			
 
				+                    row_n1_col = row[j+1]
			
 
				+                if j+2 < len(row):
			
 
				+                    row_n2_col = row[j+2]
			
 
				+                if n1_row and j+1 < len(n1_row):
			
 
				+                    n1_row_n1_col = n1_row[j+1]
			
 
				+                if n2_row and j+1 < len(n2_row):
			
 
				+                    n2_row_n1_col = n2_row[j+1]
			
 
				+                if n1_row and j+2 < len(n1_row):
			
 
				+                    n1_row_n2_col = n1_row[j+2]
			
 
				+
			
 
				+                # 连续作为行表头
			
 
				+                if row_col[0] in first_tenderer and row_n1_col and row_n1_col[1] == 0:
			
 
				+                    if n1_row_col and n1_row_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+                        inner_table[i+1][j][1] = 1
			
 
				+                    if n2_row_col and n2_row_col[0] in third_tenderer and n2_row_n1_col and n2_row_n1_col[1] == 0:
			
 
				+                        inner_table[i+2][j][1] = 1
			
 
				+
			
 
				+                # 连续作为列表头
			
 
				+                if row_col[0] in first_tenderer and n1_row_col and n1_row_col[1] == 0:
			
 
				+                    if row_n1_col and row_n1_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
			
 
				                         inner_table[i][j][1] = 1
			
 
				                         inner_table[i][j+1][1] = 1
			
 
				+                    if row_n2_col and row_n2_col[0] in third_tenderer and n1_row_n2_col and n1_row_n2_col[1] == 0:
			
 
				                         inner_table[i][j+2][1] = 1
			
 
				-                        break
			
 
				+
			
 
				+        if show:
			
 
				+            print('inner_table[i]5', inner_table[row_no])
			
 
				+
			
 
				+        # 修复表头关键词未作为表头
			
 
				+        # 文本匹配关键词，直接作为表头
			
 
				+        head_keyword = ['供应商', '总价']
			
 
				+        # 末尾匹配关键词且前一列为表头且与前一列文本不同，直接不做表头
			
 
				+        head_keyword2 = ['管理中心', '有限公司', '项目采购', ]
			
 
				+        # 开头匹配关键词，直接不做表头
			
 
				+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
			
 
				+        # 文本匹配关键词且前一列为表头，直接作为表头
			
 
				+        head_keyword4 = ['综合排名']
			
 
				+        # 文本在关键词中，直接不做表头
			
 
				+        head_keyword5 = ['殡葬用地']
			
 
				+
			
 
				+        # n1 next one, n2 next two, l1 last one, l2 last two
			
 
				+        for i in range(len(inner_table)):
			
 
				+            row = inner_table[i]
			
 
				+            for j in range(len(row)):
			
 
				+                row_col = row[j]
			
 
				+                row_l1_col = None
			
 
				+                if j-1 > 0:
			
 
				+                    row_l1_col = row[j-1]
			
 
				+
			
 
				+                match = re.search('[\u4e00-\u9fa50-9a-zA-Z:：]+', row_col[0])
			
 
				+                if inner_table[i][j][1] == 0 and match and match.group() in head_keyword:
			
 
				+                    inner_table[i][j][1] = 1
			
 
				+                for key in head_keyword2:
			
 
				+                    match = re.search(key+'$', row_col[0])
			
 
				+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and row_l1_col[0] != row_col[0] and match and row_col[1] == 1:
			
 
				+                        inner_table[i][j][1] = 0
			
 
				+                for key in head_keyword3:
			
 
				+                    match = re.search('^'+key, row_col[0])
			
 
				+                    if match and row_col[1] == 1:
			
 
				+                        inner_table[i][j][1] = 0
			
 
				+                for key in head_keyword4:
			
 
				+                    match = re.search(key, row_col[0])
			
 
				+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and match and row_col[1] == 0:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+                if row_col[0] in head_keyword5:
			
 
				+                    inner_table[i][j][1] = 0
			
 
				+
			
 
				+        if show:
			
 
				+            print('inner_table[i]6', inner_table[row_no])
			
 
				 
			
 
				         # 修复姓名被作为表头 # 2023-02-10 取消修复，避免项目名称、编号，单位、单价等作为了非表头
			
 
				         # surname = [
			
@@ -358,6 +587,7 @@ def tableToText(soup):
 
				         #                 and (inner_table[i][j][0][0] in surname or inner_table[i][j][0][:2] in surname) \
			
 
				         #                 and re.search("[^\u4e00-\u9fa5]", inner_table[i][j][0]) is None:
			
 
				         #             inner_table[i][j][1] = 0
			
 
				+
			
 
				         return inner_table
			
 
				 
			
 
				     def sliceTable(inner_table,fix_value="~~"):
			
@@ -456,7 +686,7 @@ def tableToText(soup):
 
				         
			
 
				         return inner_table,head_list
			
 
				 
			
 
				-    def set_head_model(inner_table):
			
 
				+    def set_head_model(inner_table, show=0):
			
 
				         origin_inner_table = copy.deepcopy(inner_table)
			
 
				         for i in range(len(inner_table)):
			
 
				             for j in range(len(inner_table[i])):
			
@@ -469,16 +699,30 @@ def tableToText(soup):
 
				         # 模型预测表头
			
 
				         predict_list = predict(inner_table)
			
 
				 
			
 
				+        start_time = time.time()
			
 
				+        predict_list = predict(inner_table)
			
 
				+        print('table head predict cost: ', time.time()-start_time)
			
 
				+
			
 
				         # 组合结果
			
 
				         for i in range(len(inner_table)):
			
 
				             for j in range(len(inner_table[i])):
			
 
				                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
			
 
				 
			
 
				-        # print("table_head before repair", inner_table)
			
 
				+        if show:
			
 
				+            print("table_head before repair")
			
 
				+            for r in inner_table:
			
 
				+                print('row', r)
			
 
				+            print("="*80)
			
 
				 
			
 
				         # 表头修正
			
 
				-        repairTable(inner_table)
			
 
				-        inner_table = repair_table2(inner_table)
			
 
				+        # repairTable(inner_table)
			
 
				+        inner_table = table_head_repair_process(inner_table, docid)
			
 
				+
			
 
				+        if show:
			
 
				+            print("="*80)
			
 
				+            print("table_head after repair")
			
 
				+            for r in inner_table:
			
 
				+                print('row', r)
			
 
				 
			
 
				         # 按表头分割表格
			
 
				         head_list = sliceTable(inner_table)
			
@@ -1139,6 +1383,7 @@ def tableToText(soup):
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				     #逆序处理嵌套表格
			
 
				+    # print('len(tbodies)1', len(tbodies))
			
 
				     for tbody_index in range(1,len(tbodies)+1):
			
 
				         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
@@ -1155,6 +1400,7 @@ def tableToText(soup):
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				     #逆序处理嵌套表格
			
 
				+    # print('len(tbodies)2', len(tbodies))
			
 
				     for tbody_index in range(1,len(tbodies)+1):
			
 
				         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
@@ -1163,6 +1409,284 @@ def tableToText(soup):
 
				     return soup
			
 
				     # return list_innerTable
			
 
				 
			
 
				+
			
 
				+def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0):
			
 
				+    def pre_process(inner_table):
			
 
				+        """
			
 
				+        修复前的预处理
			
 
				+        """
			
 
				+        # 循环处理单元格，一次获取需要的
			
 
				+        for i in range(len(inner_table)):
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                # 删除前后逗号
			
 
				+                inner_table[i][j][0] = re.sub('^[，,]+', '', inner_table[i][j][0])
			
 
				+                inner_table[i][j][0] = re.sub('[，,]+$', '', inner_table[i][j][0])
			
 
				+        return inner_table
			
 
				+
			
 
				+    def repair_by_colon(inner_table):
			
 
				+        """
			
 
				+        根据冒号修复当前格子的表头值
			
 
				+        """
			
 
				+
			
 
				+        # 修复冒号在文本中间的，不能作为表头；(冒号后面需多个字)
			
 
				+        # 冒号在括号中的除外
			
 
				+        # 冒号在最后的，判断后一个格子是否有重复的文字
			
 
				+        for i in range(len(inner_table)):
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                _text = inner_table[i][j][0]
			
 
				+                if len(_text) >= 3 and inner_table[i][j][1] == 1:
			
 
				+                    match = re.search('[:：]', _text)
			
 
				+                    if match:
			
 
				+                        start_index, end_index = match.span()
			
 
				+                        if start_index == 0:
			
 
				+                            continue
			
 
				+                        if end_index == len(_text):
			
 
				+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
			
 
				+                                inner_table[i][j][1] = 0
			
 
				+                                inner_table[i][j+1][1] = 0
			
 
				+                            else:
			
 
				+                                continue
			
 
				+                        if re.search('[(（]', _text[:start_index]) and re.search('[)）]', _text[end_index:]):
			
 
				+                            continue
			
 
				+
			
 
				+                        m1 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[:start_index])
			
 
				+                        m2 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[end_index:])
			
 
				+                        if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
			
 
				+                            inner_table[i][j][1] = 0
			
 
				+
			
 
				+        return inner_table
			
 
				+
			
 
				+    def repair_by_duplicate(inner_table):
			
 
				+        """
			
 
				+        根据列重复修复当前格子的表头值
			
 
				+        """
			
 
				+        # 多个重复列的预测值不同，以第一个为准
			
 
				+        for i in range(len(inner_table)):
			
 
				+            col = inner_table[i][0]
			
 
				+            for j in range(1, len(inner_table[i])):
			
 
				+                if inner_table[i][j][0] == col[0]:
			
 
				+                    if inner_table[i][j][1] != col[1]:
			
 
				+                        if col != inner_table[i][0]:
			
 
				+                            inner_table[i][j][1] = col[1]
			
 
				+                        else:
			
 
				+                            inner_table[i][0][1] = inner_table[i][j][1]
			
 
				+                            col = inner_table[i][0]
			
 
				+                else:
			
 
				+                    col = inner_table[i][j]
			
 
				+        return inner_table
			
 
				+
			
 
				+    def repair_by_around(inner_table):
			
 
				+        """
			
 
				+        根据周围的表头值修复当前格子的表头值
			
 
				+        """
			
 
				+        one_head_index_list = []
			
 
				+        zero_head_index_list = []
			
 
				+        all_head_index_list = []
			
 
				+        for i in range(len(inner_table)):
			
 
				+            head_cnt = 0
			
 
				+            head_index = None
			
 
				+            head_dict = {}
			
 
				+            for j in range(len(inner_table[i])):
			
 
				+                # 统计表头数
			
 
				+                if inner_table[i][j][1] == 1:
			
 
				+                    head_cnt += 1
			
 
				+                    head_index = j
			
 
				+                if inner_table[i][j][0] not in ['~~', '', ' ']:
			
 
				+                    if inner_table[i][j][0] in head_dict.keys():
			
 
				+                        head_dict[inner_table[i][j][0]] += 1
			
 
				+                    else:
			
 
				+                        head_dict[inner_table[i][j][0]] = 1
			
 
				+            # 表头数list
			
 
				+            if head_cnt == 0:
			
 
				+                zero_head_index_list.append(i)
			
 
				+            elif head_cnt == 1:
			
 
				+                # 这个单个表头需满足前面有非表头
			
 
				+                find_flag = 0
			
 
				+                for k in range(head_index):
			
 
				+                    if inner_table[i][k][1] == 0:
			
 
				+                        find_flag = 1
			
 
				+                if find_flag and len(head_dict.keys()) > 2:
			
 
				+                    one_head_index_list.append(i)
			
 
				+            elif head_cnt == len(inner_table[i]):
			
 
				+                all_head_index_list.append(i)
			
 
				+
			
 
				+        # 一整个大表格，第一行为表头，下面行中有个别格子被识别为表头
			
 
				+        # 候选人后面修复
			
 
				+        for index in one_head_index_list:
			
 
				+            if (index - 1 in zero_head_index_list and index - 2 in zero_head_index_list) \
			
 
				+                    or (index - 1 in zero_head_index_list and index - 2 in all_head_index_list) \
			
 
				+                    or (index - 1 in all_head_index_list):
			
 
				+                for j in range(len(inner_table[index])):
			
 
				+                    inner_table[index][j][1] = 0
			
 
				+                zero_head_index_list.append(index)
			
 
				+        return inner_table
			
 
				+
			
 
				+    def repair_by_tenderer(inner_table):
			
 
				+        """
			
 
				+        根据第一第二第三候选人修复当前格子的表头值
			
 
				+        """
			
 
				+        # 修复第一第二第三中标候选人作为表头
			
 
				+        first_tenderer = ['第一中标候选人', '第一中标人', '第一中标（成交）人', '第一候选人']
			
 
				+        second_tenderer = ['第二中标候选人', '第二中标（成交）候选人', '第二候选人']
			
 
				+        third_tenderer = ['第三中标候选人', '第三中标（成交）候选人', '第三候选人']
			
 
				+        # n1 next one, n2 next two, l1 last one, l2 last two
			
 
				+        for i in range(len(inner_table)):
			
 
				+            row = inner_table[i]
			
 
				+            n1_row, n2_row = None, None
			
 
				+            if i+1 < len(inner_table):
			
 
				+                n1_row = inner_table[i+1]
			
 
				+            if i+2 < len(inner_table):
			
 
				+                n2_row = inner_table[i+2]
			
 
				+            for j in range(len(row)):
			
 
				+                row_col = row[j]
			
 
				+                n1_row_col, n2_row_col = None, None
			
 
				+                row_n1_col, row_n2_col = None, None
			
 
				+                n1_row_n1_col, n2_row_n1_col, n1_row_n2_col = None, None, None
			
 
				+                if n1_row:
			
 
				+                    n1_row_col = n1_row[j]
			
 
				+                if n2_row:
			
 
				+                    n2_row_col = n2_row[j]
			
 
				+                if j+1 < len(row):
			
 
				+                    row_n1_col = row[j+1]
			
 
				+                if j+2 < len(row):
			
 
				+                    row_n2_col = row[j+2]
			
 
				+                if n1_row and j+1 < len(n1_row):
			
 
				+                    n1_row_n1_col = n1_row[j+1]
			
 
				+                if n2_row and j+1 < len(n2_row):
			
 
				+                    n2_row_n1_col = n2_row[j+1]
			
 
				+                if n1_row and j+2 < len(n1_row):
			
 
				+                    n1_row_n2_col = n1_row[j+2]
			
 
				+
			
 
				+                # 连续作为行表头
			
 
				+                if row_col[0] in first_tenderer and row_n1_col and row_n1_col[1] == 0:
			
 
				+                    if n1_row_col and n1_row_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+                        inner_table[i+1][j][1] = 1
			
 
				+                    if n2_row_col and n2_row_col[0] in third_tenderer and n2_row_n1_col and n2_row_n1_col[1] == 0:
			
 
				+                        inner_table[i+2][j][1] = 1
			
 
				+
			
 
				+                # 连续作为列表头
			
 
				+                if row_col[0] in first_tenderer and n1_row_col and n1_row_col[1] == 0:
			
 
				+                    if row_n1_col and row_n1_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+                        inner_table[i][j+1][1] = 1
			
 
				+                    if row_n2_col and row_n2_col[0] in third_tenderer and n1_row_n2_col and n1_row_n2_col[1] == 0:
			
 
				+                        inner_table[i][j+2][1] = 1
			
 
				+
			
 
				+        return inner_table
			
 
				+
			
 
				+    def repair_by_keywords(inner_table):
			
 
				+        """
			
 
				+        根据关键词修复当前格子的表头值
			
 
				+        """
			
 
				+        # 修复表头关键词未作为表头
			
 
				+        # 末尾匹配匹配关键词，直接作为表头
			
 
				+        head_keyword = ['供应商', '总价', '总价（元）', '总价\(元\)', '品目一', '品目二', '品目三']
			
 
				+        # 末尾匹配关键词且前一列为表头且与前一列文本不同，直接不做表头
			
 
				+        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
			
 
				+        # 开头匹配关键词，直接不做表头
			
 
				+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
			
 
				+        # 文本匹配关键词且前一列为表头，直接作为表头
			
 
				+        head_keyword4 = ['综合排名']
			
 
				+        # 文本在关键词中，直接不做表头
			
 
				+        head_keyword5 = ['殡葬用地']
			
 
				+        # 文本匹配关键词，直接不作表头
			
 
				+        head_keyword6 = ['市场行情', '有限公司']
			
 
				+
			
 
				+        # n1 next one, n2 next two, l1 last one, l2 last two
			
 
				+        for i in range(len(inner_table)):
			
 
				+            row = inner_table[i]
			
 
				+            for j in range(len(row)):
			
 
				+                row_col = row[j]
			
 
				+                row_l1_col = None
			
 
				+                if j-1 >= 0:
			
 
				+                    row_l1_col = row[j-1]
			
 
				+
			
 
				+                # match = re.search('[\u4e00-\u9fa50-9a-zA-Z:：]+', row_col[0])
			
 
				+                # if inner_table[i][j][1] == 0 and match and match.group() in head_keyword:
			
 
				+                #     inner_table[i][j][1] = 1
			
 
				+                for key in head_keyword:
			
 
				+                    match = re.search(key+'$', row_col[0])
			
 
				+                    if match:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+                for key in head_keyword2:
			
 
				+                    match = re.search(key+'$', row_col[0])
			
 
				+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and row_l1_col[0] != row_col[0] and match and row_col[1] == 1:
			
 
				+                        inner_table[i][j][1] = 0
			
 
				+                for key in head_keyword3:
			
 
				+                    match = re.search('^'+key, row_col[0])
			
 
				+                    if match and row_col[1] == 1:
			
 
				+                        inner_table[i][j][1] = 0
			
 
				+                for key in head_keyword4:
			
 
				+                    match = re.search(key, row_col[0])
			
 
				+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and match and row_col[1] == 0:
			
 
				+                        inner_table[i][j][1] = 1
			
 
				+                if row_col[0] in head_keyword5:
			
 
				+                    inner_table[i][j][1] = 0
			
 
				+                for key in head_keyword6:
			
 
				+                    match = re.search(key, row_col[0])
			
 
				+                    if match:
			
 
				+                        inner_table[i][j][1] = 0
			
 
				+
			
 
				+        return inner_table
			
 
				+
			
 
				+    _inner_table = pre_process(_inner_table)
			
 
				+    compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+    if show:
			
 
				+        print('table_head_repair_process1', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_colon(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair2 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process2', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_keywords(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair3 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process3', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_tenderer(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair4 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process4', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_duplicate(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair5 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process5', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_around(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair6 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process6', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_tenderer(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair7 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process7', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    _inner_table = repair_by_keywords(_inner_table)
			
 
				+    if _inner_table != compare_inner_table:
			
 
				+        compare_inner_table = copy.deepcopy(_inner_table)
			
 
				+        log('table_head repair8 ' + str(docid))
			
 
				+    if show:
			
 
				+        print('table_head_repair_process8', show_row_index, _inner_table[show_row_index])
			
 
				+
			
 
				+    return _inner_table
			
 
				+
			
 
				+
			
 
				 re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
			
 
				 num_dict = {
			
 
				     "一": 1, "二": 2,
			
@@ -2191,7 +2715,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				 
			
 
				         article_processed = get_preprocessed_outline(article_processed)
			
 
				         # print('article_processed')
			
 
				-        article_processed = tableToText(article_processed)
			
 
				+        article_processed = tableToText(article_processed, doc_id)
			
 
				         article_processed = segment(article_processed)
			
 
				 
			
 
				         article_processed = article_processed.replace('(', '（').replace(')', '）')  #2022/8/10 统一为中文括号