fangjiasheng 1 жил өмнө
parent
commit
fe9e673dff

+ 547 - 23
BiddingKG/dl/interface/Preprocessing.py

@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
 
 
 #
-def tableToText(soup):
+def tableToText(soup, docid=None):
     '''
     @param:
         soup:网页html的soup
@@ -181,7 +181,7 @@ def tableToText(soup):
             count_1 = 0
             count_0 = 0
             for i in range(len(line)):
-                if line[i][0]==fix_value:
+                if line[i][0] == fix_value:
                     continue
                 if line[i][1]==1:
                     if first_1==-1:
@@ -211,12 +211,12 @@ def tableToText(soup):
             """
             @summary: 计算每个节点受到的挤压度来判断是否需要染色
             """
-            #print("B",inner_table[index])
+            # print("B",inner_table[index])
             min_presure = 3
             list_dye = []
             first = None
             count = 0
-            temp_set = set()
+            temp_set = set(['~~'])
             _index = 0
             for item in inner_table[index]:
                 if first is None:
@@ -272,7 +272,7 @@ def tableToText(soup):
                             dye_set.add((inner_table[index][h][0],dye_type))
                             key_set.add(inner_table[index][h][0])
                     begin = end
-                #print("E",inner_table[index])
+                # print("E",inner_table[index])
 
         def otherrepair(inner_table,index,dye_set,key_set):
             list_provide_repair = []
@@ -327,25 +327,254 @@ def tableToText(soup):
             return
         repairTable(inner_table, dye_set, key_set)
 
-    def repair_table2(inner_table):
+    def repair_table2(inner_table, show=0, row_no=0):
         """
         @summary: 修复表头识别,将明显错误的进行修正
         """
-        # 修复第一第二第三中标候选人作为列表头
-        if len(inner_table) >= 2 and len(inner_table[0]) >= 3:
-            for i in range(len(inner_table[:3])):
-                for j in range(len(inner_table[i])-2):
-                    if inner_table[i][j][0] == '第一中标候选人' \
-                            and inner_table[i][j+1][0] == '第二中标候选人' \
-                            and inner_table[i][j+2][0] == '第三中标候选人' \
-                            and i+1 < len(inner_table) \
-                            and inner_table[i+1][j][1] == 0 \
-                            and inner_table[i+1][j+1][1] == 0 \
-                            and inner_table[i+1][j+2][1] == 0:
+
+        # 循环处理单元格,一次获取需要的
+        one_head_index_list = []
+        zero_head_index_list = []
+        all_head_index_list = []
+        for i in range(len(inner_table)):
+            head_cnt = 0
+            for j in range(len(inner_table[i])):
+                # 删除前后逗号
+                inner_table[i][j][0] = re.sub('^[,,]+', '', inner_table[i][j][0])
+                inner_table[i][j][0] = re.sub('[,,]+$', '', inner_table[i][j][0])
+
+                # 统计表头数
+                if inner_table[i][j][1] == 1:
+                    head_cnt += 1
+
+            # 表头数list
+            if head_cnt == 0:
+                zero_head_index_list.append(i)
+            elif head_cnt == 1:
+                one_head_index_list.append(i)
+            elif head_cnt == len(inner_table[i]):
+                all_head_index_list.append(i)
+
+        # 修复冒号在文本中间的,不能作为表头;(冒号后面需多个字)
+        # 冒号在括号中的除外
+        # 冒号在最后的,判断后一个格子是否有重复的文字
+        for i in range(len(inner_table)):
+            for j in range(len(inner_table[i])):
+                _text = inner_table[i][j][0]
+                if len(_text) >= 3 and inner_table[i][j][1] == 1:
+                    match = re.search('[::]', _text)
+                    if match:
+                        start_index, end_index = match.span()
+                        if start_index == 0:
+                            continue
+                        if end_index == len(_text):
+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
+                                inner_table[i][j][1] = 0
+                                inner_table[i][j+1][1] = 0
+                            else:
+                                continue
+                        if re.search('[((]', _text[:start_index]) and re.search('[))]', _text[end_index:]):
+                            continue
+
+                        m1 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[:start_index])
+                        m2 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[end_index:])
+                        if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
+                            inner_table[i][j][1] = 0
+
+        if show:
+            print('inner_table[i]1', inner_table[row_no])
+
+        # 修复实际只有几列,但有一列由于重复占了太多行表头识别错误
+        # for i in range(len(inner_table)):
+        #     head_flag_dict = {}
+        #     for j in range(len(inner_table[i])):
+        #         if inner_table[i][j][0] in head_flag_dict.keys():
+        #             head_flag_dict[inner_table[i][j][0]] += [inner_table[i][j][1]]
+        #         else:
+        #             head_flag_dict[inner_table[i][j][0]] = [inner_table[i][j][1]]
+        #
+        #     if len(head_flag_dict.keys()) == 2:
+        #         col_flag = None
+        #         col_value = None
+        #         for key in head_flag_dict.keys():
+        #             flag_list = head_flag_dict[key]
+        #             if len(flag_list) >= 4 and len(set(flag_list)) == 2 and len(set(flag_list[1:])) == 1:
+        #                 col_flag = flag_list[0]
+        #                 col_value = key
+        #                 break
+        #
+        #         if col_flag is not None:
+        #             for j in range(len(inner_table[i])):
+        #                 if inner_table[i][j][0] == col_value:
+        #                     inner_table[i][j][1] = col_flag
+
+        # 多个重复列的预测值不同,以第一个为准
+        for i in range(len(inner_table)):
+            col = inner_table[i][0]
+            for j in range(len(inner_table[i])):
+                if inner_table[i][j][0] == col[0]:
+                    if inner_table[i][j][1] != col[1]:
+                        inner_table[i][j][1] = col[1]
+                else:
+                    col = inner_table[i][j]
+
+        if show:
+            print('inner_table[i]2', inner_table[row_no])
+
+        # 修复多个重复的单元格表头不一致
+        # for i in range(len(inner_table)):
+        #     for j in range(len(inner_table[i])-1):
+        #         only_chinese1 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j][0]))
+        #         only_chinese2 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j+1][0]))
+        #         if only_chinese1 == only_chinese2 and inner_table[i][j][1] != inner_table[i][j+1][1]:
+        #             inner_table[i][j][1] = 1
+        #             inner_table[i][j+1][1] = 1
+
+        # if show:
+        #     print('inner_table[i]3', inner_table[row_no])
+
+        # # 修复一行几乎都是表头,个别不是;或者一行几乎都是非表头,个别是
+        # for i in range(len(inner_table)):
+        #     head_dict = {}
+        #     not_head_dict = {}
+        #     for j in range(len(inner_table[i])):
+        #         if inner_table[i][j][1] == 1:
+        #             if inner_table[i][j][0] not in head_dict:
+        #                 head_dict[inner_table[i][j][0]] = 1
+        #         else:
+        #             if inner_table[i][j][0] not in not_head_dict:
+        #                 not_head_dict[inner_table[i][j][0]] = 1
+        #
+        #     # 非表头:表头 <= 1:3
+        #     # if len(head_dict.keys()) > 0 and len(not_head_dict.keys()) / len(head_dict.keys()) <= 1/3 and len(head_dict.keys()) >= 3:
+        #     #     for j in range(len(inner_table[i])):
+        #     #         if len(re.sub(' ', '', inner_table[i][j][0])) > 0:
+        #     #             inner_table[i][j][1] = 1
+        #
+        #     # 表头数一个且非表头数大于2且上一行都是表头
+        #     if i > 0 and len(head_dict.keys()) == 1 and len(not_head_dict.keys()) >= 2 and inner_table[i][0][1] == 0:
+        #         last_row = inner_table[i-1]
+        #         col_list = []
+        #         for j in range(len(last_row)):
+        #             if len(re.sub(' ', '', last_row[j][0])) > 0:
+        #                 if last_row[j][1] == 0:
+        #                     col_list = []
+        #                     break
+        #                 col_list.append(last_row[j][0])
+        #         if col_list:
+        #             col_list = list(set(col_list))
+        #             if len(col_list) > 2:
+        #                 for j in range(len(inner_table[i])):
+        #                     if inner_table[i][j][1] == 1:
+        #                         inner_table[i][j][1] = 0
+
+        # 一整个大表格,第一行为表头,下面行中有个别格子被识别为表头
+        # 候选人后面修复
+        for index in one_head_index_list:
+            if (index - 1 in zero_head_index_list and index - 2 in zero_head_index_list) \
+                    or (index - 1 in zero_head_index_list and index - 2 in all_head_index_list) \
+                    or (index - 1 in all_head_index_list):
+                for j in range(len(inner_table[index])):
+                    inner_table[index][j][1] = 0
+                zero_head_index_list.append(index)
+
+
+
+        if show:
+            print('inner_table[i]4', inner_table[row_no])
+
+        # 修复第一第二第三中标候选人作为表头
+        first_tenderer = ['第一中标候选人', '第一中标人', '第一中标(成交)人', '第一候选人']
+        second_tenderer = ['第二中标候选人', '第二中标(成交)候选人', '第二候选人']
+        third_tenderer = ['第三中标候选人', '第三中标(成交)候选人', '第三候选人']
+        # n1 next one, n2 next two, l1 last one, l2 last two
+        for i in range(len(inner_table)):
+            row = inner_table[i]
+            n1_row, n2_row = None, None
+            if i+1 < len(inner_table):
+                n1_row = inner_table[i+1]
+            if i+2 < len(inner_table):
+                n2_row = inner_table[i+2]
+            for j in range(len(row)):
+                row_col = row[j]
+                n1_row_col, n2_row_col = None, None
+                row_n1_col, row_n2_col = None, None
+                n1_row_n1_col, n2_row_n1_col, n1_row_n2_col = None, None, None
+                if n1_row:
+                    n1_row_col = n1_row[j]
+                if n2_row:
+                    n2_row_col = n2_row[j]
+                if j+1 < len(row):
+                    row_n1_col = row[j+1]
+                if j+2 < len(row):
+                    row_n2_col = row[j+2]
+                if n1_row and j+1 < len(n1_row):
+                    n1_row_n1_col = n1_row[j+1]
+                if n2_row and j+1 < len(n2_row):
+                    n2_row_n1_col = n2_row[j+1]
+                if n1_row and j+2 < len(n1_row):
+                    n1_row_n2_col = n1_row[j+2]
+
+                # 连续作为行表头
+                if row_col[0] in first_tenderer and row_n1_col and row_n1_col[1] == 0:
+                    if n1_row_col and n1_row_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
+                        inner_table[i][j][1] = 1
+                        inner_table[i+1][j][1] = 1
+                    if n2_row_col and n2_row_col[0] in third_tenderer and n2_row_n1_col and n2_row_n1_col[1] == 0:
+                        inner_table[i+2][j][1] = 1
+
+                # 连续作为列表头
+                if row_col[0] in first_tenderer and n1_row_col and n1_row_col[1] == 0:
+                    if row_n1_col and row_n1_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
                         inner_table[i][j][1] = 1
                         inner_table[i][j+1][1] = 1
+                    if row_n2_col and row_n2_col[0] in third_tenderer and n1_row_n2_col and n1_row_n2_col[1] == 0:
                         inner_table[i][j+2][1] = 1
-                        break
+
+        if show:
+            print('inner_table[i]5', inner_table[row_no])
+
+        # 修复表头关键词未作为表头
+        # 文本匹配关键词,直接作为表头
+        head_keyword = ['供应商', '总价']
+        # 末尾匹配关键词且前一列为表头且与前一列文本不同,直接不做表头
+        head_keyword2 = ['管理中心', '有限公司', '项目采购', ]
+        # 开头匹配关键词,直接不做表头
+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
+        # 文本匹配关键词且前一列为表头,直接作为表头
+        head_keyword4 = ['综合排名']
+        # 文本在关键词中,直接不做表头
+        head_keyword5 = ['殡葬用地']
+
+        # n1 next one, n2 next two, l1 last one, l2 last two
+        for i in range(len(inner_table)):
+            row = inner_table[i]
+            for j in range(len(row)):
+                row_col = row[j]
+                row_l1_col = None
+                if j-1 > 0:
+                    row_l1_col = row[j-1]
+
+                match = re.search('[\u4e00-\u9fa50-9a-zA-Z::]+', row_col[0])
+                if inner_table[i][j][1] == 0 and match and match.group() in head_keyword:
+                    inner_table[i][j][1] = 1
+                for key in head_keyword2:
+                    match = re.search(key+'$', row_col[0])
+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and row_l1_col[0] != row_col[0] and match and row_col[1] == 1:
+                        inner_table[i][j][1] = 0
+                for key in head_keyword3:
+                    match = re.search('^'+key, row_col[0])
+                    if match and row_col[1] == 1:
+                        inner_table[i][j][1] = 0
+                for key in head_keyword4:
+                    match = re.search(key, row_col[0])
+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and match and row_col[1] == 0:
+                        inner_table[i][j][1] = 1
+                if row_col[0] in head_keyword5:
+                    inner_table[i][j][1] = 0
+
+        if show:
+            print('inner_table[i]6', inner_table[row_no])
 
         # 修复姓名被作为表头 # 2023-02-10 取消修复,避免项目名称、编号,单位、单价等作为了非表头
         # surname = [
@@ -358,6 +587,7 @@ def tableToText(soup):
         #                 and (inner_table[i][j][0][0] in surname or inner_table[i][j][0][:2] in surname) \
         #                 and re.search("[^\u4e00-\u9fa5]", inner_table[i][j][0]) is None:
         #             inner_table[i][j][1] = 0
+
         return inner_table
 
     def sliceTable(inner_table,fix_value="~~"):
@@ -456,7 +686,7 @@ def tableToText(soup):
         
         return inner_table,head_list
 
-    def set_head_model(inner_table):
+    def set_head_model(inner_table, show=0):
         origin_inner_table = copy.deepcopy(inner_table)
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
@@ -469,16 +699,30 @@ def tableToText(soup):
         # 模型预测表头
         predict_list = predict(inner_table)
 
+        start_time = time.time()
+        predict_list = predict(inner_table)
+        print('table head predict cost: ', time.time()-start_time)
+
         # 组合结果
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
 
-        # print("table_head before repair", inner_table)
+        if show:
+            print("table_head before repair")
+            for r in inner_table:
+                print('row', r)
+            print("="*80)
 
         # 表头修正
-        repairTable(inner_table)
-        inner_table = repair_table2(inner_table)
+        # repairTable(inner_table)
+        inner_table = table_head_repair_process(inner_table, docid)
+
+        if show:
+            print("="*80)
+            print("table_head after repair")
+            for r in inner_table:
+                print('row', r)
 
         # 按表头分割表格
         head_list = sliceTable(inner_table)
@@ -1139,6 +1383,7 @@ def tableToText(soup):
             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
                 in_attachment = True
     #逆序处理嵌套表格
+    # print('len(tbodies)1', len(tbodies))
     for tbody_index in range(1,len(tbodies)+1):
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
         inner_table = trunTable(tbody,_in_attachment)
@@ -1155,6 +1400,7 @@ def tableToText(soup):
             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
                 in_attachment = True
     #逆序处理嵌套表格
+    # print('len(tbodies)2', len(tbodies))
     for tbody_index in range(1,len(tbodies)+1):
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
         inner_table = trunTable(tbody,_in_attachment)
@@ -1163,6 +1409,284 @@ def tableToText(soup):
     return soup
     # return list_innerTable
 
+
+def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0):
+    """
+    Repair the predicted header flags of a table with a fixed chain of rules.
+
+    The table is a list of rows, each row a list of cells, each cell a
+    mutable ``[text, head_flag]`` pair (``head_flag`` is 1 for a header
+    cell, 0 otherwise).  Cells are modified in place.
+
+    @param:
+        _inner_table: table to repair (mutated in place)
+        docid: document id, only used in the log messages
+        show: truthy to print one row after every step (debugging aid)
+        show_row_index: index of the row printed when ``show`` is truthy
+    @return: the repaired table (the same object as ``_inner_table``)
+    """
+
+    def pre_process(inner_table):
+        """Strip leading and trailing commas from every cell text."""
+        for row in inner_table:
+            for cell in row:
+                cell[0] = re.sub('^[,,]+', '', cell[0])
+                cell[0] = re.sub('[,,]+$', '', cell[0])
+        return inner_table
+
+    def repair_by_colon(inner_table):
+        """
+        Clear the header flag of cells that look like "key: value" text.
+
+        Only header cells of at least 3 characters are inspected, and only
+        the first colon of the text is considered:
+          * a colon at position 0 is ignored;
+          * a colon at the very end only matters in a two-cell row whose
+            second cell contains, or is contained in, the first cell: both
+            cells then become non-headers;
+          * a colon with an opening bracket before it and a closing bracket
+            after it is ignored;
+          * otherwise, when there is a letter/digit/CJK character before the
+            colon and the first such run after the colon is at least two
+            characters long or is exactly '是'/'否', the cell becomes a
+            non-header.
+        """
+        word_char = '[\u4e00-\u9fa50-9a-zA-Z]'
+        for row in inner_table:
+            for j, cell in enumerate(row):
+                text = cell[0]
+                if len(text) < 3 or cell[1] != 1:
+                    continue
+                match = re.search('[::]', text)
+                if not match:
+                    continue
+                start_index, end_index = match.span()
+                if start_index == 0:
+                    continue
+                if end_index == len(text):
+                    # Trailing colon: nothing follows it, so the remaining
+                    # rules can never apply; handle the two-cell case and stop.
+                    if len(row) == 2 and j == 0 and (text in row[1][0] or row[1][0] in text):
+                        cell[1] = 0
+                        row[1][1] = 0
+                    continue
+                if re.search('[((]', text[:start_index]) and re.search('[))]', text[end_index:]):
+                    continue
+
+                m1 = re.search(word_char, text[:start_index])
+                # Match the whole run after the colon: a single-character
+                # pattern would make the length check below unreachable.
+                m2 = re.search(word_char + '+', text[end_index:])
+                if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
+                    cell[1] = 0
+        return inner_table
+
+    def repair_by_duplicate(inner_table):
+        """
+        Make runs of adjacent cells with identical text share one flag.
+
+        Inside a run that starts after the first column, later cells take
+        the flag of the run's first cell.  Inside the run that starts at the
+        first column, the first cell takes the flag of the differing cell
+        instead.
+        """
+        for row in inner_table:
+            if not row:
+                continue
+            # Track the run start by position so that a later cell which
+            # merely equals the first cell is not mistaken for it.
+            run_start = 0
+            for j in range(1, len(row)):
+                if row[j][0] != row[run_start][0]:
+                    run_start = j
+                    continue
+                if row[j][1] == row[run_start][1]:
+                    continue
+                if run_start != 0:
+                    row[j][1] = row[run_start][1]
+                else:
+                    row[0][1] = row[j][1]
+        return inner_table
+
+    def repair_by_around(inner_table):
+        """
+        Clear a lone header flag in a row that sits below plain/header rows.
+
+        A row qualifies when it has exactly one header cell, at least one
+        non-header cell before it, and more than two distinct non-blank
+        texts.  Its flags are all cleared when the previous row is all
+        headers, or when the previous row has no header and the row before
+        that has no header or is all headers.
+        """
+        one_head_rows = []
+        zero_head_rows = set()
+        all_head_rows = set()
+        for i, row in enumerate(inner_table):
+            head_positions = [j for j, cell in enumerate(row) if cell[1] == 1]
+            distinct_texts = {cell[0] for cell in row if cell[0] not in ['~~', '', ' ']}
+            if not head_positions:
+                zero_head_rows.add(i)
+            elif len(head_positions) == 1:
+                plain_before = any(cell[1] == 0 for cell in row[:head_positions[0]])
+                if plain_before and len(distinct_texts) > 2:
+                    one_head_rows.append(i)
+            elif len(head_positions) == len(row):
+                all_head_rows.add(i)
+
+        for index in one_head_rows:
+            prev_zero = index - 1 in zero_head_rows
+            prev_all = index - 1 in all_head_rows
+            prev2_ok = index - 2 in zero_head_rows or index - 2 in all_head_rows
+            if (prev_zero and prev2_ok) or prev_all:
+                for cell in inner_table[index]:
+                    cell[1] = 0
+                # A cleared row counts as header-free for the rows below it.
+                zero_head_rows.add(index)
+        return inner_table
+
+    def repair_by_tenderer(inner_table):
+        """
+        Flag first/second/third candidate labels as headers.
+
+        Starting from a cell whose text is a "first candidate" label, the
+        labels are recognised when they are stacked vertically (each with a
+        non-header cell to its right) or laid out horizontally (each with a
+        non-header cell below it).
+        """
+        first_tenderer = ['第一中标候选人', '第一中标人', '第一中标(成交)人', '第一候选人']
+        second_tenderer = ['第二中标候选人', '第二中标(成交)候选人', '第二候选人']
+        third_tenderer = ['第三中标候选人', '第三中标(成交)候选人', '第三候选人']
+
+        def cell_at(r, c):
+            # Bounds-checked lookup: rows may differ in length.
+            if r < len(inner_table) and c < len(inner_table[r]):
+                return inner_table[r][c]
+            return None
+
+        for i, row in enumerate(inner_table):
+            for j, cur in enumerate(row):
+                if cur[0] not in first_tenderer:
+                    continue
+                right1, right2 = cell_at(i, j+1), cell_at(i, j+2)
+                down1, down2 = cell_at(i+1, j), cell_at(i+2, j)
+                down1_right1 = cell_at(i+1, j+1)
+                down2_right1 = cell_at(i+2, j+1)
+                down1_right2 = cell_at(i+1, j+2)
+
+                # Labels stacked in one column (row headers).
+                if right1 and right1[1] == 0:
+                    if down1 and down1[0] in second_tenderer and down1_right1 and down1_right1[1] == 0:
+                        cur[1] = 1
+                        down1[1] = 1
+                    if down2 and down2[0] in third_tenderer and down2_right1 and down2_right1[1] == 0:
+                        down2[1] = 1
+
+                # Labels side by side in one row (column headers).
+                if down1 and down1[1] == 0:
+                    if right1 and right1[0] in second_tenderer and down1_right1 and down1_right1[1] == 0:
+                        cur[1] = 1
+                        right1[1] = 1
+                    if right2 and right2[0] in third_tenderer and down1_right2 and down1_right2[1] == 0:
+                        right2[1] = 1
+
+        return inner_table
+
+    def repair_by_keywords(inner_table):
+        """
+        Set or clear header flags from keyword lists.
+
+        The rules are applied to every cell in the order written below, so
+        a later rule can override an earlier one.
+        """
+        # Text ends with keyword: always a header.
+        head_keyword = ['供应商', '总价', '总价(元)', r'总价\(元\)', '品目一', '品目二', '品目三']
+        # Text ends with keyword, previous cell is a header with different
+        # text: not a header.
+        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
+        # Text starts with keyword: not a header.
+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
+        # Text contains keyword and previous cell is a header: a header.
+        head_keyword4 = ['综合排名']
+        # Text equals a keyword: not a header.
+        head_keyword5 = ['殡葬用地']
+        # Text contains keyword: not a header.
+        head_keyword6 = ['市场行情', '有限公司']
+
+        # Keywords are used as regular expressions; compile them once.
+        ends_head = [re.compile(key + '$') for key in head_keyword]
+        ends_plain = [re.compile(key + '$') for key in head_keyword2]
+        starts_plain = [re.compile('^' + key) for key in head_keyword3]
+        contains_head = [re.compile(key) for key in head_keyword4]
+        contains_plain = [re.compile(key) for key in head_keyword6]
+
+        def hit(patterns, text):
+            return any(pattern.search(text) for pattern in patterns)
+
+        for row in inner_table:
+            for j, cell in enumerate(row):
+                text = cell[0]
+                prev = row[j-1] if j > 0 else None
+                prev_is_head = bool(prev) and prev[1] == 1
+
+                if hit(ends_head, text):
+                    cell[1] = 1
+                if prev_is_head and cell[1] == 1 and prev[0] != text and hit(ends_plain, text):
+                    cell[1] = 0
+                if cell[1] == 1 and hit(starts_plain, text):
+                    cell[1] = 0
+                if prev_is_head and cell[1] == 0 and hit(contains_head, text):
+                    cell[1] = 1
+                if text in head_keyword5:
+                    cell[1] = 0
+                if hit(contains_plain, text):
+                    cell[1] = 0
+
+        return inner_table
+
+    def flag_snapshot(table):
+        # The repair steps only ever change flags, so comparing flags is
+        # enough to detect that a step modified the table.
+        return [[cell[1] for cell in row] for row in table]
+
+    def show_row(step_no):
+        # Debug print; skipped when the requested row does not exist.
+        if show and -len(_inner_table) <= show_row_index < len(_inner_table):
+            print('table_head_repair_process' + str(step_no), show_row_index, _inner_table[show_row_index])
+
+    # Order matters: the tenderer and keyword rules run a second time after
+    # the duplicate/around rules, which may have cleared their flags.
+    repair_steps = (
+        repair_by_colon,
+        repair_by_keywords,
+        repair_by_tenderer,
+        repair_by_duplicate,
+        repair_by_around,
+        repair_by_tenderer,
+        repair_by_keywords,
+    )
+
+    _inner_table = pre_process(_inner_table)
+    last_flags = flag_snapshot(_inner_table)
+    show_row(1)
+
+    # Steps are numbered from 2 so log lines keep their historical names.
+    for step_no, repair in enumerate(repair_steps, start=2):
+        _inner_table = repair(_inner_table)
+        current_flags = flag_snapshot(_inner_table)
+        if current_flags != last_flags:
+            last_flags = current_flags
+            log('table_head repair' + str(step_no) + ' ' + str(docid))
+        show_row(step_no)
+
+    return _inner_table
+
+
 re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
 num_dict = {
     "一": 1, "二": 2,
@@ -2191,7 +2715,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
         article_processed = get_preprocessed_outline(article_processed)
         # print('article_processed')
-        article_processed = tableToText(article_processed)
+        article_processed = tableToText(article_processed, doc_id)
         article_processed = segment(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号