1 жил өмнө · fe9e673dff
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
 
															 #
														
 
															-def tableToText(soup):
														
 
															+def tableToText(soup, docid=None):
														
 
															     '''
														
 
															     @param:
														
 
															         soup:网页html的soup
														
@@ -181,7 +181,7 @@ def tableToText(soup):
 
															             count_1 = 0
														
 
															             count_0 = 0
														
 
															             for i in range(len(line)):
														
 
															-                if line[i][0]==fix_value:
														
 
															+                if line[i][0] == fix_value:
														
 
															                     continue
														
 
															                 if line[i][1]==1:
														
 
															                     if first_1==-1:
														
@@ -211,12 +211,12 @@ def tableToText(soup):
 
															             """
														
 
															             @summary: 计算每个节点受到的挤压度来判断是否需要染色
														
 
															             """
														
 
															-            #print("B",inner_table[index])
														
 
															+            # print("B",inner_table[index])
														
 
															             min_presure = 3
														
 
															             list_dye = []
														
 
															             first = None
														
 
															             count = 0
														
 
															-            temp_set = set()
														
 
															+            temp_set = set(['~~'])
														
 
															             _index = 0
														
 
															             for item in inner_table[index]:
														
 
															                 if first is None:
														
@@ -272,7 +272,7 @@ def tableToText(soup):
 
															                             dye_set.add((inner_table[index][h][0],dye_type))
														
 
															                             key_set.add(inner_table[index][h][0])
														
 
															                     begin = end
														
 
															-                #print("E",inner_table[index])
														
 
															+                # print("E",inner_table[index])
														
 
															         def otherrepair(inner_table,index,dye_set,key_set):
														
 
															             list_provide_repair = []
														
@@ -327,25 +327,254 @@ def tableToText(soup):
 
															             return
														
 
															         repairTable(inner_table, dye_set, key_set)
														
 
															-    def repair_table2(inner_table):
														
 
															+    def repair_table2(inner_table, show=0, row_no=0):
														
 
															         """
														
 
															         @summary: 修复表头识别，将明显错误的进行修正
														
 
															         """
														
 
															-        # 修复第一第二第三中标候选人作为列表头
														
 
															-        if len(inner_table) >= 2 and len(inner_table[0]) >= 3:
														
 
															-            for i in range(len(inner_table[:3])):
														
 
															-                for j in range(len(inner_table[i])-2):
														
 
															-                    if inner_table[i][j][0] == '第一中标候选人' \
														
 
															-                            and inner_table[i][j+1][0] == '第二中标候选人' \
														
 
															-                            and inner_table[i][j+2][0] == '第三中标候选人' \
														
 
															-                            and i+1 < len(inner_table) \
														
 
															-                            and inner_table[i+1][j][1] == 0 \
														
 
															-                            and inner_table[i+1][j+1][1] == 0 \
														
 
															-                            and inner_table[i+1][j+2][1] == 0:
														
 
															+
														
 
															+        # 循环处理单元格，一次获取需要的
														
 
															+        one_head_index_list = []
														
 
															+        zero_head_index_list = []
														
 
															+        all_head_index_list = []
														
 
															+        for i in range(len(inner_table)):
														
 
															+            head_cnt = 0
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                # 删除前后逗号
														
 
															+                inner_table[i][j][0] = re.sub('^[，,]+', '', inner_table[i][j][0])
														
 
															+                inner_table[i][j][0] = re.sub('[，,]+$', '', inner_table[i][j][0])
														
 
															+
														
 
															+                # 统计表头数
														
 
															+                if inner_table[i][j][1] == 1:
														
 
															+                    head_cnt += 1
														
 
															+
														
 
															+            # 表头数list
														
 
															+            if head_cnt == 0:
														
 
															+                zero_head_index_list.append(i)
														
 
															+            elif head_cnt == 1:
														
 
															+                one_head_index_list.append(i)
														
 
															+            elif head_cnt == len(inner_table[i]):
														
 
															+                all_head_index_list.append(i)
														
 
															+
														
 
															+        # 修复冒号在文本中间的，不能作为表头；(冒号后面需多个字)
														
 
															+        # 冒号在括号中的除外
														
 
															+        # 冒号在最后的，判断后一个格子是否有重复的文字
														
 
															+        for i in range(len(inner_table)):
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                _text = inner_table[i][j][0]
														
 
															+                if len(_text) >= 3 and inner_table[i][j][1] == 1:
														
 
															+                    match = re.search('[:：]', _text)
														
 
															+                    if match:
														
 
															+                        start_index, end_index = match.span()
														
 
															+                        if start_index == 0:
														
 
															+                            continue
														
 
															+                        if end_index == len(_text):
														
 
															+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
														
 
															+                                inner_table[i][j][1] = 0
														
 
															+                                inner_table[i][j+1][1] = 0
														
 
															+                            else:
														
 
															+                                continue
														
 
															+                        if re.search('[(（]', _text[:start_index]) and re.search('[)）]', _text[end_index:]):
														
 
															+                            continue
														
 
															+
														
 
															+                        m1 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[:start_index])
														
 
															+                        m2 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[end_index:])
														
 
															+                        if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
														
 
															+                            inner_table[i][j][1] = 0
														
 
															+
														
 
															+        if show:
														
 
															+            print('inner_table[i]1', inner_table[row_no])
														
 
															+
														
 
															+        # 修复实际只有几列，但有一列由于重复占了太多行表头识别错误
														
 
															+        # for i in range(len(inner_table)):
														
 
															+        #     head_flag_dict = {}
														
 
															+        #     for j in range(len(inner_table[i])):
														
 
															+        #         if inner_table[i][j][0] in head_flag_dict.keys():
														
 
															+        #             head_flag_dict[inner_table[i][j][0]] += [inner_table[i][j][1]]
														
 
															+        #         else:
														
 
															+        #             head_flag_dict[inner_table[i][j][0]] = [inner_table[i][j][1]]
														
 
															+        #
														
 
															+        #     if len(head_flag_dict.keys()) == 2:
														
 
															+        #         col_flag = None
														
 
															+        #         col_value = None
														
 
															+        #         for key in head_flag_dict.keys():
														
 
															+        #             flag_list = head_flag_dict[key]
														
 
															+        #             if len(flag_list) >= 4 and len(set(flag_list)) == 2 and len(set(flag_list[1:])) == 1:
														
 
															+        #                 col_flag = flag_list[0]
														
 
															+        #                 col_value = key
														
 
															+        #                 break
														
 
															+        #
														
 
															+        #         if col_flag is not None:
														
 
															+        #             for j in range(len(inner_table[i])):
														
 
															+        #                 if inner_table[i][j][0] == col_value:
														
 
															+        #                     inner_table[i][j][1] = col_flag
														
 
															+
														
 
															+        # 多个重复列的预测值不同，以第一个为准
														
 
															+        for i in range(len(inner_table)):
														
 
															+            col = inner_table[i][0]
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                if inner_table[i][j][0] == col[0]:
														
 
															+                    if inner_table[i][j][1] != col[1]:
														
 
															+                        inner_table[i][j][1] = col[1]
														
 
															+                else:
														
 
															+                    col = inner_table[i][j]
														
 
															+
														
 
															+        if show:
														
 
															+            print('inner_table[i]2', inner_table[row_no])
														
 
															+
														
 
															+        # 修复多个重复的单元格表头不一致
														
 
															+        # for i in range(len(inner_table)):
														
 
															+        #     for j in range(len(inner_table[i])-1):
														
 
															+        #         only_chinese1 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j][0]))
														
 
															+        #         only_chinese2 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j+1][0]))
														
 
															+        #         if only_chinese1 == only_chinese2 and inner_table[i][j][1] != inner_table[i][j+1][1]:
														
 
															+        #             inner_table[i][j][1] = 1
														
 
															+        #             inner_table[i][j+1][1] = 1
														
 
															+
														
 
															+        # if show:
														
 
															+        #     print('inner_table[i]3', inner_table[row_no])
														
 
															+
														
 
															+        # # 修复一行几乎都是表头，个别不是；或者一行几乎都是非表头，个别是
														
 
															+        # for i in range(len(inner_table)):
														
 
															+        #     head_dict = {}
														
 
															+        #     not_head_dict = {}
														
 
															+        #     for j in range(len(inner_table[i])):
														
 
															+        #         if inner_table[i][j][1] == 1:
														
 
															+        #             if inner_table[i][j][0] not in head_dict:
														
 
															+        #                 head_dict[inner_table[i][j][0]] = 1
														
 
															+        #         else:
														
 
															+        #             if inner_table[i][j][0] not in not_head_dict:
														
 
															+        #                 not_head_dict[inner_table[i][j][0]] = 1
														
 
															+        #
														
 
															+        #     # 非表头:表头 <= 1:3
														
 
															+        #     # if len(head_dict.keys()) > 0 and len(not_head_dict.keys()) / len(head_dict.keys()) <= 1/3 and len(head_dict.keys()) >= 3:
														
 
															+        #     #     for j in range(len(inner_table[i])):
														
 
															+        #     #         if len(re.sub(' ', '', inner_table[i][j][0])) > 0:
														
 
															+        #     #             inner_table[i][j][1] = 1
														
 
															+        #
														
 
															+        #     # 表头数一个且非表头数大于2且上一行都是表头
														
 
															+        #     if i > 0 and len(head_dict.keys()) == 1 and len(not_head_dict.keys()) >= 2 and inner_table[i][0][1] == 0:
														
 
															+        #         last_row = inner_table[i-1]
														
 
															+        #         col_list = []
														
 
															+        #         for j in range(len(last_row)):
														
 
															+        #             if len(re.sub(' ', '', last_row[j][0])) > 0:
														
 
															+        #                 if last_row[j][1] == 0:
														
 
															+        #                     col_list = []
														
 
															+        #                     break
														
 
															+        #                 col_list.append(last_row[j][0])
														
 
															+        #         if col_list:
														
 
															+        #             col_list = list(set(col_list))
														
 
															+        #             if len(col_list) > 2:
														
 
															+        #                 for j in range(len(inner_table[i])):
														
 
															+        #                     if inner_table[i][j][1] == 1:
														
 
															+        #                         inner_table[i][j][1] = 0
														
 
															+
														
 
															+        # 一整个大表格，第一行为表头，下面行中有个别格子被识别为表头
														
 
															+        # 候选人后面修复
														
 
															+        for index in one_head_index_list:
														
 
															+            if (index - 1 in zero_head_index_list and index - 2 in zero_head_index_list) \
														
 
															+                    or (index - 1 in zero_head_index_list and index - 2 in all_head_index_list) \
														
 
															+                    or (index - 1 in all_head_index_list):
														
 
															+                for j in range(len(inner_table[index])):
														
 
															+                    inner_table[index][j][1] = 0
														
 
															+                zero_head_index_list.append(index)
														
 
															+
														
 
															+
														
 
															+
														
 
															+        if show:
														
 
															+            print('inner_table[i]4', inner_table[row_no])
														
 
															+
														
 
															+        # 修复第一第二第三中标候选人作为表头
														
 
															+        first_tenderer = ['第一中标候选人', '第一中标人', '第一中标（成交）人', '第一候选人']
														
 
															+        second_tenderer = ['第二中标候选人', '第二中标（成交）候选人', '第二候选人']
														
 
															+        third_tenderer = ['第三中标候选人', '第三中标（成交）候选人', '第三候选人']
														
 
															+        # n1 next one, n2 next two, l1 last one, l2 last two
														
 
															+        for i in range(len(inner_table)):
														
 
															+            row = inner_table[i]
														
 
															+            n1_row, n2_row = None, None
														
 
															+            if i+1 < len(inner_table):
														
 
															+                n1_row = inner_table[i+1]
														
 
															+            if i+2 < len(inner_table):
														
 
															+                n2_row = inner_table[i+2]
														
 
															+            for j in range(len(row)):
														
 
															+                row_col = row[j]
														
 
															+                n1_row_col, n2_row_col = None, None
														
 
															+                row_n1_col, row_n2_col = None, None
														
 
															+                n1_row_n1_col, n2_row_n1_col, n1_row_n2_col = None, None, None
														
 
															+                if n1_row:
														
 
															+                    n1_row_col = n1_row[j]
														
 
															+                if n2_row:
														
 
															+                    n2_row_col = n2_row[j]
														
 
															+                if j+1 < len(row):
														
 
															+                    row_n1_col = row[j+1]
														
 
															+                if j+2 < len(row):
														
 
															+                    row_n2_col = row[j+2]
														
 
															+                if n1_row and j+1 < len(n1_row):
														
 
															+                    n1_row_n1_col = n1_row[j+1]
														
 
															+                if n2_row and j+1 < len(n2_row):
														
 
															+                    n2_row_n1_col = n2_row[j+1]
														
 
															+                if n1_row and j+2 < len(n1_row):
														
 
															+                    n1_row_n2_col = n1_row[j+2]
														
 
															+
														
 
															+                # 连续作为行表头
														
 
															+                if row_col[0] in first_tenderer and row_n1_col and row_n1_col[1] == 0:
														
 
															+                    if n1_row_col and n1_row_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
														
 
															+                        inner_table[i][j][1] = 1
														
 
															+                        inner_table[i+1][j][1] = 1
														
 
															+                    if n2_row_col and n2_row_col[0] in third_tenderer and n2_row_n1_col and n2_row_n1_col[1] == 0:
														
 
															+                        inner_table[i+2][j][1] = 1
														
 
															+
														
 
															+                # 连续作为列表头
														
 
															+                if row_col[0] in first_tenderer and n1_row_col and n1_row_col[1] == 0:
														
 
															+                    if row_n1_col and row_n1_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
														
 
															                         inner_table[i][j][1] = 1
														
 
															                         inner_table[i][j+1][1] = 1
														
 
															+                    if row_n2_col and row_n2_col[0] in third_tenderer and n1_row_n2_col and n1_row_n2_col[1] == 0:
														
 
															                         inner_table[i][j+2][1] = 1
														
 
															-                        break
														
 
															+
														
 
															+        if show:
														
 
															+            print('inner_table[i]5', inner_table[row_no])
														
 
															+
														
 
															+        # 修复表头关键词未作为表头
														
 
															+        # 文本匹配关键词，直接作为表头
														
 
															+        head_keyword = ['供应商', '总价']
														
 
															+        # 末尾匹配关键词且前一列为表头且与前一列文本不同，直接不做表头
														
 
															+        head_keyword2 = ['管理中心', '有限公司', '项目采购', ]
														
 
															+        # 开头匹配关键词，直接不做表头
														
 
															+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
														
 
															+        # 文本匹配关键词且前一列为表头，直接作为表头
														
 
															+        head_keyword4 = ['综合排名']
														
 
															+        # 文本在关键词中，直接不做表头
														
 
															+        head_keyword5 = ['殡葬用地']
														
 
															+
														
 
															+        # n1 next one, n2 next two, l1 last one, l2 last two
														
 
															+        for i in range(len(inner_table)):
														
 
															+            row = inner_table[i]
														
 
															+            for j in range(len(row)):
														
 
															+                row_col = row[j]
														
 
															+                row_l1_col = None
														
 
															+                if j-1 > 0:
														
 
															+                    row_l1_col = row[j-1]
														
 
															+
														
 
															+                match = re.search('[\u4e00-\u9fa50-9a-zA-Z:：]+', row_col[0])
														
 
															+                if inner_table[i][j][1] == 0 and match and match.group() in head_keyword:
														
 
															+                    inner_table[i][j][1] = 1
														
 
															+                for key in head_keyword2:
														
 
															+                    match = re.search(key+'$', row_col[0])
														
 
															+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and row_l1_col[0] != row_col[0] and match and row_col[1] == 1:
														
 
															+                        inner_table[i][j][1] = 0
														
 
															+                for key in head_keyword3:
														
 
															+                    match = re.search('^'+key, row_col[0])
														
 
															+                    if match and row_col[1] == 1:
														
 
															+                        inner_table[i][j][1] = 0
														
 
															+                for key in head_keyword4:
														
 
															+                    match = re.search(key, row_col[0])
														
 
															+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and match and row_col[1] == 0:
														
 
															+                        inner_table[i][j][1] = 1
														
 
															+                if row_col[0] in head_keyword5:
														
 
															+                    inner_table[i][j][1] = 0
														
 
															+
														
 
															+        if show:
														
 
															+            print('inner_table[i]6', inner_table[row_no])
														
 
															         # 修复姓名被作为表头 # 2023-02-10 取消修复，避免项目名称、编号，单位、单价等作为了非表头
														
 
															         # surname = [
														
@@ -358,6 +587,7 @@ def tableToText(soup):
 
															         #                 and (inner_table[i][j][0][0] in surname or inner_table[i][j][0][:2] in surname) \
														
 
															         #                 and re.search("[^\u4e00-\u9fa5]", inner_table[i][j][0]) is None:
														
 
															         #             inner_table[i][j][1] = 0
														
 
															+
														
 
															         return inner_table
														
 
															     def sliceTable(inner_table,fix_value="~~"):
														
@@ -456,7 +686,7 @@ def tableToText(soup):
 
															         return inner_table,head_list
														
 
															-    def set_head_model(inner_table):
														
 
															+    def set_head_model(inner_table, show=0):
														
 
															         origin_inner_table = copy.deepcopy(inner_table)
														
 
															         for i in range(len(inner_table)):
														
 
															             for j in range(len(inner_table[i])):
														
@@ -469,16 +699,30 @@ def tableToText(soup):
 
															         # 模型预测表头
														
 
															         predict_list = predict(inner_table)
														
 
															+        start_time = time.time()
														
 
															+        predict_list = predict(inner_table)
														
 
															+        print('table head predict cost: ', time.time()-start_time)
														
 
															+
														
 
															         # 组合结果
														
 
															         for i in range(len(inner_table)):
														
 
															             for j in range(len(inner_table[i])):
														
 
															                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
														
 
															-        # print("table_head before repair", inner_table)
														
 
															+        if show:
														
 
															+            print("table_head before repair")
														
 
															+            for r in inner_table:
														
 
															+                print('row', r)
														
 
															+            print("="*80)
														
 
															         # 表头修正
														
 
															-        repairTable(inner_table)
														
 
															-        inner_table = repair_table2(inner_table)
														
 
															+        # repairTable(inner_table)
														
 
															+        inner_table = table_head_repair_process(inner_table, docid)
														
 
															+
														
 
															+        if show:
														
 
															+            print("="*80)
														
 
															+            print("table_head after repair")
														
 
															+            for r in inner_table:
														
 
															+                print('row', r)
														
 
															         # 按表头分割表格
														
 
															         head_list = sliceTable(inner_table)
														
@@ -1139,6 +1383,7 @@ def tableToText(soup):
 
															             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
														
 
															                 in_attachment = True
														
 
															     #逆序处理嵌套表格
														
 
															+    # print('len(tbodies)1', len(tbodies))
														
 
															     for tbody_index in range(1,len(tbodies)+1):
														
 
															         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
														
 
															         inner_table = trunTable(tbody,_in_attachment)
														
@@ -1155,6 +1400,7 @@ def tableToText(soup):
 
															             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
														
 
															                 in_attachment = True
														
 
															     #逆序处理嵌套表格
														
 
															+    # print('len(tbodies)2', len(tbodies))
														
 
															     for tbody_index in range(1,len(tbodies)+1):
														
 
															         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
														
 
															         inner_table = trunTable(tbody,_in_attachment)
														
@@ -1163,6 +1409,284 @@ def tableToText(soup):
 
															     return soup
														
 
															     # return list_innerTable
														
 
															+
														
 
															+def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0):
														
 
															+    def pre_process(inner_table):
														
 
															+        """
														
 
															+        修复前的预处理
														
 
															+        """
														
 
															+        # 循环处理单元格，一次获取需要的
														
 
															+        for i in range(len(inner_table)):
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                # 删除前后逗号
														
 
															+                inner_table[i][j][0] = re.sub('^[，,]+', '', inner_table[i][j][0])
														
 
															+                inner_table[i][j][0] = re.sub('[，,]+$', '', inner_table[i][j][0])
														
 
															+        return inner_table
														
 
															+
														
 
															+    def repair_by_colon(inner_table):
														
 
															+        """
														
 
															+        根据冒号修复当前格子的表头值
														
 
															+        """
														
 
															+
														
 
															+        # 修复冒号在文本中间的，不能作为表头；(冒号后面需多个字)
														
 
															+        # 冒号在括号中的除外
														
 
															+        # 冒号在最后的，判断后一个格子是否有重复的文字
														
 
															+        for i in range(len(inner_table)):
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                _text = inner_table[i][j][0]
														
 
															+                if len(_text) >= 3 and inner_table[i][j][1] == 1:
														
 
															+                    match = re.search('[:：]', _text)
														
 
															+                    if match:
														
 
															+                        start_index, end_index = match.span()
														
 
															+                        if start_index == 0:
														
 
															+                            continue
														
 
															+                        if end_index == len(_text):
														
 
															+                            if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
														
 
															+                                inner_table[i][j][1] = 0
														
 
															+                                inner_table[i][j+1][1] = 0
														
 
															+                            else:
														
 
															+                                continue
														
 
															+                        if re.search('[(（]', _text[:start_index]) and re.search('[)）]', _text[end_index:]):
														
 
															+                            continue
														
 
															+
														
 
															+                        m1 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[:start_index])
														
 
															+                        m2 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[end_index:])
														
 
															+                        if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
														
 
															+                            inner_table[i][j][1] = 0
														
 
															+
														
 
															+        return inner_table
														
 
															+
														
 
															+    def repair_by_duplicate(inner_table):
														
 
															+        """
														
 
															+        根据列重复修复当前格子的表头值
														
 
															+        """
														
 
															+        # 多个重复列的预测值不同，以第一个为准
														
 
															+        for i in range(len(inner_table)):
														
 
															+            col = inner_table[i][0]
														
 
															+            for j in range(1, len(inner_table[i])):
														
 
															+                if inner_table[i][j][0] == col[0]:
														
 
															+                    if inner_table[i][j][1] != col[1]:
														
 
															+                        if col != inner_table[i][0]:
														
 
															+                            inner_table[i][j][1] = col[1]
														
 
															+                        else:
														
 
															+                            inner_table[i][0][1] = inner_table[i][j][1]
														
 
															+                            col = inner_table[i][0]
														
 
															+                else:
														
 
															+                    col = inner_table[i][j]
														
 
															+        return inner_table
														
 
															+
														
 
															+    def repair_by_around(inner_table):
														
 
															+        """
														
 
															+        根据周围的表头值修复当前格子的表头值
														
 
															+        """
														
 
															+        one_head_index_list = []
														
 
															+        zero_head_index_list = []
														
 
															+        all_head_index_list = []
														
 
															+        for i in range(len(inner_table)):
														
 
															+            head_cnt = 0
														
 
															+            head_index = None
														
 
															+            head_dict = {}
														
 
															+            for j in range(len(inner_table[i])):
														
 
															+                # 统计表头数
														
 
															+                if inner_table[i][j][1] == 1:
														
 
															+                    head_cnt += 1
														
 
															+                    head_index = j
														
 
															+                if inner_table[i][j][0] not in ['~~', '', ' ']:
														
 
															+                    if inner_table[i][j][0] in head_dict.keys():
														
 
															+                        head_dict[inner_table[i][j][0]] += 1
														
 
															+                    else:
														
 
															+                        head_dict[inner_table[i][j][0]] = 1
														
 
															+            # 表头数list
														
 
															+            if head_cnt == 0:
														
 
															+                zero_head_index_list.append(i)
														
 
															+            elif head_cnt == 1:
														
 
															+                # 这个单个表头需满足前面有非表头
														
 
															+                find_flag = 0
														
 
															+                for k in range(head_index):
														
 
															+                    if inner_table[i][k][1] == 0:
														
 
															+                        find_flag = 1
														
 
															+                if find_flag and len(head_dict.keys()) > 2:
														
 
															+                    one_head_index_list.append(i)
														
 
															+            elif head_cnt == len(inner_table[i]):
														
 
															+                all_head_index_list.append(i)
														
 
															+
														
 
															+        # 一整个大表格，第一行为表头，下面行中有个别格子被识别为表头
														
 
															+        # 候选人后面修复
														
 
															+        for index in one_head_index_list:
														
 
															+            if (index - 1 in zero_head_index_list and index - 2 in zero_head_index_list) \
														
 
															+                    or (index - 1 in zero_head_index_list and index - 2 in all_head_index_list) \
														
 
															+                    or (index - 1 in all_head_index_list):
														
 
															+                for j in range(len(inner_table[index])):
														
 
															+                    inner_table[index][j][1] = 0
														
 
															+                zero_head_index_list.append(index)
														
 
															+        return inner_table
														
 
															+
														
 
															+    def repair_by_tenderer(inner_table):
														
 
															+        """
														
 
															+        根据第一第二第三候选人修复当前格子的表头值
														
 
															+        """
														
 
															+        # 修复第一第二第三中标候选人作为表头
														
 
															+        first_tenderer = ['第一中标候选人', '第一中标人', '第一中标（成交）人', '第一候选人']
														
 
															+        second_tenderer = ['第二中标候选人', '第二中标（成交）候选人', '第二候选人']
														
 
															+        third_tenderer = ['第三中标候选人', '第三中标（成交）候选人', '第三候选人']
														
 
															+        # n1 next one, n2 next two, l1 last one, l2 last two
														
 
															+        for i in range(len(inner_table)):
														
 
															+            row = inner_table[i]
														
 
															+            n1_row, n2_row = None, None
														
 
															+            if i+1 < len(inner_table):
														
 
															+                n1_row = inner_table[i+1]
														
 
															+            if i+2 < len(inner_table):
														
 
															+                n2_row = inner_table[i+2]
														
 
															+            for j in range(len(row)):
														
 
															+                row_col = row[j]
														
 
															+                n1_row_col, n2_row_col = None, None
														
 
															+                row_n1_col, row_n2_col = None, None
														
 
															+                n1_row_n1_col, n2_row_n1_col, n1_row_n2_col = None, None, None
														
 
															+                if n1_row:
														
 
															+                    n1_row_col = n1_row[j]
														
 
															+                if n2_row:
														
 
															+                    n2_row_col = n2_row[j]
														
 
															+                if j+1 < len(row):
														
 
															+                    row_n1_col = row[j+1]
														
 
															+                if j+2 < len(row):
														
 
															+                    row_n2_col = row[j+2]
														
 
															+                if n1_row and j+1 < len(n1_row):
														
 
															+                    n1_row_n1_col = n1_row[j+1]
														
 
															+                if n2_row and j+1 < len(n2_row):
														
 
															+                    n2_row_n1_col = n2_row[j+1]
														
 
															+                if n1_row and j+2 < len(n1_row):
														
 
															+                    n1_row_n2_col = n1_row[j+2]
														
 
															+
														
 
															+                # 连续作为行表头
														
 
															+                if row_col[0] in first_tenderer and row_n1_col and row_n1_col[1] == 0:
														
 
															+                    if n1_row_col and n1_row_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
														
 
															+                        inner_table[i][j][1] = 1
														
 
															+                        inner_table[i+1][j][1] = 1
														
 
															+                    if n2_row_col and n2_row_col[0] in third_tenderer and n2_row_n1_col and n2_row_n1_col[1] == 0:
														
 
															+                        inner_table[i+2][j][1] = 1
														
 
															+
														
 
															+                # 连续作为列表头
														
 
															+                if row_col[0] in first_tenderer and n1_row_col and n1_row_col[1] == 0:
														
 
															+                    if row_n1_col and row_n1_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
														
 
															+                        inner_table[i][j][1] = 1
														
 
															+                        inner_table[i][j+1][1] = 1
														
 
															+                    if row_n2_col and row_n2_col[0] in third_tenderer and n1_row_n2_col and n1_row_n2_col[1] == 0:
														
 
															+                        inner_table[i][j+2][1] = 1
														
 
															+
														
 
															+        return inner_table
														
 
															+
														
 
															+    def repair_by_keywords(inner_table):
														
 
															+        """
														
 
															+        根据关键词修复当前格子的表头值
														
 
															+        """
														
 
															+        # 修复表头关键词未作为表头
														
 
															+        # 末尾匹配匹配关键词，直接作为表头
														
 
															+        head_keyword = ['供应商', '总价', '总价（元）', '总价\(元\)', '品目一', '品目二', '品目三']
														
 
															+        # 末尾匹配关键词且前一列为表头且与前一列文本不同，直接不做表头
														
 
															+        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
														
 
															+        # 开头匹配关键词，直接不做表头
														
 
															+        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
														
 
															+        # 文本匹配关键词且前一列为表头，直接作为表头
														
 
															+        head_keyword4 = ['综合排名']
														
 
															+        # 文本在关键词中，直接不做表头
														
 
															+        head_keyword5 = ['殡葬用地']
														
 
															+        # 文本匹配关键词，直接不作表头
														
 
															+        head_keyword6 = ['市场行情', '有限公司']
														
 
															+
														
 
															+        # n1 next one, n2 next two, l1 last one, l2 last two
														
 
															+        for i in range(len(inner_table)):
														
 
															+            row = inner_table[i]
														
 
															+            for j in range(len(row)):
														
 
															+                row_col = row[j]
														
 
															+                row_l1_col = None
														
 
															+                if j-1 >= 0:
														
 
															+                    row_l1_col = row[j-1]
														
 
															+
														
 
															+                # match = re.search('[\u4e00-\u9fa50-9a-zA-Z:：]+', row_col[0])
														
 
															+                # if inner_table[i][j][1] == 0 and match and match.group() in head_keyword:
														
 
															+                #     inner_table[i][j][1] = 1
														
 
															+                for key in head_keyword:
														
 
															+                    match = re.search(key+'$', row_col[0])
														
 
															+                    if match:
														
 
															+                        inner_table[i][j][1] = 1
														
 
															+                for key in head_keyword2:
														
 
															+                    match = re.search(key+'$', row_col[0])
														
 
															+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and row_l1_col[0] != row_col[0] and match and row_col[1] == 1:
														
 
															+                        inner_table[i][j][1] = 0
														
 
															+                for key in head_keyword3:
														
 
															+                    match = re.search('^'+key, row_col[0])
														
 
															+                    if match and row_col[1] == 1:
														
 
															+                        inner_table[i][j][1] = 0
														
 
															+                for key in head_keyword4:
														
 
															+                    match = re.search(key, row_col[0])
														
 
															+                    if j > 0 and row_l1_col and row_l1_col[1] == 1 and match and row_col[1] == 0:
														
 
															+                        inner_table[i][j][1] = 1
														
 
															+                if row_col[0] in head_keyword5:
														
 
															+                    inner_table[i][j][1] = 0
														
 
															+                for key in head_keyword6:
														
 
															+                    match = re.search(key, row_col[0])
														
 
															+                    if match:
														
 
															+                        inner_table[i][j][1] = 0
														
 
															+
														
 
															+        return inner_table
														
 
															+
														
 
															+    _inner_table = pre_process(_inner_table)
														
 
															+    compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+    if show:
														
 
															+        print('table_head_repair_process1', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_colon(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair2 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process2', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_keywords(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair3 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process3', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_tenderer(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair4 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process4', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_duplicate(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair5 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process5', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_around(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair6 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process6', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_tenderer(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair7 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process7', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    _inner_table = repair_by_keywords(_inner_table)
														
 
															+    if _inner_table != compare_inner_table:
														
 
															+        compare_inner_table = copy.deepcopy(_inner_table)
														
 
															+        log('table_head repair8 ' + str(docid))
														
 
															+    if show:
														
 
															+        print('table_head_repair_process8', show_row_index, _inner_table[show_row_index])
														
 
															+
														
 
															+    return _inner_table
														
 
															+
														
 
															+
														
 
															 re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
														
 
															 num_dict = {
														
 
															     "一": 1, "二": 2,
														
@@ -2191,7 +2715,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
															         article_processed = get_preprocessed_outline(article_processed)
														
 
															         # print('article_processed')
														
 
															-        article_processed = tableToText(article_processed)
														
 
															+        article_processed = tableToText(article_processed, doc_id)
														
 
															         article_processed = segment(article_processed)
														
 
															         article_processed = article_processed.replace('(', '（').replace(')', '）')  #2022/8/10 统一为中文括号