пре 7 месеци · 4de3db25cb
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
 
				 
			
 
				 
			
 
				 #
			
 
				-def tableToText(soup, docid=None):
			
 
				+def tableToText(soup, docid=None, return_kv=False):
			
 
				     '''
			
 
				     @param:
			
 
				         soup:网页html的soup
			
@@ -1289,7 +1289,359 @@ def tableToText(soup, docid=None):
 
				                 #         text += rank_text+entity_text+text_line
			
 
				                 #         text = text[:-1]+"。" if len(text)>0 else text
			
 
				         return text
			
 
				-    
			
 
				+
			
 
    def get_table_text_kv(inner_table, head_list, key_direct=False):
        '''
        Flatten a head-annotated table into sentence text while recording, for
        every cell, which header cells describe it and where the pieces landed
        inside the generated text.

        @param:
            inner_table: list of rows; each cell is read as cell[0] (text) and
                cell[1] (type flag: 0 is handled as a value, 1 and 2 as headers)
            head_list: row indexes delimiting segments; segment k covers rows
                head_list[k] .. head_list[k + 1] - 1
            key_direct: when False, header types 1 and 2 are accepted in both
                directions; when True only type 1 is accepted above a value and
                only type 2 to its left
        @return:
            (text, all_table_occurence) where text is the generated sentence
            string and all_table_occurence is the list of per-row lists of cell
            dicts of every segment, in order
        '''
        packPattern = "(标包|标的|标项|品目|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 large-site rule: extra procurement package names
        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"  # 2020/11/23 large-site rule: serial number counts as ranking
        entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
        moneyPattern = "([中投]标|报价)(金额|价)"
        # When the primary header matches this pattern, the header taken from
        # the other axis is written in front of it.
        priorityPattern = "[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))"
        width = len(inner_table[0])
        text = ""

        top_types = [1] if key_direct else [1, 2]
        left_types = [2] if key_direct else [1, 2]

        def collect_heads(grid, cell, positions, head_types, side):
            # Walk away from the value cell along `positions`, gathering the
            # contiguous run of header cells. Consecutive headers with the same
            # text (merged cells) are recorded once. The walk stops at the first
            # non-header met after a header has been found.
            found = False
            previous_text = ""
            joined = ""
            coords = []
            for r, c in positions:
                candidate = grid[r][c]
                if candidate["type"] in head_types:
                    if not found or candidate["text"] != previous_text:
                        cell.setdefault(side + "_head_list", []).append(candidate["text"] + "：")
                        joined = candidate["text"] + "：" + joined
                        coords.append((r, c))
                    found = True
                    previous_text = candidate["text"]
                    candidate["occu_count"] += 1
                elif found:
                    break
            cell[side + "_head"] += joined
            # NOTE(review): these indexes are relative to the current segment,
            # whereas "text_row_index" is an index into inner_table - confirm
            # that callers expect this when head_list[0] != 0.
            cell.setdefault(side + "_head_row_index", []).extend(r for r, _ in coords)
            cell.setdefault(side + "_head_col_index", []).extend(c for _, c in coords)

        def stamp_offsets(base, placements):
            # `placements` must be in the exact order the fragments are appended
            # to the sentence; `base` is the sentence length before appending.
            offset = base
            for cell, fragment, left_part, top_part, left_first in placements:
                if left_first:
                    cell['left_head_sen_index'] = offset
                    cell['top_head_sen_index'] = offset + len(left_part)
                else:
                    cell['left_head_sen_index'] = offset + len(top_part)
                    cell['top_head_sen_index'] = offset
                cell['text_sen_index'] = offset + len(left_part) + len(top_part)
                offset += len(fragment)

        all_table_occurence = []
        for head_i in range(len(head_list) - 1):
            head_begin = head_list[head_i]
            head_end = head_list[head_i + 1]
            direct = getDirect(inner_table, head_begin, head_end)

            # Build the co-occurrence matrix for this segment.
            table_occurence = []
            for i in range(head_begin, head_end):
                line_oc = []
                for j in range(width):
                    cell = inner_table[i][j]
                    line_oc.append(
                        {"text": cell[0], "type": cell[1], "occu_count": 0, "left_head": "", "top_head": "",
                         "left_dis": 0, "top_dis": 0,
                         "text_row_index": i, "text_col_index": j
                         })
                table_occurence.append(line_oc)
            occu_height = len(table_occurence)
            occu_width = len(table_occurence[0]) if occu_height > 0 else 0

            # Find the headers of every non-empty value cell.
            for i in range(occu_height):
                for j in range(occu_width):
                    cell = table_occurence[i][j]
                    if cell["type"] == 0 and cell["text"] != "":
                        collect_heads(table_occurence, cell,
                                      [(i - step, j) for step in range(1, i + 1)], top_types, "top")
                        collect_heads(table_occurence, cell,
                                      [(i, j - step) for step in range(1, j + 1)], left_types, "left")

            # Join headers and values into sentences.
            if direct == "row":
                for i in range(occu_height):
                    pack_text = ""
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    money_text = ""
                    # Identical header+value fragments inside one sentence are emitted once.
                    text_set = set()
                    head = ""
                    last_text = ""
                    pack_text_location = []
                    rank_text_location = []
                    entity_text_location = []
                    text_line_location = []
                    money_text_location = []
                    for j in range(width):
                        cell = table_occurence[i][j]
                        if not (cell["type"] == 0 or (cell["type"] == 1 and cell["occu_count"] == 0)):
                            continue
                        head = (cell["top_head"] + "：") if len(cell["top_head"]) > 0 else ""
                        now_top_head = head
                        now_left_head = cell["left_head"]
                        if re.search(priorityPattern, head):
                            head = cell["left_head"] + head
                            left_first = 1
                        else:
                            head += cell["left_head"]
                            left_first = 0

                        sentence_key = str(head + cell["text"])
                        if sentence_key in text_set:
                            cell['drop'] = 1
                            continue
                        fragment = head + cell["text"] + "，"
                        placement = [cell, fragment, now_left_head, now_top_head, left_first]
                        if re.search(packPattern, head) is not None:
                            pack_text += fragment
                            pack_text_location.append(placement)
                        # 2020/11/23 large-site rule: `if` became `elif`; 20240620: do not add a
                        # second rank fragment once a ranking has already been written.
                        elif re.search(rankPattern, head) is not None and re.search(r'(排名|排序|名次|顺序)：?第?[\d一二三]', rank_text) is None:
                            rank_text += fragment
                            rank_text_location.append(placement)
                        elif re.search(entityPattern, head) is not None:
                            entity_text += fragment
                            entity_text_location.append(placement)
                        elif re.search(moneyPattern, head) is not None and entity_text != "":
                            # Money fragments end with an ASCII comma, unlike the others.
                            fragment = head + cell["text"] + ","
                            placement[1] = fragment
                            money_text += fragment
                            money_text_location.append(placement)
                        else:
                            text_line += fragment
                            text_line_location.append(placement)
                        text_set.add(sentence_key)
                        last_text = cell['text']

                    # Offsets of key/value inside the sentence. The order here must mirror the
                    # concatenation order of tr_text below (money before the generic fragments).
                    stamp_offsets(len(text),
                                  pack_text_location + rank_text_location + entity_text_location
                                  + money_text_location + text_line_location)

                    tr_text = pack_text + rank_text + entity_text + money_text + text_line
                    text += tr_text

                    # Fix for doc 367694716: content expressed across two rows.
                    if len(text_set - {' '}) == 1 and head == '' and len(last_text) < 25:
                        text = text if re.search(r'\w$', text[:-1]) else text[:-1]
                    # Fix for doc 494731937: two-column tables were split into sentences badly.
                    elif (width == 2 or len(text_set) == 1) and head != '' and len(tr_text) < 50:
                        text = text if re.search(r'\w$', text[:-1]) else text[:-1]
                    else:
                        text = text[:-1] + "。"
            else:
                for j in range(occu_width):
                    pack_text = ""
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    text_set = set()
                    pack_text_location = []
                    rank_text_location = []
                    entity_text_location = []
                    text_line_location = []
                    for i in range(occu_height):
                        cell = table_occurence[i][j]
                        if not (cell["type"] == 0 or (cell["type"] == 1 and cell["occu_count"] == 0)):
                            continue
                        head = cell["left_head"]
                        now_top_head = cell["top_head"]
                        now_left_head = head
                        if re.search(priorityPattern, head):
                            head = cell["top_head"] + head
                            left_first = 0
                        else:
                            head += cell["top_head"]
                            left_first = 1

                        sentence_key = str(head + cell["text"])
                        if sentence_key in text_set:
                            cell['drop'] = 1
                            continue
                        fragment = head + cell["text"] + "，"
                        placement = [cell, fragment, now_left_head, now_top_head, left_first]
                        if re.search(packPattern, head) is not None:
                            pack_text += fragment
                            pack_text_location.append(placement)
                        # 2020/11/23 large-site rule: `if` became `elif`.
                        elif re.search(rankPattern, head) is not None:
                            rank_text += fragment
                            rank_text_location.append(placement)
                        # 2021/10/19: keep rows about past performance from being moved to the front.
                        elif re.search(entityPattern, head) is not None and \
                                re.search('业绩|资格|条件', head) is None and re.search('业绩', cell["text"]) is None:
                            entity_text += fragment
                            entity_text_location.append(placement)
                        else:
                            text_line += fragment
                            text_line_location.append(placement)
                        text_set.add(sentence_key)

                    # Offsets of key/value inside the sentence.
                    stamp_offsets(len(text),
                                  pack_text_location + rank_text_location + entity_text_location
                                  + text_line_location)

                    text += pack_text + rank_text + entity_text + text_line
                    text = text[:-1] + "。" if len(text) > 0 else text
            all_table_occurence += table_occurence
        return text, all_table_occurence
			
 
				+
			
 
    def process_dict(text, table):
        '''
        Turn the annotated cell dicts into a flat list of key/value records.

        @param:
            text: the sentence string the cells were written into
            table: list of rows of cell dicts carrying 'text', 'type',
                'text_row_index', 'text_col_index' and, for cells placed in the
                sentence, 'text_sen_index' / 'left_head_sen_index' /
                'top_head_sen_index' plus optional '<side>_head_list',
                '<side>_head_row_index', '<side>_head_col_index'
        @return:
            (kv_list, kv_dict_list); kv_list is always an empty list and is kept
            only so the return shape stays the same. kv_dict_list holds one dict
            per header/value pair (left headers first, then top headers), or a
            single value-only dict for a cell without any header.

        Side effect: the three '<side>_head_*' entries of a cell are replaced by
        tuples sorted by (row index, col index).
        '''
        kv_list = []
        kv_dict_list = []
        for row in table:
            for col in row:
                if col['type'] == 1:
                    continue
                if col.get('drop'):
                    continue
                value_start = col.get('text_sen_index')
                if value_start is None:
                    # The cell was never written into the sentence (e.g. a
                    # type-2 header cell), so it has no offsets to report.
                    # Previously this raised KeyError.
                    continue

                value = col['text']
                sen_value = text[value_start:value_start + len(value)]

                if not col.get('left_head_list') and not col.get('top_head_list'):
                    kv_dict_list.append({
                        'value': value,
                        'value_row_index': col['text_row_index'],
                        'value_col_index': col['text_col_index'],
                        'value_sen_index': value_start,
                        'sen_value': sen_value,
                    })
                    continue
                if value_start and value_start >= len(text):
                    continue

                for side in ('left', 'top'):
                    list_key = side + '_head_list'
                    row_key = side + '_head_row_index'
                    col_key = side + '_head_col_index'
                    if not col.get(list_key):
                        continue
                    # Order head, head_row_index, head_col_index by position.
                    ordered = sorted(zip(col.get(list_key), col.get(row_key), col.get(col_key)),
                                     key=lambda item: (item[1], item[2]))
                    col[list_key], col[row_key], col[col_key] = zip(*ordered)

                    # Each header starts where the previous one ended.
                    key_start = col[side + '_head_sen_index']
                    for h_index, head in enumerate(col[list_key]):
                        kv_dict_list.append({
                            'key': head,
                            'value': value,
                            'key_row_index': col[row_key][h_index],
                            'key_col_index': col[col_key][h_index],
                            'key_sen_index': key_start,
                            'value_row_index': col['text_row_index'],
                            'value_col_index': col['text_col_index'],
                            'value_sen_index': value_start,
                            'sen_key': text[key_start:key_start + len(head)],
                            'sen_value': sen_value,
                        })
                        key_start += len(head)
        return kv_list, kv_dict_list
			
 
				+
			
 
				     def removeFix(inner_table,fix_value="~~"):
			
 
				         height = len(inner_table)
			
 
				         width = len(inner_table[0])
			
@@ -1319,14 +1671,22 @@ def tableToText(soup, docid=None):
 
				                     table_max_len = 30000
			
 
				                     tbody.string = tbody.string[:table_max_len]
			
 
				                     tbody.name = "turntable"
			
 
				+                    if return_kv:
			
 
				+                        return None, None, None
			
 
				                     return None
			
 
				         # fixSpan(tbody)
			
 
				         # inner_table = getTable(tbody)
			
 
				         # inner_table = fixTable(inner_table)
			
 
				 
			
 
				         table2list = TableTag2List()
			
 
				-        inner_table = table2list.table2list(tbody, segment)
			
 
				-        inner_table = fixTable(inner_table)
			
 
				+        return_html_table = True if return_kv else False
			
 
				+        if return_html_table:
			
 
				+            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table)
			
 
				+            inner_table = fixTable(inner_table)
			
 
				+            html_table = fixTable(html_table, "")
			
 
				+        else:
			
 
				+            inner_table = table2list.table2list(tbody, segment)
			
 
				+            inner_table = fixTable(inner_table)
			
 
				 
			
 
				         if inner_table == []:
			
 
				             string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
			
@@ -1335,6 +1695,8 @@ def tableToText(soup, docid=None):
 
				             tbody.string = tbody.string[:table_max_len]
			
 
				             # log('异常表格直接取全文')
			
 
				             tbody.name = "turntable"
			
 
				+            if return_kv:
			
 
				+                return None, None, None
			
 
				             return None
			
 
				 
			
 
				         if len(inner_table)>0 and len(inner_table[0])>0:
			
@@ -1347,6 +1709,8 @@ def tableToText(soup, docid=None):
 
				                         tbody.string = tbody.string[:table_max_len]
			
 
				                         # log('异常表格，不做表格处理，直接取全文')
			
 
				                         tbody.name = "turntable"
			
 
				+                        if return_kv:
			
 
				+                            return None, None, None
			
 
				                         return None
			
 
				 
			
 
				             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
			
@@ -1368,12 +1732,28 @@ def tableToText(soup, docid=None):
 
				             # for item in inner_table:
			
 
				             #     print(item)
			
 
				 
			
 
				-            tbody.string = getTableText(inner_table,head_list)
			
 
				+            # print('inner_table111', inner_table)
			
 
				+
			
 
				+            if return_kv:
			
 
				+                text, table = get_table_text_kv(inner_table, head_list)
			
 
				+                kv_list, kv_dict_list = process_dict(text, table)
			
 
				+                tbody.string = text
			
 
				+                # html放入dict
			
 
				+                for kv_dict in kv_dict_list:
			
 
				+                    html = html_table[kv_dict.get('value_row_index')][kv_dict.get('value_col_index')]
			
 
				+                    kv_dict['value_html'] = html
			
 
				+            else:
			
 
				+                tbody.string = getTableText(inner_table,head_list)
			
 
				             table_max_len = 30000
			
 
				             tbody.string = tbody.string[:table_max_len]
			
 
				             # print(tbody.string)
			
 
				             tbody.name = "turntable"
			
 
				-            return inner_table
			
 
				+            if return_kv:
			
 
				+                return inner_table, kv_dict_list, text
			
 
				+            else:
			
 
				+                return inner_table
			
 
				+        if return_kv:
			
 
				+            return None, None, None
			
 
				         return None
			
 
				 
			
 
				 
			
@@ -1404,6 +1784,10 @@ def tableToText(soup, docid=None):
 
				         elif _part.name=='div':
			
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				+
			
 
				+    if return_kv and tbodies:
			
 
				+        tbodies = tbodies[:1]
			
 
				+
			
 
				     #逆序处理嵌套表格
			
 
				     # print('len(tbodies)1', len(tbodies))
			
 
				     # for tbody_index in range(1,len(tbodies)+1):
			
@@ -1436,6 +1820,10 @@ def tableToText(soup, docid=None):
 
				         elif _part.name == 'div':
			
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				+
			
 
				+    if return_kv and tbodies:
			
 
				+        tbodies = tbodies[:1]
			
 
				+
			
 
				     #逆序处理嵌套表格
			
 
				     tbody_index = 1
			
 
				     # for tbody_index in range(1,len(tbodies)+1):
			
@@ -1457,6 +1845,10 @@ def tableToText(soup, docid=None):
 
				         list_innerTable.append(inner_table)
			
 
				         tbody_index += 1
			
 
				 
			
 
				+    if return_kv:
			
 
				+        kv_list = [x[1] for x in list_innerTable]
			
 
				+        list_innerTable = [x[0] for x in list_innerTable]
			
 
				+        return soup, kv_list
			
 
				     return soup
			
 
				     # return list_innerTable
			
 
				 
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -6583,10 +6583,11 @@ class DistrictPredictor():
 
				 
			
 
				 class TableTag2List():
			
 
				     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
			
 
				-    def table2list(self, table, text_process=None):
			
 
				+    def table2list(self, table, text_process=None, return_html_table=False):
			
 
				         self._output = []
			
 
				         row_ind = 0
			
 
				         col_ind = 0
			
 
				+        html_table = []
			
 
				         for row in table.find_all('tr'):
			
 
				             # record the smallest row_span, so that we know how many rows
			
 
				             # we should skip
			
@@ -6633,7 +6634,10 @@ class TableTag2List():
 
				                             text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "（").replace(')', '）').replace('?', '')
			
 
				                             # text = re.sub('\s', '', text)[:200] # 只需取前200字即可
			
 
				                             text = ' ' if text == "" else text
			
 
				+
			
 
				                         self._insert(row_ind, col_ind, row_span, col_span, text)
			
 
				+                        if return_html_table:
			
 
				+                            html_table = self._insert_new(row_ind, col_ind, row_span, col_span, str(cell), html_table)
			
 
				                     except UnicodeEncodeError:
			
 
				                         raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )
			
 
				 
			
@@ -6645,7 +6649,20 @@ class TableTag2List():
 
				             # update row_ind
			
 
				             row_ind += smallest_row_span
			
 
				             col_ind = 0
			
 
				-        return self._output
			
 
				+        if return_html_table:
			
 
				+            temp_list = []
			
 
				+            for row in self._output:
			
 
				+                if len(row) > 0:
			
 
				+                    temp_list.append(row)
			
 
				+            self._output =  temp_list
			
 
				+            temp_list = []
			
 
				+            for row in html_table:
			
 
				+                if len(row) > 0:
			
 
				+                    temp_list.append(row)
			
 
				+            html_table = temp_list
			
 
				+            return self._output, html_table
			
 
				+        else:
			
 
				+            return self._output
			
 
				 
			
 
				     def _check_validity(self, i, j, height, width):
			
 
				         """
			
@@ -6680,6 +6697,24 @@ class TableTag2List():
 
				         if self._output[i][j] == "":
			
 
				             self._output[i][j] = val
			
 
				 
			
 
				+    def _insert_new(self, i, j, height, width, val, cell_list):
			
 
				+        # pdb.set_trace()
			
 
				+        for ii in range(i, i+height):
			
 
				+            for jj in range(j, j+width):
			
 
				+                cell_list = self._insert_cell_new(ii, jj, val, cell_list)
			
 
				+        return cell_list
			
 
				+
			
 
				+    def _insert_cell_new(self, i, j, val, cell_list):
			
 
				+        while i >= len(cell_list):
			
 
				+            cell_list.append([])
			
 
				+        while j >= len(cell_list[i]):
			
 
				+            cell_list[i].append("")
			
 
				+
			
 
				+        if cell_list[i][j] == "":
			
 
				+            cell_list[i][j] = val
			
 
				+        return cell_list
			
 
				+
			
 
				+
			
 
				 def is_head_line(list_item):
			
 
				     '''
			
 
				     调用表头识别模型判断是否为表头行