5 mēneši atpakaļ · e2161934a6
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -1033,6 +1033,8 @@ def find_package(content):
 
				         elif re.search('单位：包|1包\d|[张箱]', content[max(0, iter.start()-3): iter.end()+2]): # 处理 463166661 包号错误 钢丝，单位：包X10根。
			
 
				             # print('过滤掉错误包号，单位：包|1包', iter.group(0))
			
 
				             continue
			
 
				+        elif iter.group(0) == '劳务分包': # 20241203 修复562534840劳务分包作包号
			
 
				+            continue
			
 
				         packages.append(iter)
			
 
				         # print('提取到标段：%s， 前后文：%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
			
 
				     return packages
			
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -528,6 +528,8 @@ def match_enterprise_max_first(sentence):
 
				         while True:
			
 
				             if begin_index+ENTERPRISE_KEY_LEN<len(sentence):
			
 
				                 key_enter = sentence[begin_index:begin_index+ENTERPRISE_KEY_LEN]
			
 
				+                if key_enter.find('，') > 0:
			
 
				+                    key_enter = sentence[begin_index:begin_index + ENTERPRISE_KEY_LEN+1].replace('，', '') # 20241212 修复实体名称被分割问题 例：北，京千里马网信科技有限公司
			
 
				 
			
 
				                 # if key_enter in DICT_ENTERPRISE:
			
 
				                 #     _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
			
@@ -543,24 +545,27 @@ def match_enterprise_max_first(sentence):
 
				                     _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
			
 
				                     for _i in range(_len):
			
 
				                         enter_name = sentence[begin_index:begin_index+_len-_i]
			
 
				-                        enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
			
 
				+                        if enter_name.endswith('，'):
			
 
				+                            continue
			
 
				+                        fix_name = enter_name.replace('，', '') # 20241212 修复实体名称被分割问题 例：北，京千里马网信科技有限公司
			
 
				+                        enter_tail = fix_name[-ENTERPRISE_TAIL_LEN:]
			
 
				                         if re.search('[\u4e00-\u9fa5]', enter_tail) == None: # 20240111不包含中文后缀不要
			
 
				                             continue
			
 
				-                        elif enter_name in ['黄埔军校',  '五金建材', '铝合金门窗', '测试单位' ,'生产管理部', '华电XXX发电有限公司']: # '国有资产管理处',
			
 
				+                        elif fix_name in ['黄埔军校',  '五金建材', '铝合金门窗', '测试单位' ,'生产管理部', '华电XXX发电有限公司']: # '国有资产管理处',
			
 
				                             continue
			
 
				-                        elif re.search('^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$|交汇处$|大厦$|大楼$|^华电X{1,4}发电有限公司$', enter_name):
			
 
				+                        elif re.search('^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$|交汇处$|大厦$|大楼$|^华电X{1,4}发电有限公司$', fix_name):
			
 
				                             continue
			
 
				-                        if len(enter_name)<4: # 20240521 短于4个字的不要
			
 
				+                        if len(fix_name)<4: # 20240521 短于4个字的不要
			
 
				                             break
			
 
				                         if enter_tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|监狱|大队|支队|林场|海关|分局|商行)$', enter_tail):
			
 
				-                            if enter_name not in business_dic:
			
 
				-                                have_bus, dic = get_business_data(enter_name) # 20210124 改为有工商数据的实体才添加
			
 
				-                                business_dic[enter_name] = (have_bus, dic)
			
 
				+                            if fix_name not in business_dic:
			
 
				+                                have_bus, dic = get_business_data(fix_name) # 20210124 改为有工商数据的实体才添加
			
 
				+                                business_dic[fix_name] = (have_bus, dic)
			
 
				                             else:
			
 
				-                                have_bus, dic = business_dic.get(enter_name) # 20240708 字典保存查询过的工商数据，避免重复查询redis
			
 
				+                                have_bus, dic = business_dic.get(fix_name) # 20240708 字典保存查询过的工商数据，避免重复查询redis
			
 
				                             if have_bus:
			
 
				                             # if is_enterprise_exist(enter_name):
			
 
				-                                match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
			
 
				+                                match_item = {"entity_text":"%s"%(fix_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
			
 
				                                 # print("match_item",key_enter,enter_name)
			
 
				                                 list_match.append(match_item)
			
 
				                                 begin_index += len(enter_name)-1
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
 
				 
			
 
				 
			
 
				 #
			
 
				-def tableToText(soup, docid=None):
			
 
				+def tableToText(soup, docid=None, return_kv=False):
			
 
				     '''
			
 
				     @param:
			
 
				         soup:网页html的soup
			
@@ -1289,7 +1289,359 @@ def tableToText(soup, docid=None):
 
				                 #         text += rank_text+entity_text+text_line
			
 
				                 #         text = text[:-1]+"。" if len(text)>0 else text
			
 
				         return text
			
 
				-    
			
 
				+
			
 
				def get_table_text_kv(inner_table, head_list, key_direct=False):
				    """Serialise a table into sentence text and record where each cell lands.

				    The table is processed one head segment at a time
				    (``head_list[k]`` .. ``head_list[k + 1]``). For every value cell the
				    heads above it and to its left are collected, the cell is written to
				    the output text as ``head + value``, and the character offsets of the
				    heads and of the value inside that text are stored on the cell.

				    Args:
				        inner_table: Rows of cells; ``cell[0]`` is the cell text and
				            ``cell[1]`` its type. Type 0 is treated as a value, types 1 and
				            2 as heads.
				        head_list: Row boundaries of the head segments.
				        key_direct: When false, types 1 and 2 both count as heads in either
				            direction. When true, only type 1 counts above a value and only
				            type 2 counts to its left.

				    Returns:
				        ``(text, all_table_occurence)``: the generated text and the list of
				        annotated rows (one dict per cell) of every segment, in order.
				    """
				    packPattern = "(标包|标的|标项|品目|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 large-site rule: extra package names for procurement
				    rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"  # 2020/11/23 large-site rule: serial number counts as ranking
				    entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
				    moneyPattern = "([中投]标|报价)(金额|价)"
				    # Heads matching this pattern are written after the head of the other
				    # direction instead of before it.
				    swapPattern = ("[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|"
				                   "服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))")

				    def attach_heads(cell, candidates, side, head_types):
				        """Walk away from ``cell`` and record the run of heads on one side.

				        ``candidates`` yields ``(row, col, head_cell)`` from the nearest cell
				        outwards. Consecutive heads with identical text are recorded once.
				        The walk stops at the first non-head cell after a head was found.
				        """
				        joined = ""
				        positions = []
				        found = False
				        previous = ""
				        for row, col, candidate in candidates:
				            if candidate["type"] in head_types:
				                if not found or candidate["text"] != previous:
				                    label = candidate["text"] + "："
				                    cell.setdefault(side + "_head_list", []).append(label)
				                    joined = label + joined
				                    positions.append((row, col))
				                found = True
				                previous = candidate["text"]
				                candidate["occu_count"] += 1
				            elif found:
				                break
				        cell[side + "_head"] += joined
				        cell.setdefault(side + "_head_row_index", []).extend(p[0] for p in positions)
				        cell.setdefault(side + "_head_col_index", []).extend(p[1] for p in positions)

				    def record_offsets(occurrence, placements, base):
				        """Store on each placed cell where its heads and value start in the text.

				        ``placements`` must be in the same order in which the fragments are
				        concatenated; ``base`` is the text length before this line is added.
				        """
				        cursor = 0
				        for row, col, fragment, left_part, top_part, left_first in placements:
				            target = occurrence[row][col]
				            if left_first:
				                target['left_head_sen_index'] = base + cursor
				                target['top_head_sen_index'] = base + cursor + len(left_part)
				            else:
				                target['left_head_sen_index'] = base + cursor + len(top_part)
				                target['top_head_sen_index'] = base + cursor
				            target['text_sen_index'] = base + cursor + len(left_part) + len(top_part)
				            cursor += len(fragment)

				    width = len(inner_table[0])
				    text = ""
				    top_types = (1,) if key_direct else (1, 2)
				    left_types = (2,) if key_direct else (1, 2)

				    all_table_occurence = []
				    for head_i in range(len(head_list) - 1):
				        head_begin = head_list[head_i]
				        head_end = head_list[head_i + 1]
				        direct = getDirect(inner_table, head_begin, head_end)

				        # Build the co-occurrence matrix for this segment.
				        table_occurence = [
				            [
				                {"text": inner_table[i][j][0], "type": inner_table[i][j][1], "occu_count": 0,
				                 "left_head": "", "top_head": "",
				                 "left_dis": 0, "top_dis": 0,
				                 "text_row_index": i, "text_col_index": j}
				                for j in range(width)
				            ]
				            for i in range(head_begin, head_end)
				        ]
				        occu_height = len(table_occurence)
				        occu_width = len(table_occurence[0]) if occu_height > 0 else 0

				        # Find the heads of every non-empty value cell.
				        for i in range(occu_height):
				            for j in range(occu_width):
				                cell = table_occurence[i][j]
				                if cell["type"] != 0 or cell["text"] == "":
				                    continue
				                # Head rows are stored as whole-table rows (head_begin + offset)
				                # so they use the same coordinates as text_row_index; the
				                # original stored segment-relative rows here.
				                attach_heads(
				                    cell,
				                    ((head_begin + i - k, j, table_occurence[i - k][j]) for k in range(1, i + 1)),
				                    "top", top_types)
				                attach_heads(
				                    cell,
				                    ((head_begin + i, j - k, table_occurence[i][j - k]) for k in range(1, j + 1)),
				                    "left", left_types)

				        # Join heads and values into text.
				        if direct == "row":
				            for i in range(occu_height):
				                pack_text = rank_text = entity_text = money_text = text_line = ""
				                pack_loc, rank_loc, entity_loc, money_loc, line_loc = [], [], [], [], []
				                # A fragment repeated inside the same sentence is emitted once.
				                text_set = set()
				                head = ""
				                last_text = ""
				                for j in range(width):
				                    cell = table_occurence[i][j]
				                    if not (cell["type"] == 0 or (cell["type"] == 1 and cell["occu_count"] == 0)):
				                        continue
				                    top_part = (cell["top_head"] + "：") if len(cell["top_head"]) > 0 else ""
				                    left_part = cell["left_head"]
				                    if re.search(swapPattern, top_part):
				                        head = left_part + top_part
				                        left_first = 1
				                    else:
				                        head = top_part + left_part
				                        left_first = 0

				                    phrase = head + cell["text"]
				                    if phrase in text_set:
				                        cell['drop'] = 1
				                        continue
				                    if re.search(packPattern, head) is not None:
				                        pack_text += phrase + "，"
				                        pack_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    # 2020/11/23 large-site rule: if -> elif; 20240620 a line that
				                    # already has a ranking takes no second rank fragment.
				                    elif re.search(rankPattern, head) is not None and \
				                            re.search(r'(排名|排序|名次|顺序)：?第?[\d一二三]', rank_text) is None:
				                        rank_text += phrase + "，"
				                        rank_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    elif re.search(entityPattern, head) is not None:
				                        entity_text += phrase + "，"
				                        entity_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    elif re.search(moneyPattern, head) is not None and entity_text != "":
				                        # Money fragments end with an ASCII comma, as before.
				                        money_text += phrase + ","
				                        money_loc.append([i, j, phrase + ",", left_part, top_part, left_first])
				                    else:
				                        text_line += phrase + "，"
				                        line_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    text_set.add(phrase)
				                    last_text = cell['text']

				                # Offsets must follow the concatenation order below: money comes
				                # BEFORE the free text. The original listed it last, which shifted
				                # every money/text_line offset whenever a money fragment existed.
				                record_offsets(table_occurence,
				                               pack_loc + rank_loc + entity_loc + money_loc + line_loc,
				                               len(text))

				                tr_text = pack_text + rank_text + entity_text + money_text + text_line
				                text += tr_text

				                # 367694716: a statement split over two rows.
				                lone_value = len(text_set - {' '}) == 1 and head == '' and len(last_text) < 25
				                # 494731937: two-column rows were split into sentences badly.
				                short_pair = (width == 2 or len(text_set) == 1) and head != '' and len(tr_text) < 50
				                if lone_value or short_pair:
				                    if not re.search(r'\w$', text[:-1]):
				                        text = text[:-1]
				                else:
				                    text = text[:-1] + "。"
				        else:
				            for j in range(occu_width):
				                pack_text = rank_text = entity_text = text_line = ""
				                pack_loc, rank_loc, entity_loc, line_loc = [], [], [], []
				                text_set = set()
				                for i in range(occu_height):
				                    cell = table_occurence[i][j]
				                    if not (cell["type"] == 0 or (cell["type"] == 1 and cell["occu_count"] == 0)):
				                        continue
				                    left_part = cell["left_head"]
				                    top_part = cell["top_head"]
				                    if re.search(swapPattern, left_part):
				                        head = top_part + left_part
				                        left_first = 0
				                    else:
				                        head = left_part + top_part
				                        left_first = 1

				                    phrase = head + cell["text"]
				                    if phrase in text_set:
				                        cell['drop'] = 1
				                        continue
				                    if re.search(packPattern, head) is not None:
				                        pack_text += phrase + "，"
				                        pack_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    # 2020/11/23 large-site rule: if -> elif
				                    elif re.search(rankPattern, head) is not None:
				                        rank_text += phrase + "，"
				                        rank_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    # 2021/10/19 keep rows about past performance out of the entity part.
				                    elif re.search(entityPattern, head) is not None and \
				                            re.search('业绩|资格|条件', head) is None and re.search('业绩', cell["text"]) is None:
				                        entity_text += phrase + "，"
				                        entity_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    else:
				                        text_line += phrase + "，"
				                        line_loc.append([i, j, phrase + "，", left_part, top_part, left_first])
				                    text_set.add(phrase)

				                record_offsets(table_occurence,
				                               pack_loc + rank_loc + entity_loc + line_loc,
				                               len(text))

				                text += pack_text + rank_text + entity_text + text_line
				                text = text[:-1] + "。" if len(text) > 0 else text
				        all_table_occurence += table_occurence
				    return text, all_table_occurence
			
 
				+
			
 
				def process_dict(text, table):
				    """Turn the annotated cells of a table into key/value records.

				    Args:
				        text: The sentence text the table was serialised into; every
				            ``*_sen_index`` stored on a cell is an offset into this string.
				        table: Rows of cell dicts carrying ``text``, ``type``,
				            ``text_row_index`` and ``text_col_index``. Cells that were written
				            into ``text`` also carry ``text_sen_index`` and may carry the
				            ``left_head_*`` / ``top_head_*`` annotations.

				    Returns:
				        ``(kv_list, kv_dict_list)``. ``kv_list`` is always empty and is kept
				        only so the return shape does not change. ``kv_dict_list`` holds one
				        dict per (head, value) pair, or one value-only dict for a cell
				        without heads.
				    """
				    kv_list = []
				    kv_dict_list = []

				    def emit_pairs(cell, side):
				        """Append one record per head on ``side`` ('left' or 'top') of ``cell``."""
				        heads = cell.get(side + '_head_list')
				        if not heads:
				            return
				        # Order head text, row and column together by position so the heads
				        # come out in the order in which they appear in the text.
				        ordered = sorted(
				            zip(heads, cell.get(side + '_head_row_index'), cell.get(side + '_head_col_index')),
				            key=lambda item: (item[1], item[2]))
				        (cell[side + '_head_list'],
				         cell[side + '_head_row_index'],
				         cell[side + '_head_col_index']) = zip(*ordered)

				        value_start = cell['text_sen_index']
				        key_start = cell[side + '_head_sen_index']
				        for head, head_row, head_col in ordered:
				            kv_dict_list.append({
				                'key': head,
				                'value': cell['text'],
				                'key_row_index': head_row,
				                'key_col_index': head_col,
				                'key_sen_index': key_start,
				                'value_row_index': cell['text_row_index'],
				                'value_col_index': cell['text_col_index'],
				                'value_sen_index': value_start,
				                'sen_key': text[key_start:key_start + len(head)],
				                'sen_value': text[value_start:value_start + len(cell['text'])],
				            })
				            # The next head starts right after this one in the text.
				            key_start += len(head)

				    for row in table:
				        for cell in row:
				            if cell['type'] == 1 or cell.get('drop'):
				                continue
				            value_start = cell.get('text_sen_index')
				            if value_start is None:
				                # The cell was never written into the text (for example a
				                # type-2 head cell), so there is no offset to report. The
				                # original indexed the key directly and raised KeyError.
				                continue
				            if not cell.get('left_head_list') and not cell.get('top_head_list'):
				                kv_dict_list.append({
				                    'value': cell['text'],
				                    'value_row_index': cell['text_row_index'],
				                    'value_col_index': cell['text_col_index'],
				                    'value_sen_index': value_start,
				                    'sen_value': text[value_start:value_start + len(cell['text'])],
				                })
				                continue
				            if value_start and value_start >= len(text):
				                # The value lies beyond the end of the text.
				                continue
				            emit_pairs(cell, 'left')
				            emit_pairs(cell, 'top')
				    return kv_list, kv_dict_list
			
 
				+
			
 
				     def removeFix(inner_table,fix_value="~~"):
			
 
				         height = len(inner_table)
			
 
				         width = len(inner_table[0])
			
@@ -1319,14 +1671,22 @@ def tableToText(soup, docid=None):
 
				                     table_max_len = 30000
			
 
				                     tbody.string = tbody.string[:table_max_len]
			
 
				                     tbody.name = "turntable"
			
 
				+                    if return_kv:
			
 
				+                        return None, None, None
			
 
				                     return None
			
 
				         # fixSpan(tbody)
			
 
				         # inner_table = getTable(tbody)
			
 
				         # inner_table = fixTable(inner_table)
			
 
				 
			
 
				         table2list = TableTag2List()
			
 
				-        inner_table = table2list.table2list(tbody, segment)
			
 
				-        inner_table = fixTable(inner_table)
			
 
				+        return_html_table = True if return_kv else False
			
 
				+        if return_html_table:
			
 
				+            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table)
			
 
				+            inner_table = fixTable(inner_table)
			
 
				+            html_table = fixTable(html_table, "")
			
 
				+        else:
			
 
				+            inner_table = table2list.table2list(tbody, segment)
			
 
				+            inner_table = fixTable(inner_table)
			
 
				 
			
 
				         if inner_table == []:
			
 
				             string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
			
@@ -1335,6 +1695,8 @@ def tableToText(soup, docid=None):
 
				             tbody.string = tbody.string[:table_max_len]
			
 
				             # log('异常表格直接取全文')
			
 
				             tbody.name = "turntable"
			
 
				+            if return_kv:
			
 
				+                return None, None, None
			
 
				             return None
			
 
				 
			
 
				         if len(inner_table)>0 and len(inner_table[0])>0:
			
@@ -1347,6 +1709,8 @@ def tableToText(soup, docid=None):
 
				                         tbody.string = tbody.string[:table_max_len]
			
 
				                         # log('异常表格，不做表格处理，直接取全文')
			
 
				                         tbody.name = "turntable"
			
 
				+                        if return_kv:
			
 
				+                            return None, None, None
			
 
				                         return None
			
 
				 
			
 
				             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
			
@@ -1368,12 +1732,28 @@ def tableToText(soup, docid=None):
 
				             # for item in inner_table:
			
 
				             #     print(item)
			
 
				 
			
 
				-            tbody.string = getTableText(inner_table,head_list)
			
 
				+            # print('inner_table111', inner_table)
			
 
				+
			
 
				+            if return_kv:
			
 
				+                text, table = get_table_text_kv(inner_table, head_list)
			
 
				+                kv_list, kv_dict_list = process_dict(text, table)
			
 
				+                tbody.string = text
			
 
				+                # html放入dict
			
 
				+                for kv_dict in kv_dict_list:
			
 
				+                    html = html_table[kv_dict.get('value_row_index')][kv_dict.get('value_col_index')]
			
 
				+                    kv_dict['value_html'] = html
			
 
				+            else:
			
 
				+                tbody.string = getTableText(inner_table,head_list)
			
 
				             table_max_len = 30000
			
 
				             tbody.string = tbody.string[:table_max_len]
			
 
				             # print(tbody.string)
			
 
				             tbody.name = "turntable"
			
 
				-            return inner_table
			
 
				+            if return_kv:
			
 
				+                return inner_table, kv_dict_list, text
			
 
				+            else:
			
 
				+                return inner_table
			
 
				+        if return_kv:
			
 
				+            return None, None, None
			
 
				         return None
			
 
				 
			
 
				 
			
@@ -1404,6 +1784,10 @@ def tableToText(soup, docid=None):
 
				         elif _part.name=='div':
			
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				+
			
 
				+    if return_kv and tbodies:
			
 
				+        tbodies = tbodies[:1]
			
 
				+
			
 
				     #逆序处理嵌套表格
			
 
				     # print('len(tbodies)1', len(tbodies))
			
 
				     # for tbody_index in range(1,len(tbodies)+1):
			
@@ -1436,6 +1820,10 @@ def tableToText(soup, docid=None):
 
				         elif _part.name == 'div':
			
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				+
			
 
				+    if return_kv and tbodies:
			
 
				+        tbodies = tbodies[:1]
			
 
				+
			
 
				     #逆序处理嵌套表格
			
 
				     tbody_index = 1
			
 
				     # for tbody_index in range(1,len(tbodies)+1):
			
@@ -1457,6 +1845,10 @@ def tableToText(soup, docid=None):
 
				         list_innerTable.append(inner_table)
			
 
				         tbody_index += 1
			
 
				 
			
 
				+    if return_kv:
			
 
				+        kv_list = [x[1] for x in list_innerTable]
			
 
				+        list_innerTable = [x[0] for x in list_innerTable]
			
 
				+        return soup, kv_list
			
 
				     return soup
			
 
				     # return list_innerTable
			
 
				 
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -275,8 +275,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
			
 
				         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
			
 
				         requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
			
 
				-    if addr_bidopen_text == '':
			
 
				-        addr_bidopen_text = extract_addr(list_articles[0].content)
			
 
				+    # if addr_bidopen_text == '':
			
 
				+    #     addr_bidopen_text = extract_addr(list_articles[0].content)
			
 
				+    addr_dic, time_dic, code_investment = predictor.getPredictor('entity_type_rule').predict(list_entitys, list_sentences, list_articles)
			
 
				+    if addr_bidopen_text != '' and 'addr_bidopen' not in addr_dic:
			
 
				+        addr_dic['addr_bidopen'] = addr_bidopen_text
			
 
				+    if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
			
 
				+        addr_dic['addr_bidsend'] = addr_bidsend_text
			
 
				 
			
 
				     # 过滤掉Redis里值为0的错误实体
			
 
				     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
			
@@ -464,9 +469,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     '''打标签'''
			
 
				     label_dic = get_all_label(title, list_articles[0].content, prem[0]['prem'])
			
 
				 
			
 
				+    '''评标评分提取'''
			
 
				+    bid_score = predictor.getPredictor('bid_score').predict(text, nlp_enterprise+nlp_enterprise_attachment)
			
 
				+
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-12-02'}
			
 
				+    version_date = {'version_date': '2024-12-12'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
@@ -520,10 +528,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     data_res['requirement'] = requirement_text[:1500]
			
 
				     # 打标签
			
 
				     data_res['label_dic'] = label_dic
			
 
				-    # 开标地点
			
 
				-    data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
			
 
				-    # 投标地址
			
 
				-    data_res['addr_dic']['addr_bidsend'] = addr_bidsend_text
			
 
				+    # 开标、投标、项目、收货等地址
			
 
				+    data_res['addr_dic'] = addr_dic
			
 
				     # 字数
			
 
				     text_main, text_attn = 0, 0
			
 
				     for sentence in list_sentences[0]:
			
@@ -537,8 +543,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
			
 
				     # 是否为存款项目
			
 
				     data_res['is_deposit_project'] = deposit_project
			
 
				-    data_res['pinmu_name'] = pinmu_name
			
 
				-    data_res['policies'] = list_policy
			
 
				+    data_res['pinmu_name'] = pinmu_name # 品目名称
			
 
				+    data_res['policies'] = list_policy # 政策法规
			
 
				+    data_res['bid_score'] = bid_score # 评标得分
			
 
				+    data_res['time_planned'] = time_dic.get('time_planned', '') # 预计招标时间
			
 
				+    data_res['code_investment'] = code_investment # 投资项目编号
			
 
				 
			
 
				     # for _article in list_articles:
			
 
				     #         log(_article.content)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -2769,7 +2769,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				         :param max_sent_dist: 最大句子距离
			
 
				         :return:
			
 
				         '''
			
 
				-        l.sort(key=lambda x: [x[2],x[3]])
			
 
				+        l.sort(key=lambda x: [x[2],x[3],x[4]]) # 20241204 多个字段排序 修复 561998414 第一标段西铭矿清水泵采购 标段和包名开始位置一样的情况
			
 
				         link_dic = {}
			
 
				         i = 1
			
 
				         while i < len(l):
			
@@ -2780,7 +2780,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                     if ent1 not in link_dic:
			
 
				                         link_dic[ent1] = []
			
 
				                     if s1 == s2:
			
 
				-                        dist = abs(b2 - e1)
			
 
				+                        dist = abs(b2 - e1) if b2 > e1 else 0
			
 
				                     else:
			
 
				                         dist = len(list_sentence[s1].sentence_text) - e1
			
 
				                         for id in range(s1+1, s2):
			
@@ -2793,7 +2793,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                     if ent2 not in link_dic:
			
 
				                         link_dic[ent2] = []
			
 
				                     if s1 == s2:
			
 
				-                        dist = abs(b2 - e1)
			
 
				+                        dist = abs(b2 - e1) if b2 > e1 else 0
			
 
				                     else:
			
 
				                         dist = len(list_sentence[s1].sentence_text) - e1
			
 
				                         for id in range(s1+1, s2):
			
@@ -2801,7 +2801,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                         dist += b2
			
 
				                     if in_att1:
			
 
				                         dist += 100  # 附件的距离加100
			
 
				-                    dist += 30 # 包号在实体后面距离再加30
			
 
				+                    if s1!=s2 or e1!=e2:
			
 
				+                        dist += 30 # 包号在实体后面距离再加30
			
 
				                     link_dic[ent2].append((s2 - s1, dist, ent1))
			
 
				             i += 1
			
 
				         return link_dic
			
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -181,16 +181,14 @@ def extract_parameters(parse_document):
 
				                 if it not in list_policy:
			
 
				                     list_policy.append(it.group(0))
			
 
				 
			
 
				-    if re.search('时间：', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidopen_text):
			
 
				-        for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidopen_text):
			
 
				-            b, e = ser.span()
			
 
				-        addr_bidopen_text = addr_bidopen_text[b:e]
			
 
				-    elif re.search('开启', addr_bidopen_text) and re.search('时间：\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
			
 
				+    ser = re.search('地[址点][：为](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidopen_text) or re.search('[：，](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidopen_text)
			
 
				+    if ser:
			
 
				+        addr_bidopen_text = ser.group('addr')
			
 
				+    ser = re.search('地[址点][：为](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidsend_text) or re.search('[：，](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidsend_text)
			
 
				+    if ser:
			
 
				+        addr_bidsend_text = ser.group('addr')
			
 
				+    if re.search('开启', addr_bidopen_text) and re.search('时间：\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
			
 
				         addr_bidopen_text = ""
			
 
				-    if re.search('时间：', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidsend_text):
			
 
				-        for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidsend_text):
			
 
				-            b, e = ser.span()
			
 
				-        addr_bidsend_text = addr_bidsend_text[b:e]
			
 
				     ser = re.search(pinmu_name_pattern, pinmu_name)
			
 
				     if ser:
			
 
				         pinmu_name = pinmu_name[ser.end():]
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -45,12 +45,44 @@ file = os.path.dirname(__file__) + '/agency_set.pkl'
 
				 with open(file, 'rb') as f:
			
 
				     agency_set = pickle.load(f)
			
 
				 
			
 
				+with open(os.path.dirname(__file__) + '/header_set.pkl', 'rb') as f:
			
 
				+    header_set = pickle.load(f)
			
 
				+
			
 
				 def is_agency(entity_text):
			
 
				     if re.search('(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[（）\w]{,4}(有限)?(责任)?公司|(采购|招投?标|交易|代理|咨询)[（）\w]{,4}(中心|服务所)|法院$',
			
 
				                  entity_text) or entity_text in agency_set:
			
 
				         return True
			
 
				     return False
			
 
				 
			
 
				+def get_role(text, nlp_enterprise):
			
 
				+    '''
			
 
				+    获取字符串text角色实体
			
 
				+    :param text: 待获取实体字符串
			
 
				+    :param nlp_enterprise: 公告中的角色实体列表
			
 
				+    :return:
			
 
				+    '''
			
 
				+    text = re.sub('联合体：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|[(（][主成][）)]'
			
 
				+                  , '，', text)
			
 
				+    text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
			
 
				+    text = re.sub('[一二三四五六七八九十]+标段[：:]|标段[一二三四五六七八九十]+[：:]|第[一二三四五六七八九十]+名[：:]', '', text) # 2024/4/22 修复 372839375 三标段：宁夏一山科技有限公司
			
 
				+    text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
			
 
				+    if text in nlp_enterprise:
			
 
				+        return text
			
 
				+    if len(text) > 50 or len(text)<4:
			
 
				+        return ''
			
 
				+    ners = getNers([text], useselffool=True)
			
 
				+    roles = []
			
 
				+    if ners:
			
 
				+        for ner in ners[0]:
			
 
				+            if ner[2] in ['org', 'company']:
			
 
				+                roles.append(ner[3])
			
 
				+            elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
			
 
				+                roles.append(ner[3])
			
 
				+    if roles and len(''.join(roles)) > len(text)*0.8:
			
 
				+        return roles[0]
			
 
				+    else:
			
 
				+        return ''
			
 
				+
			
 
				 from threading import RLock
			
 
				 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
			
 
				               "prem":{"predictor":None,"Lock":RLock()},
			
@@ -76,7 +108,9 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
 
				                   'project_label': {"predictor": None, "Lock": RLock()},
			
 
				                   'pb_extract': {"predictor": None, "Lock": RLock()},
			
 
				                   'property_label': {"predictor": None, "Lock": RLock()},
			
 
				-                  'approval': {"predictor": None, "Lock": RLock()}  # 审批项目预测
			
 
				+                  'approval': {"predictor": None, "Lock": RLock()}, # 审批项目预测
			
 
				+                  'bid_score': {"predictor": None, "Lock": RLock()},  # 评标评分
			
 
				+                  'entity_type_rule': {"predictor": None, "Lock": RLock()},  # 地址、时间分类
			
 
				                   }
			
 
				 
			
 
				 
			
@@ -134,6 +168,10 @@ def getPredictor(_type):
 
				                     dict_predictor[_type]['predictor'] = PropertyLabel()
			
 
				                 if _type == 'approval':
			
 
				                     dict_predictor[_type]['predictor'] = ApprovalPredictor()
			
 
				+                if _type == 'bid_score':
			
 
				+                    dict_predictor[_type]['predictor'] = BiddingScore()
			
 
				+                if _type == 'entity_type_rule':
			
 
				+                    dict_predictor[_type]['predictor'] = EntityTypeRulePredictor()
			
 
				             return dict_predictor[_type]["predictor"]
			
 
				     raise NameError("no this type of predictor")
			
 
				 
			
@@ -988,9 +1026,9 @@ class PREMPredict():
 
				                 # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释，单价单独存放
			
 
				                 #     label = 2
			
 
				             elif label ==0: # 错误招标金额处理
			
 
				-                if re.search('投资(金额|规模)：$', front): # 545988699 金额不大的投资金额作为备选招标金额
			
 
				+                if entity.notes in ["投资", "总投资","工程造价"] or re.search('投资(金额|规模)：$', front): # 545988699 金额不大的投资金额作为备选招标金额
			
 
				                     values[label] = 0.51
			
 
				-                elif entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价：?$|注册资本', front) or re.search('服务内容：([\d,.]+万?亿?元?-?)$', front):
			
 
				+                elif re.search('最低限价：?$|注册资本', front) or re.search('服务内容：([\d,.]+万?亿?元?-?)$', front):
			
 
				                     values[label] = 0.49
			
 
				                     label = 2
			
 
				                 elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[（）]?[+×*-][\d.%]+|（含）', behind):
			
@@ -999,6 +1037,9 @@ class PREMPredict():
 
				                 #     values[label] = 0.49
			
 
				                 # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释，单价单独存放
			
 
				                 #     label = 2
			
 
				+                elif re.search('招标金额|限价|预算|控制价|拦标价', front) == None and re.search('预计约?为?$',
			
 
				+                                                                                  front):  # 20241206纠正 565894149（预计约2500元）预测为预算
			
 
				+                    label = 2
			
 
				             elif re.search('报价：预估不?含税总价[为：]$', front) and (label != 1 or values[label]<0.5):
			
 
				                 label = 1
			
 
				                 values[label] = 0.8
			
@@ -6542,10 +6583,11 @@ class DistrictPredictor():
 
				 
			
 
				 class TableTag2List():
			
 
				     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
			
 
				-    def table2list(self, table, text_process=None):
			
 
				+    def table2list(self, table, text_process=None, return_html_table=False):
			
 
				         self._output = []
			
 
				         row_ind = 0
			
 
				         col_ind = 0
			
 
				+        html_table = []
			
 
				         for row in table.find_all('tr'):
			
 
				             # record the smallest row_span, so that we know how many rows
			
 
				             # we should skip
			
@@ -6592,7 +6634,10 @@ class TableTag2List():
 
				                             text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "（").replace(')', '）').replace('?', '')
			
 
				                             # text = re.sub('\s', '', text)[:200] # 只需取前200字即可
			
 
				                             text = ' ' if text == "" else text
			
 
				+
			
 
				                         self._insert(row_ind, col_ind, row_span, col_span, text)
			
 
				+                        if return_html_table:
			
 
				+                            html_table = self._insert_new(row_ind, col_ind, row_span, col_span, str(cell), html_table)
			
 
				                     except UnicodeEncodeError:
			
 
				                         raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )
			
 
				 
			
@@ -6604,7 +6649,20 @@ class TableTag2List():
 
				             # update row_ind
			
 
				             row_ind += smallest_row_span
			
 
				             col_ind = 0
			
 
				-        return self._output
			
 
				+        if return_html_table:
			
 
				+            temp_list = []
			
 
				+            for row in self._output:
			
 
				+                if len(row) > 0:
			
 
				+                    temp_list.append(row)
			
 
				+            self._output =  temp_list
			
 
				+            temp_list = []
			
 
				+            for row in html_table:
			
 
				+                if len(row) > 0:
			
 
				+                    temp_list.append(row)
			
 
				+            html_table = temp_list
			
 
				+            return self._output, html_table
			
 
				+        else:
			
 
				+            return self._output
			
 
				 
			
 
				     def _check_validity(self, i, j, height, width):
			
 
				         """
			
@@ -6639,6 +6697,24 @@ class TableTag2List():
 
				         if self._output[i][j] == "":
			
 
				             self._output[i][j] = val
			
 
				 
			
 
				+    def _insert_new(self, i, j, height, width, val, cell_list):
			
 
				+        # pdb.set_trace()
			
 
				+        for ii in range(i, i+height):
			
 
				+            for jj in range(j, j+width):
			
 
				+                cell_list = self._insert_cell_new(ii, jj, val, cell_list)
			
 
				+        return cell_list
			
 
				+
			
 
				+    def _insert_cell_new(self, i, j, val, cell_list):
			
 
				+        while i >= len(cell_list):
			
 
				+            cell_list.append([])
			
 
				+        while j >= len(cell_list[i]):
			
 
				+            cell_list[i].append("")
			
 
				+
			
 
				+        if cell_list[i][j] == "":
			
 
				+            cell_list[i][j] = val
			
 
				+        return cell_list
			
 
				+
			
 
				+
			
 
				 def is_head_line(list_item):
			
 
				     '''
			
 
				     调用表头识别模型判断是否为表头行
			
@@ -6785,35 +6861,6 @@ class TablePremExtractor(object):
 
				             contain_header = True
			
 
				         return flag, contain_header, dict(), not_sure_winner
			
 
				 
			
 
				-    def get_role(self, text, nlp_enterprise):
			
 
				-        '''
			
 
				-        获取字符串text角色实体
			
 
				-        :param text: 待获取实体字符串
			
 
				-        :param nlp_enterprise: 公告中的角色实体列表
			
 
				-        :return:
			
 
				-        '''
			
 
				-        text = re.sub('联合体：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|[(（][主成][）)]'
			
 
				-                      , '，', text)
			
 
				-        text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
			
 
				-        text = re.sub('[一二三四五六七八九十]+标段[：:]|标段[一二三四五六七八九十]+[：:]|第[一二三四五六七八九十]+名[：:]', '', text) # 2024/4/22 修复 372839375 三标段：宁夏一山科技有限公司
			
 
				-        text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
			
 
				-        if text in nlp_enterprise:
			
 
				-            return text
			
 
				-        if len(text) > 50 or len(text)<4:
			
 
				-            return ''
			
 
				-        ners = getNers([text], useselffool=True)
			
 
				-        roles = []
			
 
				-        if ners:
			
 
				-            for ner in ners[0]:
			
 
				-                if ner[2] in ['org', 'company']:
			
 
				-                    roles.append(ner[3])
			
 
				-                elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
			
 
				-                    roles.append(ner[3])
			
 
				-        if roles and len(''.join(roles)) > len(text)*0.8:
			
 
				-            return roles[0]
			
 
				-        else:
			
 
				-            return ''
			
 
				-
			
 
				     def extract_from_df(self, df, headers, web_source_name, all_winner=False):
			
 
				         prem_dic = {}
			
 
				         previous_package = ""  # 上一行包号
			
@@ -6890,8 +6937,8 @@ class TablePremExtractor(object):
 
				                 if len(pk_l) == 1:
			
 
				                     package = uniform_package_name(pk_l[0].group(0))
			
 
				 
			
 
				-            tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
			
 
				-            tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
			
 
				+            tenderee = get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
			
 
				+            tenderer = get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
			
 
				             tenderee = cut_repeat_name(tenderee)
			
 
				             tenderer = cut_repeat_name(tenderer)
			
 
				 
			
@@ -7695,10 +7742,13 @@ def get_header_line(list_item):
 
				         x.append(getPredictor("form").encode(item))
			
 
				     predict_y = getPredictor("form").predict(np.array(x), type="item")
			
 
				     for item, values in zip(list_item, list(predict_y)):
			
 
				+        item = str(item)
			
 
				         lb = 1 if values[1] > 0.5 else 0
			
 
				-        if item in ['许可/同意', '办结（通过）', '办结（准予许可）','批准']:
			
 
				+        if item in ['许可/同意', '办结（通过）', '办结（准予许可）','批准', '合格']:
			
 
				             lb = 0
			
 
				-        elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺']:
			
 
				+        elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺'] or re.search('^比例\d{1,2}%$', item):
			
 
				+            lb = 1
			
 
				+        elif lb == 0 and item in header_set:
			
 
				             lb = 1
			
 
				         rs.append(lb)
			
 
				     return rs
			
@@ -7976,6 +8026,267 @@ class ApprovalPredictor():
 
				             return [rs_dic]
			
 
				         return []
			
 
				 
			
 
				+class BiddingScore():
			
 
				+    def __init__(self):
			
 
				+        self.head_rule_dic = {
			
 
				+            "tenderer": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
			
 
				+            "score_price": "(价格|报价|单价|总价|经济)(部分|\w{,2})?([得评]分|评审)",
			
 
				+            "score_technical": "技术(部分|\w{,2})?标?([得评]分|评审)",
			
 
				+            "score_commercial": "商务(部分|\w{,2})?标?([得评]分|评审)",
			
 
				+            "score_integrity": "诚信(部分|\w{,2})?([得评]分|评审)",
			
 
				+            "score_comprehensive": "(综合(标|评估)?|总|最终)得?分$",
			
 
				+            "ranking": "(得分)?排名",
			
 
				+            "qualification_review": "资格性审查|是否通过资格",
			
 
				+            "compliance_review": "符合性审查|是否通过符合"
			
 
				+        }
			
 
				+        self.tb = TableTag2List()
			
 
				+
			
 
				+    def get_table_info(self, df, nlp_enterprise):
			
 
				+        def get_header_index(datas):
			
 
				+            '''
			
 
				+            根据表格表头判断结果0/1 得到哪些行和列是表头
			
 
				+            :param datas: 表格内容表头判断结果数据[[1,1,1,1],[0,0,0,0]]
			
 
				+            :return: 表头所在的行和列序号
			
 
				+            '''
			
 
				+            header_row = []
			
 
				+            header_col = []
			
 
				+            df_h = pd.DataFrame(datas)  # 表头判断数据 , columns=columns
			
 
				+            for i in df_h.index:
			
 
				+                line = df_h.loc[i].values
			
 
				+                if sum(line) == len(line):
			
 
				+                    header_row.append((i, sum(line) / len(line)))
			
 
				+                elif sum(line) / len(line) > 0.8:
			
 
				+                    header_row.append((i, sum(line) / len(line)))
			
 
				+                elif len(line) > 3 and len(re.findall('11', ''.join([str(it) for it in line]))) > len(
			
 
				+                        re.findall('10', ''.join([str(it) for it in line]))):
			
 
				+                    header_row.append((i, sum(line) / len(line)))
			
 
				+            for i in df_h.columns:
			
 
				+                col = df_h[i].values
			
 
				+                if sum(col) == len(col):
			
 
				+                    header_col.append((i, sum(col) / len(col)))
			
 
				+                elif sum(col) / len(col) > 0.8:
			
 
				+                    header_col.append((i, sum(col) / len(col)))
			
 
				+                elif len(col) > 3 and len(re.findall('11', ''.join([str(it) for it in line]))) > len(
			
 
				+                        re.findall('10', ''.join([str(it) for it in line]))):
			
 
				+                    header_col.append((i, sum(col) / len(col)))
			
 
				+            return header_row, header_col
			
 
				+
			
 
				+        def get_header(l, head_rule_dic):
			
 
				+            header_dic = {}
			
 
				+            for i in range(len(l)):
			
 
				+                text = l[i]
			
 
				+                num = 0
			
 
				+                tmp_dic = {}
			
 
				+                for k, v in head_rule_dic.items():
			
 
				+                    # print('k : ', k)
			
 
				+                    if re.search(v, text):
			
 
				+                        tmp_dic[k] = i
			
 
				+                        num += 1
			
 
				+                # if num > 1:
			
 
				+                #     if tmp_dic.keys() == set(['qualification_review', 'compliance_review']):
			
 
				+                #         for k, v in tmp_dic.items():
			
 
				+                #             if k not in header_dic:
			
 
				+                #                 header_dic[k] = v
			
 
				+                # elif tmp_dic:
			
 
				+                for k, v in tmp_dic.items():
			
 
				+                    if k not in header_dic:
			
 
				+                        header_dic[k] = v
			
 
				+            return header_dic
			
 
				+
			
 
				+        def get_score(text):
			
 
				+            text = text.strip()
			
 
				+            if re.search('^\d{1,2}(\.\d{2})$', text):
			
 
				+                return text
			
 
				+            elif re.search('^\d{1,2}(\.\d{2})?[\d,，；\.]*$', text):
			
 
				+                return text
			
 
				+            return ''
			
 
				+
			
 
				+        result_l = []
			
 
				+        datas = []
			
 
				+        for i in df.index:
			
 
				+            line = get_header_line(df.loc[i].values)
			
 
				+            datas.append(line)
			
 
				+        header_row, header_col = get_header_index(datas)
			
 
				+        if len(header_col) == 1 and header_col[0][0] > 1: # 列表头不可能在第1列后面开始
			
 
				+            header_col = []
			
 
				+        if len(header_row) >= 1 and len(header_col) == 0:  # 有行表头无列表头
			
 
				+            i = 0
			
 
				+            while i < len(header_row):
			
 
				+                idx, ratio = header_row[i]
			
 
				+                if idx + 1 >= len(df):
			
 
				+                    break
			
 
				+                header_dic = get_header(df.loc[idx].values, self.head_rule_dic)
			
 
				+                i += 1
			
 
				+                range_from = idx + 1
			
 
				+                range_to = len(df)
			
 
				+                if i < len(header_row):
			
 
				+                    next_header = i
			
 
				+                    for j in range(i, len(header_row)):
			
 
				+                        idx2, ratio2 = header_row[j]
			
 
				+                        if idx2 - idx == 1:
			
 
				+                            header_dic2 = get_header(df.loc[idx2].values, self.head_rule_dic)
			
 
				+                            if set(df.loc[idx].values) & set(df.loc[idx2].values) != set():
			
 
				+                                header_dic.update(header_dic2)
			
 
				+                            else:
			
 
				+                                header_dic = header_dic2
			
 
				+                            range_from = idx2 + 1
			
 
				+                            range_to = len(df)
			
 
				+                            next_header = j + 1
			
 
				+                            idx = idx2
			
 
				+                        else:
			
 
				+                            range_from = idx + 1
			
 
				+                            range_to = idx2
			
 
				+                            next_header = j
			
 
				+                            break
			
 
				+                    i = next_header
			
 
				+                if len(header_dic) >= 2 and 'tenderer' in header_dic:
			
 
				+                    for index in range(range_from, range_to):
			
 
				+                        tmp_dic = {}
			
 
				+                        for k, v in header_dic.items():
			
 
				+                            if k.startswith('score'):
			
 
				+                                content = get_score(df.loc[index, v])
			
 
				+                            elif k == 'tenderer':
			
 
				+                                content = get_role(df.loc[index, v], nlp_enterprise)
			
 
				+                            elif k == 'ranking':
			
 
				+                                content = df.loc[index, v] if re.search('^第?[\d一二三四五六七八九十]+名?$',df.loc[index, v]) else ''
			
 
				+                            else:
			
 
				+                                content = df.loc[index, v]
			
 
				+                            if content != '':
			
 
				+                                tmp_dic[k] = content
			
 
				+                        if len(tmp_dic) > 1 and 'tenderer' in tmp_dic and tmp_dic not in result_l:
			
 
				+                            result_l.append(tmp_dic)
			
 
				+        elif len(header_row) == 0 and len(header_col) >= 1:
			
 
				+            i = 0
			
 
				+            while i < len(header_col):
			
 
				+                idx, ratio = header_col[i]
			
 
				+                if idx + 1 >= len(df.columns):
			
 
				+                    break
			
 
				+                header_dic = get_header(df[idx].values, self.head_rule_dic)
			
 
				+                i += 1
			
 
				+                range_from = idx + 1
			
 
				+                range_to = len(df.columns)
			
 
				+                if i < len(header_col):
			
 
				+                    next_header = i
			
 
				+                    for j in range(i, len(header_col)):
			
 
				+                        idx2, ratio2 = header_col[j]
			
 
				+                        if idx2 - idx == 1:
			
 
				+                            header_dic2 = get_header(df[idx2].values, self.head_rule_dic)
			
 
				+                            if set(df[idx].values) & set(df[idx2].values) != set():
			
 
				+                                header_dic.update(header_dic2)
			
 
				+                            else:
			
 
				+                                header_dic = header_dic2
			
 
				+                            range_from = idx2 + 1
			
 
				+                            range_to = len(df.columns)
			
 
				+                            next_header = j + 1
			
 
				+                            idx = idx2
			
 
				+                        else:
			
 
				+                            range_from = idx + 1
			
 
				+                            range_to = idx2
			
 
				+                            next_header = j
			
 
				+                            break
			
 
				+                    i = next_header
			
 
				+                if len(header_dic.keys()&set(['tenderer','score_technical', 'score_commercial', 'score_price', 'score_comprehensive'])) >= 2 and 'tenderer' in header_dic:
			
 
				+                    for index in range(range_from, range_to):
			
 
				+                        tmp_dic = {}
			
 
				+                        for k, v in header_dic.items():
			
 
				+                            if k.startswith('score'):
			
 
				+                                content = get_score(df.loc[v, index])
			
 
				+                            elif k == 'tenderer':
			
 
				+                                content = get_role(df.loc[v, index], nlp_enterprise)
			
 
				+                            elif k == 'ranking':
			
 
				+                                content = df.loc[v, index] if re.search('^第?[\d一二三四五六七八九十]+名?$', df.loc[v, index]) else ''
			
 
				+                            else:
			
 
				+                                content = df.loc[v, index]
			
 
				+                            if content != '':
			
 
				+                                tmp_dic[k] = content
			
 
				+                        if len(tmp_dic) > 2 and 'tenderer' in tmp_dic and tmp_dic not in result_l:
			
 
				+                            result_l.append(tmp_dic)
			
 
				+        elif len(header_row) == 1 and len(header_col) == 1:
			
 
				+            pass
			
 
				+        return result_l
			
 
				+
			
 
    def predict(self, html, nlp_enterprise=None):
        """Collect per-bidder score records from the tables of an HTML page.

        The ``div.richTextFetch`` element (the attachment part of the page) is
        detached first; its tables are only used when the remaining document
        contains no table at all. Every rectangular table with more than one
        row and more than one column is converted to a DataFrame and handed to
        ``self.get_table_info``.

        :param html: HTML source of the announcement.
        :param nlp_enterprise: enterprise names forwarded to
            ``self.get_table_info`` and stored on ``self.nlp_enterprise``;
            defaults to a fresh empty list.
        :return: list of record dicts, one per distinct ``'tenderer'`` value;
            when several records share a tenderer, the one with the most
            fields is kept.
        """
        # A literal ``[]`` default would be a single list shared by every call
        # and, through ``self.nlp_enterprise``, by every instance.
        if nlp_enterprise is None:
            nlp_enterprise = []

        html = re.sub("<html>|</html>|<body>|</body>", "", html)
        html = re.sub("##attachment##", "", html)
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        self.nlp_enterprise = nlp_enterprise
        if richText:
            richText = richText.extract()  # filter out the attachment part

        tables = soup.find_all('table')
        if len(tables) == 0 and richText:
            # Nothing in the main body: fall back to the attachment's tables.
            tables = richText.find_all('table')

        # Tables are visited last-to-first and detached once handled;
        # presumably so nested tables are consumed before the table that
        # contains them -- TODO confirm.
        tables.reverse()
        rs_dic = {}
        for table in tables:
            trs = self.tb.table2list(table)
            is_rectangular = len({len(tr) for tr in trs}) == 1
            if len(trs) > 1 and len(trs[0]) > 1 and is_rectangular:
                df = pd.DataFrame(trs)
                for d in self.get_table_info(df, nlp_enterprise):
                    tenderer = d['tenderer']
                    # Keep the first record per tenderer unless a later one
                    # carries strictly more fields.
                    if tenderer not in rs_dic or len(d) > len(rs_dic[tenderer]):
                        rs_dic[tenderer] = d
            table.extract()
        return list(rs_dic.values())
			
 
				+
			
 
				+class EntityTypeRulePredictor():
			
 
				+    def __init__(self):
			
 
				+        self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)）?(会议)?地[点址]([(（]网址[)）])?[：为]'
			
 
				+        self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([(（]网址[)）])?[：为]'
			
 
				+        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?）?地[点址][：为]'
			
 
				+        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[：为]|项目位于'
			
 
				+        self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
			
 
				+        self.pattern_code_investment = '投资(审批)?项目[编代]码[：为]'
			
 
				+    def predict(self, list_entitys, list_sentences, list_articles):
			
 
				+        addr_dic = {}
			
 
				+        time_dic = {}
			
 
				+        code_investment = ''
			
 
				+        for entity in list_entitys[0]:
			
 
				+            if entity.entity_type == 'location':
			
 
				+                b = entity.wordOffset_begin
			
 
				+                s_index = entity.sentence_index
			
 
				+                sentance_text = list_sentences[0][s_index].sentence_text
			
 
				+                if re.search(self.pattern_addr_bidopen, sentance_text[max(0, b-10): b]):
			
 
				+                    addr_dic['addr_bidopen'] = entity.entity_text
			
 
				+                elif re.search(self.pattern_addr_bidsend, sentance_text[max(0, b-10): b]):
			
 
				+                    addr_dic['addr_bidsend'] = entity.entity_text
			
 
				+                elif re.search(self.pattern_addr_delivery, sentance_text[max(0, b-10): b]):
			
 
				+                    addr_dic['addr_delivery'] = entity.entity_text
			
 
				+                elif re.search(self.pattern_addr_project, sentance_text[max(0, b-10): b]):
			
 
				+                    addr_dic['addr_project'] = entity.entity_text
			
 
				+            elif entity.entity_type == 'time':
			
 
				+                b = entity.wordOffset_begin
			
 
				+                s_index = entity.sentence_index
			
 
				+                sentance_text = list_sentences[0][s_index].sentence_text
			
 
				+                if re.search(self.pattern_time_planned, sentance_text[max(0, b-12): b]):
			
 
				+                    time_dic['time_planned'] = entity.entity_text
			
 
				+            elif entity.entity_type == 'code':
			
 
				+                b = entity.wordOffset_begin
			
 
				+                s_index = entity.sentence_index
			
 
				+                sentance_text = list_sentences[0][s_index].sentence_text
			
 
				+                if code_investment == '' and re.search(self.pattern_code_investment, sentance_text[max(0, b-12): b]):
			
 
				+                    code_investment = entity.entity_text
			
 
				+
			
 
				+        ser1 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_bidopen, list_articles[0].content)
			
 
				+        ser2 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_bidsend, list_articles[0].content)
			
 
				+        ser3 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_delivery, list_articles[0].content)
			
 
				+        ser4 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_project, list_articles[0].content)
			
 
				+        ser5 = re.search('(%s)(?P<code>[\da-zA-Z（）-]{5,30})[，。]'%self.pattern_code_investment, list_articles[0].content)
			
 
				+        if ser1 and re.search('\w{2,5}[省市区]|\d号|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
			
 
				+            addr_dic['addr_bidopen'] = ser1.group('addr')
			
 
				+        if ser2 and re.search('\w{2,5}[省市区]|\d号|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
			
 
				+            addr_dic['addr_bidsend'] = ser2.group('addr')
			
 
				+        if ser3 and re.search('\w{2,5}[省市区]|\d号', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
			
 
				+            addr_dic['addr_delivery'] = ser3.group('addr')
			
 
				+        if ser4 and re.search('\w{2,5}[省市区]|\d号', ser4.group('addr')) and addr_dic.get('addr_project', '') in ser4.group('addr'):
			
 
				+            addr_dic['addr_project'] = ser4.group('addr')
			
 
				+        if ser5 and code_investment == '':
			
 
				+            code_investment = ser5.group('code')
			
 
				+
			
 
				+        return addr_dic, time_dic, code_investment
			
 
				+
			
 
				 def getSavedModel():
			
 
				     #predictor = FormPredictor()
			
 
				     graph = tf.Graph()
			
@@ -8336,12 +8647,16 @@ if __name__=="__main__":
 
				     title = '甘肃省妇幼保健院（甘肃省中心医院）2024年度大额资金定期存款竞争性存放项目（第二期）采购结果公告'
			
 
				     with open('d:/html/2.html', 'r', encoding='utf-8') as f:
			
 
				         html = f.read()
			
 
				-    tb_extract = TablePremExtractor()
			
 
				-    rs = tb_extract.predict(html, [
			
 
				-        "江苏中联铸本混凝土有限公司",
			
 
				-        "鼓楼区协荣机械设备经销部"
			
 
				-    ], web_source_name = '', all_winner=False)
			
 
				-    print('标段数：',len(rs[0]))
			
 
				+    # tb_extract = TablePremExtractor()
			
 
				+    # rs = tb_extract.predict(html, [
			
 
				+    #     "江苏中联铸本混凝土有限公司",
			
 
				+    #     "鼓楼区协荣机械设备经销部"
			
 
				+    # ], web_source_name = '', all_winner=False)
			
 
				+    # print('标段数：',len(rs[0]))
			
 
				+    # print(rs)
			
 
				+    bdscore = BiddingScore()
			
 
				+    rs = bdscore.predict(html)
			
 
				+    print(type(rs), len(rs))
			
 
				     print(rs)
			
 
				 
			
 
				     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]