#coding:utf8
# Repository: luojiehua/BIDI_ML_INFO_EXTRACTION
'''
Created on 2019-01-04

@author: User
'''
import os

from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time
from unicodedata import normalize

# Timestamp captured when the module is imported.
# NOTE(review): not referenced anywhere in the visible part of this file - confirm it is still needed.
_time1 = time.time()
# Put the directory two levels above the current working directory on sys.path
# before the BiddingKG imports below run.
sys.path.append(os.path.abspath("../.."))

from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
import json
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from BiddingKG.dl.ratio.re_ratio import extract_ratio
from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
from BiddingKG.dl.interface.get_label_dic import get_all_label
from BiddingKG.dl.channel.channel_bert import merge_channel
from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
from BiddingKG.dl.interface.special_debt_extract import get_debt_info


# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy values and UTF-8 bytes.

    Conversions applied to objects the standard encoder rejects:

    * ``np.ndarray``  -> nested Python lists
    * ``bytes``       -> ``str`` decoded as UTF-8
    * NumPy floats    -> ``float``
    * NumPy integers  -> ``int``
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Falls back to ``json.JSONEncoder.default`` (which raises
        ``TypeError``) for anything not listed in the class docstring.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating is the common base of every NumPy float scalar; the old
        # np.float_ alias no longer exists in NumPy 2.0 and must not be used.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)

def get_login_web_set():
    """Load the entries of ``login_weblist.txt`` that sits next to this module.

    Each non-blank line (stripped of surrounding whitespace) becomes one entry.
    A missing file yields an empty set; any error while reading is printed via
    ``traceback`` and whatever was read up to that point is kept.

    Returns:
        set: the distinct entries found in the file.
    """
    list_path = os.path.join(os.path.dirname(__file__),"login_weblist.txt")
    web_names = set()
    try:
        if os.path.exists(list_path):
            with open(list_path,"r",encoding="utf8") as reader:
                for raw_line in reader:
                    entry = raw_line.strip()
                    if entry:
                        web_names.add(entry)
    except Exception:
        # Best effort: the list is optional, so report the problem and carry on.
        traceback.print_exc()
    log("get_login_web_set length %d"%(len(web_names)))
    return web_names

# Built once at import time; extractCount() lowers the score of documents whose
# web_source_name is contained in this set.
set_login_web = get_login_web_set()


def _is_positive_amount(value):
    """Return True when *value* (number or numeric string) is greater than zero.

    ``None`` and the empty string mean "no amount" and give False; any other
    non-numeric value raises from ``float`` exactly as a direct conversion would.
    """
    if value is None or str(value) == "":
        return False
    return float(value) > 0


def extractCount(extract_dict,page_attachments,web_source_name,page_time):
    """Score how much useful information was extracted from one document.

    Scoring rules, all read from the code below:

    * +3 when ``word_count["正文"]`` exceeds 500.
    * -5 for each of time_getFileEnd / time_bidopen / time_bidclose /
      time_publicityEnd that is non-empty and sorts before ``page_time``
      (plain ``<`` comparison of the two values).
    * Per package in ``prem``: +1 for a positive ``tendereeMoney``; +1 per role;
      +1 more when the role carries a positive amount; a ``win_tenderer`` role
      gets +2 for a non-empty name and +2 for a positive amount.
    * + (number of non-empty ``linklist`` items) // 2.
    * +1 for a non-empty first project code, +1 for a non-empty project name.
    * Attachments: + (distinct fileMd5) // 2 + 1, +2 if any is classified
      '招标文件', +1 if any is classified '采购清单'.
    * +1 per approval / punish field that is not None, '' or "未知".
    * -3 when ``web_source_name`` is in ``set_login_web``.
    * + (comma-separated pieces of ``str(product)``) // 5.

    Args:
        extract_dict: extraction result dict; ``None`` or empty is treated as ``{}``.
        page_attachments: JSON text listing attachment dicts, or None / ''.
        web_source_name: name of the data source.
        page_time: publish time of the page, or '' to skip the deadline penalties.

    Returns:
        int: the accumulated score (may be negative).
    """
    _extract = extract_dict or {}
    extract_count = 0

    if _extract.get("word_count",{}).get("正文",0)>500:
        extract_count += 3

    # A deadline that sorts before the publish time costs five points.
    if page_time!="":
        for time_key in ("time_getFileEnd","time_bidopen","time_bidclose","time_publicityEnd"):
            time_value = _extract.get(time_key,"")
            if time_value!="" and time_value<page_time:
                extract_count -= 5

    linklist_count = 0
    for pack in _extract.get("prem",{}).values():
        if _is_positive_amount(pack.get("tendereeMoney","")):
            extract_count += 1
        for _role in pack.get("roleList",[]):
            if isinstance(_role,list):
                # Legacy layout: [role_name, role_text, money, ...]
                extract_count += 1
                has_money = _is_positive_amount(_role[2])
                if has_money:
                    extract_count += 1
                if _role[0]=="win_tenderer":
                    if _role[1] is not None and _role[1]!="":
                        extract_count += 2
                    if has_money:
                        extract_count += 2
            elif isinstance(_role,dict):
                extract_count += 1
                # Missing keys simply score nothing instead of raising KeyError.
                role_money = _role.get("role_money") or {}
                has_money = _is_positive_amount(role_money.get("money",""))
                if has_money:
                    extract_count += 1
                if _role.get("role_name")=="win_tenderer":
                    role_text = _role.get("role_text")
                    if role_text is not None and role_text!="":
                        extract_count += 2
                    if has_money:
                        extract_count += 2
                for link in _role.get("linklist",[]):
                    for item in link:
                        if item!="":
                            linklist_count += 1

    extract_count += linklist_count//2

    list_code = _extract.get("code",[])
    if len(list_code)>0 and list_code[0]!="":
        extract_count += 1
    if _extract.get("name","")!="":
        extract_count += 1

    if page_attachments is not None and page_attachments!='':
        try:
            _attachments = json.loads(page_attachments)
            # Accumulate separately so a malformed entry adds nothing at all.
            attachment_score = 0
            if len(_attachments)>0:
                set_md5 = set()
                classifications = set()
                for _atta in _attachments:
                    set_md5.add(_atta.get("fileMd5"))
                    classifications.add(str(_atta.get("classification","")))
                attachment_score += len(set_md5)//2+1
                if '招标文件' in classifications:
                    attachment_score += 2
                if '采购清单' in classifications:
                    attachment_score += 1
            extract_count += attachment_score
        except Exception:
            # Attachments are a bonus only: report bad input and keep scoring.
            traceback.print_exc()

    for _dict in _extract.get("approval",[]):
        for v in _dict.values():
            if v is not None and v!='' and v!="未知":
                extract_count += 1

    for v in _extract.get("punish",{}).values():
        if v is not None and v!='' and v!="未知":
            extract_count += 1

    if web_source_name in set_login_web:
        extract_count -= 3

    # Counts comma-separated pieces of the string form of `product`, whatever
    # type it is; kept as-is so existing scores do not shift.
    product = _extract.get("product","")
    extract_count += len(str(product).split(","))//5

    return extract_count

# Normalize character encoding
def str_normalize(text):
    """Apply NFKD normalization to *text* while keeping selected full-width punctuation.

    The characters in ``cn_punctuation`` are copied through untouched; every
    other stretch of text is passed to ``unicodedata.normalize('NFKD', ...)``.

    Args:
        text: the string to normalize.

    Returns:
        str: the normalized string.
    """
    cn_punctuation = "￥，｡：；｛｝！？（）＜"
    # The quantifier must sit INSIDE the capture group: with "([...])+" re.split
    # returns only the last character of each punctuation run and the rest of
    # the run is silently lost.
    segments = re.split("([{}]+)".format(cn_punctuation),text)
    # With a single capture group the result alternates text, separator, text, ...
    # so odd positions are the protected punctuation runs.
    return "".join(
        segment if index % 2 else normalize('NFKD', segment)
        for index, segment in enumerate(segments)
    )
# Repair tenderee/agency entities in prem whose region prefix is incomplete
def repair_entity(prem,district_dict,list_articles):
    """Prepend the missing region name to tenderee/agency names that start with a bare region suffix.

    Names beginning with 省 / 市 / 县 / 区 / 自治区 / 自治县 / 自治州 get the
    province, city or district taken from ``district_dict`` put in front.
    ``prem`` is modified in place; nothing is returned.

    Args:
        prem: list whose first item holds ``{'prem': {package: {'roleList': [...]}}}``;
            each role is a dict with ``role_name`` and ``role_text``.
        district_dict: ``{'district': {'province': ..., 'city': ..., 'district': ...}}``.
            '未知' (and '全国' for the province) count as unknown.
        list_articles: sequence whose first item exposes the document text as ``.content``.
    """
    region = district_dict['district']
    province = region['province'] if region['province'] and region['province'] not in ['未知','全国'] else ""
    city = region['city'] if region['city'] and region['city']!='未知' else ""
    district = region['district'] if region['district'] and region['district']!='未知' else ""
    content_text = list_articles[0].content

    autonomous_region_dict = {
        "新疆":"新疆维吾尔",
        "西藏":"西藏",
        "内蒙古":"内蒙古",
        "广西":"广西壮族",
        "宁夏":"宁夏回族"
    }

    for _prem in prem[0]['prem'].values():
        for role in _prem['roleList']:
            if role['role_name'] not in ['tenderee','agency']:
                continue
            role_text = role['role_text']
            if not role_text:
                continue
            head = role_text[0]
            head2 = role_text[:2]
            head3 = role_text[:3]
            if head=='省':
                if head2!='省道':
                    role['role_text'] = province + role_text
            elif head=='市':
                if head2 not in ['市政','市场']:
                    # County-level city: only when a district is actually known.
                    # With an empty district the test would degrade to
                    # "'市' in content_text", which is nearly always true and
                    # would block the city prefix.
                    if district and district+'市' in content_text:
                        role['role_text'] = district + role_text
                    else:
                        role['role_text'] = city + role_text
            elif head in ['县','区']:
                role['role_text'] = district + role_text
            elif head3=='自治区':
                role['role_text'] = autonomous_region_dict.get(province,"") + role_text
            elif head3 in ['自治县',"自治州"]:
                # Try the district first, then the city; strip a trailing
                # 自治/自治县/自治州 so it is not duplicated by role_text.
                if re.search("自治[县州]?$",district):
                    role['role_text'] = re.sub("自治[县州]?","",district) + role_text
                elif re.search("族$",district):
                    role['role_text'] = district + role_text
                elif re.search("自治[县州]?$",city):
                    role['role_text'] = re.sub("自治[县州]?","",city) + role_text
                elif re.search("族$",city):
                    role['role_text'] = city + role_text

def fix_table_structure_preserve_order(html):
    """
    Repair tables in which <tr> elements sit directly under <table>, next to <tbody>.

    For every such table a single new <tbody> is appended and all element
    children of the table are moved into it in their original order; the
    contents of an existing <tbody> are unwrapped into the new one. Text nodes
    directly under <table> are left where they are. Returns the re-serialized
    document.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for table in soup.find_all('table'):
        if not table.find_all('tr', recursive=False):
            continue
        # Snapshot first: the new tbody is appended before anything is moved.
        original_children = list(table.children)
        merged_body = soup.new_tag('tbody')
        table.append(merged_body)
        for node in original_children:
            if not node.name:
                # Plain text / comments have no tag name and stay in place.
                continue
            if node.name != 'tbody':
                merged_body.append(node.extract())
                continue
            for inner in list(node.children):
                merged_body.append(inner.extract())
            node.extract()
    return str(soup)

def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
    """Run the whole extraction pipeline on one announcement and return the result as JSON text.

    Args:
        doc_id: identifier of the document; used in log messages and passed to extractors.
        text: HTML source of the announcement.
        title: announcement title.
        page_time: publish time of the page; forwarded to several extractors and to
            extractCount, which compares it with the extracted deadlines.
        web_source_no: data-source number; ``None`` is treated as ``''``.
        web_source_name: data-source name; ``None`` is treated as ``''``.
        original_docchannel: channel assigned upstream. The value ``302`` marks an
            approval project (审批项目) and switches to the approval-specific flow.
        page_attachments: JSON text describing the attachments; only used for scoring.
        **kwargs: accepted for call compatibility and not used.

    Returns:
        str: ``json.dumps`` of the merged result dict (sorted keys, 4-space indent,
        non-ASCII characters kept) with the control characters 0x05, 0x06 and
        0x07 removed.
    """
    cost_time = dict()
    if web_source_no is None:
        web_source_no = ''
    if web_source_name is None:
        web_source_name = ''

    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))
    # Normalize character encoding
    text = str_normalize(text)
    text = fix_table_structure_preserve_order(text) # 2025-03-31: fix tables whose <tr> is a sibling of <tbody>
    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time,2)
    cost_time.update(_cost_time)

    # Outline extraction and outline-based content extraction
    start_time = time.time()
    sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
    parse_document = ParseDocument(text, True,list_obj=sentence2_list)
    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope = extract_parameters(parse_document)

    # Nothing found in the main text: retry on the attachment sentences.
    if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
        parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope = extract_parameters(parse_document)
    addr_dic, time_dic, code_investment = predictor.getPredictor('entity_type_rule').predict(list_entitys, list_sentences, list_articles)
    if addr_bidopen_text != '' and 'addr_bidopen' not in addr_dic:
        addr_dic['addr_bidopen'] = addr_bidopen_text
    if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
        addr_dic['addr_bidsend'] = addr_bidsend_text
    log("get outline done of doc_id%s"%(doc_id))
    cost_time["outline"] = round(time.time()-start_time,2)

    # Regex-match elements from the kv-tree
    start_time = time.time()
    kv_single_dic, kv_addr_dic = get_kvtree_value(text)
    log("get kvtree done of doc_id%s"%(doc_id))
    cost_time["kvtree"] = round(time.time()-start_time,2)

    # The early channel (announcement type / life cycle) prediction that used to
    # run here was retired on 2022/4/29; the channel is predicted further down.

    start_time = time.time() # Project code and name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
    # For bankruptcy liquidation cases the project name is the title up to and
    # including "破产清算案".
    bankruptcy_match = re.search('破产清算案', title)
    if bankruptcy_match:
        codeName[0]['name'] = title[:bankruptcy_match.end()]
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time,2)

    start_time = time.time()  # Announcement category prediction
    channel_dic, msc = predictor.getPredictor("channel").predict_merge(title, list_sentences[0], text,original_docchannel, web_source_no)
    cost_time["rule_channel"] = round(time.time() - start_time, 2)

    start_time = time.time() # Role / money model extraction
    predictor.getPredictor("prem").predict(list_sentences,list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time,2)

    # The early product / failed-bid-reason prediction that used to run here was
    # retired on 2022/4/29; it now runs after the channel is final.

    start_time = time.time() # Regex extraction of product attributes: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time,2)

    # Whether this is a deposit-type project
    deposit_project = is_deposit_project(title, codeName[0]['name'], requirement_text)

    start_time = time.time() # Rule-based (regex) role extraction
    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, channel_dic, all_winner=is_all_winner(title), req_scope=requirement_scope, deposit_project=deposit_project)
    cost_time["rule"] = round(time.time()-start_time,2)

    # 2021/12/30: rule pass that marks the entity in a closing "entity + date"
    # sentence as tenderee or agency; final rule-based role completion, and
    # removal of tenderees containing 公共资源交易中心.
    start_time = time.time()
    predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
    cost_time["roleRuleFinal"] = round(time.time()-start_time,2)

    start_time = time.time() # Rule-based tenderee recall
    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
    cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2)

    # Rule-based adjustment of role probabilities
    start_time = time.time()
    predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys,original_docchannel)
    cost_time["rolegrade"] = round(time.time()-start_time,2)

    # Rule-based adjustment of money probabilities
    start_time = time.time()
    predictor.getPredictor("moneygrade").predict(list_sentences,list_entitys)
    cost_time["moneygrade"] = round(time.time()-start_time,2)

    start_time = time.time() # Contact-person model extraction
    predictor.getPredictor("epc").predict(list_sentences,list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    cost_time["person"] = round(time.time()-start_time,2)

    start_time = time.time() # Time category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time,2)

    start_time = time.time() # Deposit payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time()-start_time,2)

    # Must run after getPredictor("prem").predict and before getAttributes.getPREMs:
    # rule-adjust the fees of supervision / construction / design / survey
    # announcements into tender or winning-bid amounts.
    predictor.getPredictor("prem").correct_money_by_rule(title, list_entitys, list_articles)


    # Added 2021-12-29: extract total price and unit price
    start_time = time.time()
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time()-start_time, 2)

    # Depends on sentence order
    start_time = time.time()  # Entity linking
    entityLink.link_entitys(list_entitys)
    doctitle_refine = entityLink.doctitle_refine(title)
    nlp_enterprise,nlp_enterprise_attachment, dict_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time,winter_scope)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time,2)

    if original_docchannel != 302:  # Approval projects skip the table extraction below
        # Table element extraction
        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name, is_all_winner(title))
        if table_prem:
            getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)

    # Candidate extraction
    candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise+nlp_enterprise_attachment)
    getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)

    # Collect consortium (joint bid) information
    getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)

    # Correct winning prices for procurement announcements that list several
    # products in a table; a winning amount below the total of all products is
    # replaced by that total.
    getAttributes.correct_rolemoney(prem, total_product_money, list_articles)

    # Fix channel predictions such as "tender announcement that has a winner" or
    # "win announcement without any win keyword". Depends on prem.
    start_time = time.time()
    if original_docchannel == 302:
        channel_dic = {"docchannel":
             { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
        }
    else:
        channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
    channel_dic = merge_channel(list_articles,channel_dic,original_docchannel) # merge with the prediction of the newer channel model
    cost_time["rule_channel2"] = round(time.time()-start_time,2)

    # Extract multiple winners per package and all money amounts
    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, is_all_winner(title))

    start_time = time.time() # Product name and failed-bid reason extraction; depends on the docchannel result
    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
    # fail_reason carries the failure reason; products are attached to the Entity
    # objects. 2022/7/29: products are also returned for industry classification.
    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail,out_lines=out_lines)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time,2)
    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time,prem,channel_dic))

    # For single-source tender announcements, change the winner role to pre-winner
    getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)

    # Procurement-intention prediction when the announcement has no table
    # (depends on docchannel, products and prem); product-attribute results found
    # in the project name are added to the demand / budget-time / purchase-time elements.
    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)

    # Industry classification; uses title, project name, products and the roles in prem
    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem, product_attrs=product_attrs)

    # Region extraction
    start_time = time.time()
    district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic, list_entity=list_entitys[0])
    cost_time["district"] = round(time.time() - start_time, 2)

    # Repair entities using the region result
    repair_entity(prem,district,list_articles)

    # Final tenderee recall based on the data source
    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)

    # Generate project labels from the keyword table
    project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
    # Labels for additional requirements
    project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)

    # Second-level property-rights classification labels
    property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)

    # Final validation of prem
    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])

    # Rule-based completion when a tender has no tenderee or a win has no winner
    getAttributes.rule_add_role(doc_id,prem[0]['prem'], channel_dic, list_articles[0].content, web_source_no, nlp_enterprise)

    # 2024-12-03: fill in package names from products
    getAttributes.add_package_name(prem[0]['prem'], list_entitys[0], product_list, name=codeName[0]['name'])

    # Extract the fields needed for proposed / under-construction projects
    start_time = time.time()
    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text, web_source_name, industry)
    log("pb_extract done of doc_id%s"%(doc_id))
    cost_time["pb_extract"] = round(time.time() - start_time, 2)

    # Labelling
    label_dic = get_all_label(title, list_articles[0].content, prem[0]['prem'])

    # Bid evaluation score extraction
    bid_score = predictor.getPredictor('bid_score').predict(text, nlp_enterprise+nlp_enterprise_attachment)

    version_date = {'version_date': '2025-06-06'}
    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)

    if original_docchannel == 302:
        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text, nlp_enterprise=nlp_enterprise+nlp_enterprise_attachment)
        approval = predictor.getPredictor("approval").add_ree2approval(approval , prem[0]['prem'])
        approval = predictor.getPredictor("approval").add_codename2approval(approval , codeName)
        data_res['prem'] = {}  # approval projects do not keep this item
        data_res['approval'] = approval[:100] # 2025-02-17: cap at 100 projects

    if web_source_no == 'XM6486':
        debt_dic = get_debt_info(text) # special-purpose bond information
        if debt_dic.get('district', '') != '':
            district = predictor.getPredictor('district').predict_area(debt_dic['district'], '', web_source_name)
            debt_dic['district'] = district['district']
            data_res['district'] = district['district']
        data_res['debt_dic'] = debt_dic
        # NOTE(review): only data_res is overwritten; channel_dic keeps the earlier
        # channel, so the channel checks below still see it - confirm this is intended.
        data_res['docchannel'] = { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }

    if channel_dic['docchannel']['doctype'] == '处罚公告': # 2024-06-27: punishment announcements get dishonesty element extraction
        start_time = time.time()
        punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
        cost_time["punish"] = round(time.time()-start_time,2)
        data_res['punish'] = punish_dic
        if "Project" in data_res['prem']:
            for d in data_res['prem']['Project']['roleList']:
                if d['role_name'] == 'tenderee' and d.get('role_prob', 0.6) < 0.6:  # punishment announcement: drop low-probability tenderees
                    data_res['prem']['Project']['roleList'] = [d for d in data_res['prem']['Project']['roleList'] if d['role_name'] != 'tenderee']
                    break
            if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']: # remove the empty package
                data_res['prem'].pop('Project')

    # Add the products found in the product attributes to the product list
    if len(data_res['product_attrs']['data']) > 0: # 2024-11-08: if unit-price/quantity extraction found products, keep only the originally extracted products that appear in the title
        data_res['product'] = [it for it in data_res['product'] if it in title]
    for d in data_res['product_attrs']['data']:
        if isinstance(d['product'], str) and d['product'] not in data_res['product']:
            data_res['product'].append(d['product'])

    # Final check / correction of tender and winning amounts
    getAttributes.limit_maximum_amount(data_res, list_entitys[0])

    # Use procurement-intention demand information to supplement the projects
    if channel_dic['docchannel']['docchannel'] == '采购意向':
        getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])

    data_res["project_label"] = project_label
    data_res["property_label"] = property_label
    data_res["doctitle_refine"] = doctitle_refine
    data_res["nlp_enterprise"] = nlp_enterprise
    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
    data_res["dict_enterprise"] = dict_enterprise

    # Character counts of the main text and of the attachments. These must be
    # in data_res BEFORE extractCount runs: extractCount reads "word_count" to
    # grant its long-text bonus, which could never fire when the counts were
    # filled in afterwards.
    text_main, text_attn = 0, 0
    for sentence in list_sentences[0]:
        if sentence.in_attachment:
            text_attn += len(sentence.sentence_text)
        else:
            text_main += len(sentence.sentence_text)
    data_res['word_count'] = {'正文': text_main, '附件': text_attn}

    # Number of extracted elements
    data_res['extract_count'] = extractCount(data_res,page_attachments,web_source_name,page_time)
    # Whether the document contains a table
    data_res['exist_table'] = 1 if re.search("<td",text) else 0
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    # Fields that need indexing for proposed / under-construction projects
    data_res["proportion"] = pb_json.get('pb').get('proportion', '')
    data_res["pb_project_name"] = pb_json.get('pb').get('project_name_refind', '')

    # Qualification requirements
    data_res['aptitude'] = aptitude_text[:1500]
    # Procurement content
    data_res['requirement'] = requirement_text[:1500]
    # Labels
    data_res['label_dic'] = label_dic
    # Addresses for bid opening, bid submission, project, delivery, etc.
    data_res['addr_dic'] = addr_dic
    # Limit the number of products
    data_res['product'] = data_res['product'][:500]
    data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
    # Whether this is a deposit project
    data_res['is_deposit_project'] = deposit_project
    data_res['pinmu_name'] = pinmu_name # item category name
    data_res['policies'] = list_policy # policies and regulations
    data_res['bid_score'] = bid_score # bid evaluation score
    data_res['time_planned'] = time_dic.get('time_planned', '') # planned tender time
    data_res['code_investment'] = code_investment # investment project code
    for k, v in kv_single_dic.items(): # fill values that are still missing from the kv-tree
        if data_res.get(k, '') == '':
            data_res[k] = v
    for k, v in kv_addr_dic.items(): # fill missing addresses from the kv-tree
        if data_res['addr_dic'].get(k, '') == '' or re.search('时间：', data_res['addr_dic'][k]):
            data_res['addr_dic'][k] = v

    _extract_json = json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
    _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
    return _extract_json


def test1(name, content, base_url="http://192.168.2.102:15030", timeout=None):
    """Post one document to the remote ``/article_extract`` HTTP service.

    Args:
        name: Value sent as the ``id`` field of the JSON payload.
        content: Value sent as the ``content`` field of the JSON payload.
        base_url: Scheme, host and port of the service. Defaults to the
            address that used to be hard-coded here.
        timeout: Seconds handed to ``requests.post``. ``None`` (the default)
            waits indefinitely, which is what the function always did; pass a
            number to fail fast when the service is unreachable or stalled.

    Returns:
        The raw response body decoded as UTF-8 text.
    """
    payload = {
        "content": content,
        "id": name,
    }
    request_headers = {'Content-Type': 'application/json'}
    # Tolerate a trailing slash on a caller-supplied base URL.
    endpoint = base_url.rstrip('/') + '/article_extract'
    response = requests.post(endpoint, json=payload, headers=request_headers,
                             verify=True, timeout=timeout)
    return response.content.decode("utf-8")

def get_ent_context(list_sentences, list_entitys):
    """Render a one-line context summary for every org/company/money entity.

    Args:
        list_sentences: Sequence whose first item is an iterable of sentence
            objects exposing ``sentence_index`` and ``sentence_text``.
        list_entitys: Iterable of entity lists. Each entity exposes
            ``entity_type``, ``label``, ``values``, ``sentence_index``,
            ``wordOffset_begin``, ``wordOffset_end`` and ``entity_text``.

    Returns:
        The summaries joined with newlines. Each summary shows the entity
        type, its label, ``values[label]`` with four decimals, and up to ten
        characters of sentence text on either side of the entity.
    """
    # Sentences are looked up by position after ordering on sentence_index.
    ordered = sorted(list_sentences[0], key=lambda sent: sent.sentence_index)
    wanted_types = ('org', 'company', 'money')
    lines = []
    for entities in list_entitys:
        for ent in entities:
            if ent.entity_type not in wanted_types:
                continue
            text = ordered[ent.sentence_index].sentence_text
            start = ent.wordOffset_begin
            stop = ent.wordOffset_end
            left = text[max(0, start - 10):start]
            right = text[stop:stop + 10]
            fields = (ent.entity_type, ent.label, ent.values[ent.label],
                      left, ent.entity_text, right)
            lines.append("%s %d %.4f; %s ## %s ## %s" % fields)
    return '\n'.join(lines)

def get_role_context(docid, list_sentences, list_entitys):
    """Build one context row per org/company entity of a document.

    Args:
        docid: Identifier copied into the first field of every row.
        list_sentences: Sequence whose first item is an iterable of sentence
            objects exposing ``sentence_index`` and ``sentence_text``.
        list_entitys: Iterable of entity lists. Each entity exposes
            ``entity_id``, ``entity_type``, ``label``, ``values``,
            ``sentence_index``, ``wordOffset_begin``, ``wordOffset_end`` and
            ``entity_text``.

    Returns:
        A list of tuples ``(docid, entity_id, entity_type, label,
        values[label] formatted with four decimals, span[0], entity_text,
        span[1])``, where ``span`` is the result of ``get_context`` called
        with ``size=40`` and ``center_include=False``.
    """
    # Sentences are looked up by position after ordering on sentence_index.
    ordered = sorted(list_sentences[0], key=lambda sent: sent.sentence_index)
    role_types = ('org', 'company')
    rows = []
    for entities in list_entitys:
        for ent in entities:
            if ent.entity_type not in role_types:
                continue
            ent_id = ent.entity_id
            host = ordered[ent.sentence_index]
            # presumably span[0] / span[1] are the text before / after the
            # entity -- confirm against get_context
            span = get_context(host.sentence_text, ent.wordOffset_begin,
                               ent.wordOffset_end, size=40, center_include=False)
            score = '%.4f' % ent.values[ent.label]
            rows.append((docid, ent_id, ent.entity_type, ent.label, score,
                         span[0], ent.entity_text, span[1]))
    return rows

# Ad-hoc manual test entry point: runs only when this file is executed as a script.
if __name__=="__main__":
    import pandas as pd  # referenced only by the commented-out experiments further down
    t1 = time.time()  # start timestamp; its visible uses are all in commented-out timing code below
    # Sample input text kept for reference (not used by the live code):
    # text = '中标人：广州中医药有限公司，招标人：广州市第一人民医院， 代理机构：希达招标代理有限公司。招标金额：100万元， 手续费：100元，总投资：1亿元。中标金额：50000元。合同金额：50000万元。'
    title = '打印机'  # sample title; its visible uses are all in commented-out predict(...) calls below
    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
    # for i in range(30,50,1):
    #     text = df.loc[i, 'dochtmlcon']
    #     rs = json.loads(predict('', text, ''))
    #     print(rs['demand_info'])
    #     print(rs['product'])
    #     print(rs['product_attrs'])
    # print(rs)

    # df2 = pd.read_csv('E:/导出数据/存款入围框架采购等公告_输入要素.csv')
    # df = pd.read_csv('E:\导出数据/存款入围框架采购等公告_预测结果0830.csv')
    # df1 = pd.read_csv('E:\导出数据/存款入围框架采购等公告_html.csv')
    # df = df.merge(df1, on='docid', how='left')
    # print(len(df), df.columns)
    # df['rs'] = df['extract_json1'].apply(lambda x: json.loads(x))
    # docids = []
    # n1 = n2 = 0
    # for docid, d, html in zip(df['docid'], df['rs'], df['dochtmlcon']):
    #     if d['docchannel']['docchannel'] == '招标公告' and '元' in html and 'Project' in d['prem'] and float(
    #             d['prem']['Project']['tendereeMoney']) == 0:
    #         docids.append(docid)
    #         n1 += 1
    #     else:
    #         n2 += 1
    # print(n1, n2)
    # df = df[df['docid'].isin(docids)]
    #
    # df = df.merge(df2, on='docid', how='left')
    # df.fillna('', inplace=True)
    # df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
    # print(df.columns)
    # # df = df[:10]
    # print(len(df))
    #
    # l = []
    # for docid, text, title, page_time,web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
    #                                                                    df['web_source_no'], df['web_source_name'], df['original_docchannel']):
    #     rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
    #     l.append((docid, rs, content, roles))
    # df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
    # df.to_csv('E:\导出数据/存款入围框架采购等公告_招标公告无招标金额预测结果.csv')

    # df = pd.read_csv('E:\角色金额数据/银行类招标金额缺失公告_输入要素.csv')
    # # df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_3_html.csv')
    # df2 = pd.read_csv('E:/角色金额数据/银行缺招标金额公告_012_html.csv')
    # print(len(df), len(df2))
    # df = df[df['docid'].isin(df2['docid'])]
    # print(len(df))
    # df = df.merge(df2, how='left', on='docid')
    # print(len(df))
    #
    # # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2.xlsx')
    # # # df1 = pd.read_excel('E:\角色金额数据/银行缺招标金额公告_检查汇总2_补充招标内容金额后1105.xlsx')
    # # df1.fillna('', inplace=True)
    # # df1 = df1[df1['tendereeMoney']==0]
    # # df = df.merge(df1, on='docid', how='right')
    #
    # df = df[['docid', 'doctitle', 'page_time', 'web_source_no', 'web_source_name', 'original_docchannel', 'dochtmlcon']]
    # print(df.columns)
    # # df = df[:10]
    # print(len(df))
    #
    # l = []
    # for docid, text, title, page_time,web_no, web_name, channel in zip(df['docid'], df['dochtmlcon'], df['doctitle'], df['page_time'],
    #                                                                    df['web_source_no'], df['web_source_name'], df['original_docchannel']):
    #     rs, content, roles = predict('', text, title, page_time, web_no, web_name, channel)
    #     l.append((docid, rs, content, roles))
    # df = pd.DataFrame(l, columns=['docid', 'rs', 'content', 'roles'])
    # # df.to_csv('E:\角色金额数据/银行缺招标金额公告_3预测结果.csv')
    # # df.to_csv('E:\角色金额数据/银行缺招标金额公告_012预测结果.csv')
    # # df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果.csv')
    # df.to_csv('E:\角色金额数据/银行缺招标金额公告_检查汇总2_修复后预测结果1106.csv')
    #
    # with open('2.html', 'r', encoding='utf-8') as f:
    #     text = f.read()
    #     t1 = time.time()
    #     print(predict('', text, title))
    #     t2 = time.time()
    #     print(predict('', text, title))
    #     t3 = time.time()
    #     print('第一次耗时：%.4f, 第二次耗时：%.4f'%(t2-t1, t3-t2))
    # print(predict('',text,title))

    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
    # new_prem = []
    # for i in range(len(df)):
    #     i = 530
    #     doc_id = df.loc[i, 'docid']
    #     text = df.loc[i, 'html']
    #     # title = df.loc[i, 'doctitle']
    #     rs = predict(doc_id,text)
    #     rs = json.loads(rs)
    #     prem = json.dumps(rs['prem'], ensure_ascii=False)
    #     # print(rs)
    #     new_prem.append(prem)
    #     print(prem)
    #     break
    # df['new_prem'] = pd.Series(new_prem)
    # print('耗时：', time.time()-t1)
    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
    # # pass

    import json
    a = json.loads('''
    { "addr_dic": { "addr_bidopen": "七、开标时间和地点 1.时间：2025年04月14日 09:00(北京时间) 2.地点(网址：浙商银行数智采购一体化管理平台(https://ccgp.szcgpt.czbank.com，现场开标地点为https://www.zcygov.cn", "addr_bidsend": "六、投标截止时间和地点1.时间：2025年05月14日 09:30(北京时间)2.地点(网址：浙商银行数智采购一体化管理平台(https://ccgp.szcgpt.czbank.com)" }, "aptitude": "四、投标人的资格条件，(1)投标人须为具有独立承担民事责任能力的法人或具备国家认可经营资，格的其他组织。(提供合法有效的营业执照复印件，并加盖公章)，(2)投标人在最近三年内(2022年04月01日起至今，以法院判决书落款，日期为准)的经营活动中没有行贿犯罪、串通投标犯罪等重大违法记录；在“信，用中国(www.creditchina.gov.cn)”无失信惩戒等情况。(提供承诺书及信用，中国网站相关截图，并加盖公章)，第2页(3)投标人须承诺：在最近三年内(2022年04月01日起至今)与浙商银行，的项目(如有)合作过程中，没有出现重大合同违约、泄露商业秘密或技术秘密，等事件。(提供承诺书，并加盖公章)，(4)投标人须承诺：如在本次公开招标中入围，在入围期间不得拒绝接受，招标人后续采购合同，不得接受合同后采取各种方式拒绝履约；否则，招标人有，权取消入围资格，并作出相应处理。(提供承诺书，并加盖公章)，(5)单位负责人为同一人或者存在控股、管理关系的不同单位，不得同时，参加本次采购活动。(提供承诺书，并加盖公章)，(6)除单一来源采购项目外，为采购项目提供整体设计、规范编制或者项，目管理、监理、检测等服务的供应商，不得再参加该采购项目的其他采购活动。(提供承诺书，并加盖公章)，(7)投标人若为经营型企业须提供具有行业主管部门颁发的《食品经营许，可证》；若为生产型企业须提供《食品生产许可证》(提供有效的证书复印件加，盖公章)。(8)投标人没有出现在市场监督管理局“经营异常名录”中。(提供国家，企业信用信息公示系统“www.gsxt.gov.cn”列入经营异常名录信息截图，并加，盖公章)，(9)本项目不接受联合体(提供承诺书，并加盖公章)，本项目(/)接收联合体投标，", "attachmentTypes": "pdf", "bid_score": [], "bidway": "公开招标", "candidate": "", "code": [ "NJJC2024MYGKb024", "HG-241224-YH01" ], "code_investment": "", "cost_time": { "attrs": 0.74, "codename": 1.23, "deposit": 0.0, "district": 0.14, "kvtree": 0.04, "moneygrade": 0.0, "nerToken": 2.7, "outline": 0.08, "pb_extract": 0.55, "person": 0.01, "prem": 0.07, "preprocess": 4.22, "product": 1.25, "product_attrs": 0.12, "roleRuleFinal": 0.02, "rolegrade": 0.01, "rule": 0.07, "rule_channel": 0.04, "rule_channel2": 0.25, "tableToText": 0.5400543594360352, "tendereeRuleRecall": 0.0, "time": 0.05, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "dict_enterprise": { "中华人民共和国": { "in_text": 0 }, "京东": { "in_text": 0 }, "京东自营店": { "in_text": 0 }, "人民法院": { "in_text": 0 }, "南京恒广工程管理有限公司": { "credit_code": "913201136749013829", "in_text": 0 }, "南京银行和燕路支行": { "in_text": 0 }, "天猫旗舰店": { "in_text": 0 }, 
"浙商银行": { "in_text": 2 }, "浙商银行南京分行": { "in_text": 2 }, "浙商银行股份有限公司": { "credit_code": "91330000761336668H", "in_text": 1 }, "浙商银行股份有限公司南京分行": { "credit_code": "91320000682167245K", "in_text": 2 }, "高新技术企业": { "credit_code": "91620100MA71X5MY2B", "in_text": 0 } }, "district": { "area": "华东", "city": "南京", "district": "未知", "is_in_text": false, "province": "江苏" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告", "use_original_docchannel": 0 }, "docid": "", "doctitle_refine": "关于浙商银行股份有限公司南京分行南京地区2025年宣传物品集中项目", "exist_table": 1, "extract_count": 11, "fail_reason": "", "fingerprint": "md5=ebde4b7a154fc5a996093bad41a37b5a", "industry": { "class": "租赁和商务服务业", "class_name": "广告业", "subclass": "商务服务业" }, "is_deposit_project": false, "label_dic": { "mode_of_partipation": 1, "need_ca": 1, "need_performance": 1, "need_qualification": 1 }, "match_enterprise": [ { "from": "南京恒广工程管理有限公司", "to": "南京恒广工程管理有限公司", "type": "update" }, { "from": "", "to": "高新技术企业", "type": "add" } ], "match_enterprise_type": 3, "moneys": [], "moneys_attachment": [ 4750000.0, 600.0, 40000.0, 100000.0 ], "moneysource": "", "name": "浙商银行南京分行南京地区2025年度宣传物品集中采购入围项目", "nlp_enterprise": [ "浙商银行股份有限公司南京分行", "浙商银行股份有限公司", "浙商银行", "浙商银行南京分行" ], "nlp_enterprise_attachment": [ "浙商银行南京分行", "浙商银行股份有限公司南京分行", "南京恒广工程管理有限公司", "浙商银行", "南京银行和燕路支行", "中华人民共和国", "人民法院", "京东自营店", "天猫旗舰店", "京东", "高新技术企业" ], "pb": { "industry": "司法", "location": "南京地区", "projectDigest": "项目内容：三、采购方式：公开招标，四、投标人的资格条件，（1）投标人须为具有独立承担民事责任能力的法人或具备国家认可经营资，格的其他组织。（提供合法有效的营业执照复印件，并加盖公章），（2）投标人在最近三年内（2022年04月01日起至今，以法院判决书落款，日期为准）的经营活动中没有行贿犯罪、串通投标犯罪等重大违法记录；在“信，用中国（www.creditchina.gov.cn）”无失信惩戒等情况。（提供承诺书及信用，中国网站相关截图，并加盖公章），第2页（3）投标人须承诺：在最近三年内（2022年04月01日起至今）与浙商银行，的项目（如有）合作过程中，没有出现重大合同违约、泄露商业秘密或技术秘密", "project_name_refind": "浙商银行南京分行南京地区2025年宣传物品集中采购入围", "project_property": "新建" }, "pb_project_name": "浙商银行南京分行南京地区2025年宣传物品集中采购入围", "person_review": [], "pinmu_name": "", "policies": [], "prem": { "Project": { "code": 
"", "name": "浙商银行南京分行南京地区2025年度宣传物品集中采购入围项目", "roleList": [ { "address": "南京市鼓楼区中山北路9号", "linklist": [ [ "杜老师", "025-86823959" ], [ "杜宇烽", "025-86823959" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_prob": 0.9499998331069947, "role_text": "浙商银行股份有限公司南京分行", "serviceTime": "" }, { "address": "南京市栖霞区马群街道紫东路2号紫东国际创意园C14栋601", "linklist": [ [ "朱工", "19951754271" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "南京恒广工程管理有限公司", "serviceTime": "" } ], "tendereeMoney": "4750000", "tendereeMoneyUnit": "万元", "uuid": "663f7f3b-f7b6-48fe-a91c-c98df446214b" } }, "process_time": "2025-04-23 11:13:53", "product": [ "宣传物品", "本招标项目采购期限一年。本招标项目采购的物品为浙商银行南京地区机构在客户宣传、营销活动中的实物礼品，包括食品粮油类、日用品类、数码电器类、文创用品类、家纺服饰类等品类。" ], "product_attrs": { "data": [ { "product": "本招标项目采购期限一年。本招标项目采购的物品为浙商银行南京地区机构在客户宣传、营销活动中的实物礼品，包括食品粮油类、日用品类、数码电器类、文创用品类、家纺服饰类等品类。", "quantity": "1", "quantity_unit": "项" } ], "header": [ "_数量______项目内容___" ], "header_col": [ "项目内容_数量" ] }, "project_contacts": [ [ "朱工", "19951754271" ] ], "project_label": { "标题": {}, "核心字段": { "营销物料": [ [ "宣传品", 1 ] ], "食品": [ [ "食品", 1 ] ] } }, "property_label": "", "proportion": "", "requirement": 
"二、项目内容：，十一、电子招投标的说明，1.1电子招投标：本项目以数据电文形式，依托“浙商银行数智采购一体化管理平台(https://ccgp.szcgpt.czbank.com)”进行招投标活动，不接受纸质投标文件；2投标准备：注册账号--点击“供应商注册”，进行采购供应商资料填写；申领CA数字证书---浙商银行数智采购一体化管理平台CA申领使用操作手册”；安装“浙商银行数智采购一体化管理投标客户端”--前往“浙商银行数智采购一体化管理平台-帮助中心-资料下载-浙商银行数智采购一体化管理投标客户端下载”进行下载并安装；3招标文件的获取：使用账号登录或者使用CA登录浙商银行数智采购一体化管理平台；进入“项目采购”应用，在获取采购文件菜单中选择项目，获取招标文件；4投标文件的制作：在“浙商银行数智采购一体化管理投标客户端”中完成“填写基本信息”、“导入投标文件”、“标书关联”、“标书检查”、“电子签名”、“生成电子标书”等操作；5采购人、采购代理机构将依托浙商银行数智采购一体化管理平台完成本项目的电子交易活动，平台不接受未按上述方式获取招标文件的供应商进行投标活动；6对未按上述方式获取招标文件的供应商对该文件提出的质疑，采购人或采购代理机构将不予处理；7不提供招标文件纸质版；8投标文件的传输递交：投标人在投标截止时间前将加密的投标文件上传至浙商银行数智采购一体化管理平台，还可以在投标截止时间前直接提交或者以邮政快递方式递交备份投标文件1份。备份投标文件的制作、存储、密封详见招标文件第三章投标人须知投标人须知前附表；9投标文件的解密：投标人按照平台提示和招标文件的规定在半小时内完成在线解密。通过“浙商银行数智采购一体化管理平台”上传递交的投标文件无法按时解密，投标供应商递交了备份投标文件的，以备份投标文件为依据，否则视为投标文件撤回。通过“浙商银行数智采购一体化管理平台”上传递交的投标文件已按时解密的，备份投标文件自动失效。投标人仅提交备份投标文件，未在电子交易平台传输递交投标文件的，投标无效；10具体操作指南：详见浙商银行数智采购一体化管理平台“帮助中心-资料下载-浙商银行数智采购一体化管理平台供应商操作手册”。2.若对项目采购电子交易系统操作有疑问，可平台服务热线95763获取热线服务帮助。CA问题联系电话(人工：汇信CA400-888-4636；天谷CA400-087-8198。附件信息：浙商银行南京分行南京地区2025年度宣传物品集中采购入围项目.pdf，1.4M。", "serviceTime": { "service_days": 365, "service_end": "", "service_start": "" }, "success": true, "time_bidclose": "2025-05-14 09:30:00", "time_bidopen": "2025-04-14 09:00:00", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_contractEnd": "", "time_contractStart": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2025-04-21", "time_getFileStart": "2025-04-14 09:00:00", "time_listingEnd": "", "time_listingStart": "", "time_planned": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "time_signContract": "", "total_tendereeMoney": "4750000", "total_tendereeMoneyUnit": "万元", "version_date": "2025-04-22", "word_count": { "正文": 3019, "附件": 28802 } }
    ''')
    # JSON text describing one attachment (fileTitle / fileMd5 / classification);
    # it is handed to extractCount as a raw string, not parsed here.
    b = '''
    [{"fileTitle":"浙商银行南京分行南京地区2025年度宣传物品集中采购入围项目.pdf","fileMd5":"cc435162fb36014f339844ec91fec1e0","classification":"招标文件"}]
    '''
    # Remaining sample arguments -- presumably the source web-site name and a
    # date string; confirm against extractCount's signature.
    c = "浙商银行数智采购平台"
    d = '2025-04-23'
    # Run extractCount on the parsed sample extraction result `a` together with
    # the inputs above and print whatever it returns.
    print(extractCount(a,b,c,d))