Эх сурвалжийг харах

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

znj 1 жил өмнө
parent
commit
58a6d152e6

+ 53 - 6
BiddingKG/dl/entityLink/entityLink.py

@@ -69,13 +69,41 @@ def get_place_list():
 place_list = get_place_list()
 place_pattern = "|".join(place_list)
 
+def is_short(shorter_cut, longer):
+    '''
+    Check whether shorter_cut is an abbreviation of longer: every item of shorter_cut must occur in longer, in order.
+    :param shorter_cut: abbreviation; iterated item by item (presumably per character when a str is passed - confirm against callers)
+    :param longer: full name to search in
+    :return: 1 if every item was found in order (also when shorter_cut is empty), otherwise 0
+    '''
+    flag = 1
+    for words in shorter_cut:
+        if words in longer:
+            longer = longer[longer.find(words) + len(words):]  # keep only the text after this match so later items must appear after it
+        else:
+            flag = 0
+            break
+    if flag:
+        return 1
+    else:
+        return 0
 
 def link_entitys(list_entitys,on_value=1):#on_value=0.81
     for list_entity in list_entitys:
         range_entity = []
+        short_entity = []
+        long_entity = []
+        n = 0
         for _entity in list_entity:
             if _entity.entity_type in ["org","company"]:
                 range_entity.append(_entity)
+                if len(_entity.entity_text) in [4, 5, 6]:
+                    short_entity.append(_entity)
+                if len(_entity.entity_text)>6:
+                    long_entity.append(_entity)
+                n += 1
+                if n > 1000:
+                    break
         range_entity = range_entity[:1000]
         #替换公司的逻辑有问题,先取消
         # for first_i in range(len(range_entity)):
@@ -90,12 +118,31 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
         #             _entity.linked_entitys.append(_ent)
         #             _ent.linked_entitys.append(_entity)
         #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
-        #替换公司名称
-        for _entity in range_entity:
-            if re.search("公司",_entity.entity_text) is None:
-                for _ent in _entity.linked_entitys:
-                    if re.search("公司$",_ent.entity_text) is not None:
-                        if len(_ent.entity_text)>len(_entity.entity_text):
+        # #替换公司名称
+        # for _entity in range_entity:
+        #     if re.search("公司",_entity.entity_text) is None:
+        #         for _ent in _entity.linked_entitys:
+        #             if re.search("公司$",_ent.entity_text) is not None:
+        #                 if len(_ent.entity_text)>len(_entity.entity_text):
+        #                     _entity.entity_text = _ent.entity_text
+
+        if short_entity and long_entity:
+            for first_i in range(len(short_entity)):
+                _entity = short_entity[first_i]
+                if is_enterprise_exist(_entity.entity_text): # 实体表存在的不替换
+                    continue
+                if _entity.label == 0 and re.search('(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校)', _entity.entity_text)==None:
+                    ree_l = []
+                    other_l = []
+                    for second_i in range(len(long_entity)):
+                        _ent = long_entity[second_i]
+                        if _ent.label in [0,1,5] and is_short(_entity.entity_text, _ent.entity_text):
+                            if _ent.label in [0 ,1]:
+                                ree_l.append(_ent)
+                            elif _ent.label in [5]:
+                                other_l.append(_ent)
+                    for _ent in ree_l + other_l:
+                        if is_enterprise_exist(_ent.entity_text) or re.search('有限(责任)?公司', _ent.entity_text):
                             _entity.entity_text = _ent.entity_text
 
         # 2021/12/21 替换通过字典识别到的取长度最大的相似实体

Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 92 - 12
BiddingKG/dl/interface/Preprocessing.py


+ 6 - 3
BiddingKG/dl/interface/extract.py

@@ -337,10 +337,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # '''限制行业最高金额'''
     # getAttributes.limit_maximum_amount(prem, industry) # 20230703取消,改为整合所有要素后面纠正
 
+    '''根据数据源最后召回招标人角色'''
+    prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, prem)
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-09-13'}
+    version_date = {'version_date': '2023-11-09'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''
@@ -399,11 +401,12 @@ def get_role_context(docid, list_sentences, list_entitys):
     for list_entity in list_entitys:
         for _entity in list_entity:
             if _entity.entity_type in ['org', 'company']:
+                idx = _entity.entity_id
                 sentence = sentences[_entity.sentence_index]
                 # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
                 #                    center_include=False, word_flag=True, text=_entity.entity_text)
-                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=20, center_include=False)
-                rs_list.append((docid, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
+                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
+                rs_list.append((docid,idx, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
                 _entity.entity_text, _span[1]))
     return rs_list
 

+ 23 - 0
BiddingKG/dl/interface/getAttributes.py

@@ -830,6 +830,8 @@ def getPackagesFromArticle(list_sentence, list_entity):
     if len(True_package2) > 2: # 同时包含多标段及多中标人的
         PackageList_scope = PackageList_scope + PackageList_scope2
     PackageList = get_package_scope(PackageList_scope)
+    if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project
+        return [], set(), {}
     return PackageList, PackageSet, dict_packageCode
 
 
@@ -3543,6 +3545,21 @@ def limit_maximum_amount(dic, list_entity):
     :param list_entity: 实体列表
     :return:
     '''
+    indu_amount = {
+        '计算机设备': 200000000,
+        '办公设备': 100000000,
+        '家具用具': 500000000,
+        '办公消耗用品及类似物品': 100000000,
+        '日杂用品': 100000000,
+        '餐饮业': 1000000000,
+        '物业管理': 1000000000,
+        '工程技术与设计服务': 1000000000,
+        '工程评价服务': 100000000,
+        '其他工程服务': 100000000,
+        '工程监理服务': 100000000,
+        '工程造价服务': 100000000,
+        '会计、审计及税务服务': 100000000,
+    }
     title = dic.get('doctitle_refine', '')
     name = dic.get('name', '')
     product = ','.join(dic.get('product', []))
@@ -3579,6 +3596,8 @@ def limit_maximum_amount(dic, list_entity):
         # print('快递限额')
         maximum_amount = 80000000
         minximum_amount = 10
+    elif industry in indu_amount:
+        maximum_amount = indu_amount[industry]
     # print('maximum_amount:', maximum_amount)
     for value in dic['prem'].values():
         for l in value['roleList']:
@@ -3598,6 +3617,8 @@ def limit_maximum_amount(dic, list_entity):
                     if flag and l["role_money"]['money_unit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(l["role_money"]['money'])):
                         l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                         # print('行业限额纠正连接金额')
+                    elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
+                        l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                     # elif flag and l["role_money"]['money_unit'] == '元':
                     #     l["role_money"]['money'] = 0
                 elif 0<float(l["role_money"]['money']) < minximum_amount:
@@ -3619,6 +3640,8 @@ def limit_maximum_amount(dic, list_entity):
                 if flag and value['tendereeMoneyUnit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(value['tendereeMoney'])):
                     value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
                     # print('行业限额纠正连接金额')
+                elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
+                    value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
                 # elif flag and value['tendereeMoneyUnit'] == '元':
                 #     value['tendereeMoney'] = 0
             elif 0<float(value['tendereeMoney']) < minximum_amount:

+ 19 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -81,6 +81,23 @@ class Model_role_classify_word():
         # print(_encode_span)
         return _encode_span
 
+    def fix_digit_eng(self, text):
+        '''
+        Normalize numbers, English/alphanumeric codes and similar tokens in text via ordered regex substitutions.
+        :param text: input string
+        :return: the string after all substitutions and character replacements below
+        '''
+        text = re.sub('第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)  # mask these ordinal phrases first so the later rules do not rewrite them
+        text = re.sub('第01(中标|成交)?候选人', '第一中标候选人', text)  # rewrite the "第01..." candidate spelling to one fixed phrase
+        text = re.sub('标段[一二三1-3]', '标段d', text)  # section followed by 1-3: replace the number with the placeholder d
+        text = re.sub('第[一二三1-3](标段?|[分子标]?包)', 'd标段', text)  # ordinal 1-3 before a section/package word -> 'd标段'
+        text = re.sub('[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)  # a letter plus 3+ code characters -> 'abc'
+        text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)  # bracketed 2+ digit numbers, or digit groups joined by : . - -> 'd'
+        text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)  # Chinese numerals of 2+ characters, or runs of 四 to 十 -> 'd'
+        text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)  # multi-digit numbers, decimals, and single digits 0 or 4-9 -> 'd'; single 1-3 are not matched
+        text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)  # delete serial numbers, "Nth time" markers and "N、" list prefixes
+        return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
+
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
         '''
         上下文数字化,使用字偏移
@@ -93,7 +110,8 @@ class Model_role_classify_word():
         '''
         _span = get_context(sentence_text, begin_index, end_index,size=size, center_include=False)  # size=12 center_include=True
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=20, word_flag=True, userFool=False)  # word_len=20
+        _span = [self.fix_digit_eng(text) for text in _span]
+        _encode_span = encodeInput(_span, word_len=30, word_flag=True, userFool=False)  # word_len=20
         # print(_encode_span)
         return _encode_span
     

+ 148 - 41
BiddingKG/dl/interface/predictor.py

@@ -60,6 +60,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "district": {"predictor": None, "Lock": RLock()},
                   'tableprem': {"predictor": None, "Lock": RLock()},
                   'candidate': {"predictor": None, "Lock": RLock()},
+                  'websource_tenderee': {"predictor": None, "Lock": RLock()},
                   }
 
 
@@ -107,6 +108,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = TablePremExtractor()
                 if _type == 'candidate':
                     dict_predictor[_type]["predictor"] = CandidateExtractor()
+                if _type == 'websource_tenderee':
+                    dict_predictor[_type]['predictor'] = WebsourceTenderee()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -790,6 +793,14 @@ class PREMPredict():
                 if re.search('拟邀请$', front):
                     label = 2
                     values[label] = 0.501
+                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?)(名称)?[是为:]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
+                    label = 1
+                    values[label] = 0.501
+                elif re.search('采用$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
+                    label = 5
+                elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
+                    label = 2
+                    values[label] = 0.501
             elif label == 2:
                 if re.search('中标单位和.{,25}签订合同', whole):
                     label = 0
@@ -806,6 +817,19 @@ class PREMPredict():
                 elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
+                elif re.search('税费', front) and re.search('^承担', behind):
+                    label = 5
+                elif re.search('第一候补|第一后备|备选', front):
+                    label = 2
+                    values[label] = 0.6
+                elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
+                    values[2] = 0.5
+                    label = 5
+                elif re.search('(承包权人|帐户名称):$', front):
+                    label = 5
+                elif re.search('合同供方:?$', front):
+                    label = 0
+                    values[label] = 0.5
             elif re.search('是否中标:是,供应商', front) and label == 5:
                 label = 2
                 values[label] = 0.9
@@ -821,6 +845,11 @@ class PREMPredict():
                     values[label] = 0.501
                 elif re.search('^:受', behind):  # 354009560 附件格式问题 ,中选中介服务机构通知书,编号:HZ2305120541,中汕项目管理有限公司:受惠东县人民政府大岭街道办事处委托
                     label = 5
+                elif re.search('发布机构', front) and re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站|馆)$', entity.entity_text):
+                    label = 0
+                    values[label] = 0.501
+                elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
+                    label = 5
             elif label in [3,4]:
                 if re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
                     label = 2
@@ -834,6 +863,9 @@ class PREMPredict():
                 elif re.search('\d+\.\d+[,、]?(中标|成交)候选人', front):
                     label = 5
                     values[label] = 0.501
+                elif re.search('第一名:$', front):
+                    label = 2
+                    values[label] = 0.7
             elif re.search('(中标|成交)通知书[,:]$', front) and re.search('^:', behind) and label != 2:
                 label = 2
                 values[label] = 0.8
@@ -1318,12 +1350,12 @@ class RoleRulePredictor():
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
                                         "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\
-                                        "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
+                                        "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
         self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
-                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^拟采购|^拟招标|^须购[买置]一批)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
         self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
@@ -1331,14 +1363,15 @@ class RoleRulePredictor():
         self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_50>" \
                "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
-               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?" \
+               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
                "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)"  # 承办单位:不作为中标 83914772
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
                                            "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)"  # 解决表头识别不到加逗号情况,需前面为,。空
         self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
-                                           "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
+                                           "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
+                                           "|结果公示如下:摇出球号:\d+号,中介机构:$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
 
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
                                          "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
@@ -1355,7 +1388,7 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|盖章)?[::是为]+$)"
+        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?\w{,5})?|如下|:?牵头人)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1530,9 +1563,11 @@ class RoleRulePredictor():
                                                            word_flag=True, use_text=True, text=re.sub(")", ")",
                                                                                                       re.sub("(", "(",
                                                                                                              p_entity.entity_text)))
-                                        if str(_span[1] + _span[2][:len(str(_name))]).find(
+                                        if _span[2].startswith(":"): # 实体后面为冒号的不作为招标人,避免项目名称出错中标变招标  368122675 陇西兴恒建建筑有限责任公司:线路安全保护区内环境治理专项整改(第二标段)项目
+                                            break
+                                        if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
                                                 _name) >= 0:
-                                            if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
+                                            if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
                                                 find_flag = True
                                                 _label = 1
                                                 p_entity.label = _label
@@ -1543,6 +1578,8 @@ class RoleRulePredictor():
                                                 _label = 0
                                                 p_entity.label = _label
                                                 p_entity.values[int(_label)] = on_value
+                                                if 6<len(p_entity.entity_text) < 20: # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
+                                                    p_entity.values[int(_label)] += 0.005
                                                 break
                                     if p_entity.sentence_index >= 4:
                                         break
@@ -1697,7 +1734,7 @@ class RoleRulePredictor():
                                     p_entity.label = 0
                                     # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
             if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
-                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
+                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
                     article.title+article.content[:100]):
                 for p_entity in candidates:
                     # print('只有一个候选人的作为中标人', p_entity.entity_text)
@@ -2143,16 +2180,18 @@ class TendereeRuleRecall():
 
 class RoleGrade():
     def __init__(self):
-        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
-        self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
-        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包|甲)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
+        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
+        self.tenderee_center_8 = "(?P<tenderee_center_8>受.{5,20}委托)"
+        self.tenderee_left_8 = "(?P<tenderee_left_8>(尊敬的供应商|(需求|最终|发包|征集|甲|转让|出租|处置)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
+        self.tenderee_left_6 = "(?P<tenderee_left_6>(发布|业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方|发布机构)"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]|排名:1)"
+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]|排[序]:1|名次:1)"
         self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
-        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
-        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
-        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
-                             self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
+        self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
+        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.agency_left_9, self.winTenderer_left_9,
+                             self.winTenderer_left_8,self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9]
     def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
         '''
         根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
@@ -2172,7 +2211,7 @@ class RoleGrade():
             if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
                 text = sentences[entity.sentence_index].sentence_text
                 in_att = sentences[entity.sentence_index].in_attachment
-                pre_prob = entity.values[entity.label]
+                pre_prob = entity.values[entity.label] # 模型预测角色概率
                 b = entity.wordOffset_begin
                 e = entity.wordOffset_end
                 not_found = 1
@@ -2195,9 +2234,11 @@ class RoleGrade():
                         _prob = int(_prob)*0.1
                         # print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values)
                         if in_att:
-                            _prob = _prob - 0.2
-                        if pre_prob < _prob:
+                            _prob = _prob - 0.1 # 0.2
+                        if pre_prob < _prob: # 如果模型预测概率小于关键词概率
                             _prob = 0.65
+                        if len(entity.entity_text) < 6: # 如果实体名称小于6个字,概率再降0.05
+                            _prob -= 0.05
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0
                         # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
@@ -2964,7 +3005,7 @@ class ProductAttributesPredictor():
         total_price_list = []  # 总价列表,拥有判断是否为几行产品合计总价
         # print('表格数:', len(tables))
 
-        for i in range(len(tables)-1, -1, -1):
+        for i in range(len(tables)):  # (len(tables)-1, -1, -1) 由从最后到前改为 前到后
             table = tables[i]
             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
                 table.string = table.get_text()
@@ -3096,7 +3137,7 @@ class ProductAttributesPredictor():
                             else:
                                 header_quan_unit = ""
 
-                    if found_header and len(headers)<1:  # 只保留出现的第一个表头
+                    if found_header and ('_'.join(header_list) not in headers or '_'.join(header_list2) not in headers_demand):# and len(headers)<1:  # 只保留出现的第一个表头
                         headers.append('_'.join(header_list))
                         headers_demand.append('_'.join(header_list2))
                         header_col.append('_'.join(tds))
@@ -3300,13 +3341,15 @@ class ProductAttributesPredictor():
                                 #         except:
                                 #             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
 
-                                if (product, unitPrice, quantity) not in product_set:
-                                    product_set.add((product, unitPrice, quantity))
+                                # if (product, unitPrice, quantity) not in product_set:
+                                #     product_set.add((product, unitPrice, quantity))
+                                if (product, unitPrice,) not in product_set: # 2023/09/22 改为只判断产品/单价,只要两个一样就不作为新产品 避免多个表格重复表达有些没数量造成重复提取 353858683
+                                    product_set.add((product, unitPrice))
                                     product_link.append(link)
+                                    if link['unitPrice']:
+                                        unit_price_list.append(link['unitPrice'])
                                     if link['unitPrice'] != "" and link['quantity'] != '':
                                         try:
-                                            if link['unitPrice']:
-                                                unit_price_list.append(link['unitPrice'])
                                             total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
                                         except:
                                             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
@@ -3326,17 +3369,17 @@ class ProductAttributesPredictor():
                     i += 1
                 else:
                     i += 1
-        if len(total_price_list)>0 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
-            # print('总价一半以上重复的为多行一个总价,需去掉')
-            for link in product_link:
+        if len(total_price_list)>1 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
+            # print('总价一半以上重复的为多行一个总价,需去掉', total_price_list)
+            for link in product_link:  # 预防最后一列总价为所有产品总价,列补全后所有产品总价一样情况
                 if 'total_price' in link:
                     link['total_price'] = ""
         if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
             # print('如果单价重复率高不算总产品价避免错误')
             total_product_money = 0
-            for link in product_link:
-                if 'unitPrice' in link:
-                    link['unitPrice'] = ""
+            # for link in product_link:
+            #     if 'unitPrice' in link:
+            #         link['unitPrice'] = ""
 
         if len(product_link)>0:
             attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
@@ -3768,11 +3811,13 @@ class DocChannel():
           '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
           '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
           '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
-          '合同公告': '(合同(成交|变更)?|(履约|验收)(结果)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$',
+          '合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
           '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示',
-          '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$',
+          '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$', # |开标(记录|信息|情况)
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
+          '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果',
+          '验收合同': '验收公告|验收单公示|验收结果公告|验收报告公示|验收意见报告|履约公告|履约结果公告'
       }
 
   def load_life(self,life_model,config):
@@ -4165,6 +4210,12 @@ class DocChannel():
                       return life_list[0], msc
                   return '', msc
               return '招标答疑', msc
+          elif '开标记录' in life_kw_title:
+              if '开标结果' in title and is_contain_winner(prem_json):
+                  return '中标信息', msc
+              return '开标记录', msc
+          elif '验收合同' in life_kw_title:
+              return '验收合同', msc
           elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
               if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                   return '招标公告', msc
@@ -5276,7 +5327,10 @@ class TableTag2List():
                     try:
                         if text_process != None:
                             # text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
-                            td_text = re.sub('\xa0', '', text_process(cell, final=False))
+                            # td_text = re.sub('\xa0', '', text_process(cell, final=False))
+                            td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
+                            if len(td_text)>30:
+                                td_text = re.sub('\xa0', '', text_process(cell, final=False))
                             if td_text == "":
                                 td_text = ' '
                             text = [td_text,0]
@@ -5339,7 +5393,7 @@ class TablePremExtractor(object):
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
             "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
-            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因",
+            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
@@ -5353,7 +5407,7 @@ class TablePremExtractor(object):
 
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -5426,6 +5480,7 @@ class TablePremExtractor(object):
         '''
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
+        text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -5460,7 +5515,7 @@ class TablePremExtractor(object):
             bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
             win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
 
-            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
+            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
                 break
             if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2:  # 内容为空或全部一样 停止匹配
@@ -5480,7 +5535,7 @@ class TablePremExtractor(object):
                 project_name = ''
             previous_package = package_code
 
-            if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
+            if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
                 break
             if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
                 continue
@@ -5691,7 +5746,7 @@ class CandidateExtractor(object):
             self.headerset = pickle.load(f)
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -5748,6 +5803,7 @@ class CandidateExtractor(object):
         '''
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
+        text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -5782,7 +5838,7 @@ class CandidateExtractor(object):
             second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
 
-            if set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
                 # print('包含表头, 停止匹配')
                 break
             if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
@@ -6010,6 +6066,57 @@ def role_special_predictor(web_source_name, content, nlp_enterprise):
         if ser and ser.group(1) in nlp_enterprise:
             return ser.group(1)
 
+class WebsourceTenderee():
+    '''
+    Fill in a missing tenderee from the data source's unique tenderee.
+
+    The mapping ``web_source_no -> tenderee name`` is loaded once when the
+    object is constructed and is then used to patch ``prem`` results.
+    '''
+    def __init__(self):
+        # NOTE: despite the ``.pkl`` extension the file is parsed as UTF-8 JSON text.
+        with open(os.path.dirname(__file__)+'/websource_tenderee.pkl', 'r', encoding='utf-8') as f:
+            self.webno2ree = json.load(f)
+
+    @staticmethod
+    def _new_tenderee_role(web_ree):
+        '''Build a fresh tenderee entry for ``roleList`` with empty money/link fields.'''
+        return {'role_name': 'tenderee',
+                'role_text': '%s' % web_ree,
+                'role_money': {'money': 0, 'money_unit': '',
+                               'floating_ratio': '',
+                               'downward_floating_ratio': '',
+                               'discount_ratio': ''},
+                'linklist': [],
+                'serviceTime': '',
+                'address': ''}
+
+    def get_websource_tenderee(self, web_source_no, prem):
+        '''
+        Adjust the tenderee in ``prem`` using the data source's unique tenderee.
+
+        :param web_source_no: data source number, looked up in ``self.webno2ree``
+        :param prem: list whose first element holds the packages under the ``'prem'`` key;
+                     it is modified in place
+        :return: the same ``prem`` object
+        '''
+        web_ree = self.webno2ree.get(web_source_no, '')
+        if web_ree == '':  # no known tenderee for this source: leave prem untouched
+            return prem
+
+        packages = prem[0]['prem']
+        if 'Project' not in packages:
+            # No project-level package yet: create one holding only the tenderee.
+            packages['Project'] = {'code': '',
+                                   'tendereeMoney': 0,
+                                   'roleList': [self._new_tenderee_role(web_ree)]}
+            return prem
+
+        role_list = packages['Project']['roleList']
+        for d in role_list:
+            if d['role_name'] == 'tenderee':
+                # Only an empty tenderee is overwritten; an extracted name is kept.
+                # (A disabled heuristic used to also replace names shorter than 6
+                # characters that lacked an institution keyword such as
+                # 医院/学院/学校/银行/研究院 or 有限(责任)?公司 when web_ree had one.)
+                if d['role_text'] == "":
+                    d['role_text'] = web_ree
+                return prem
+
+        # No tenderee role at all: add one.
+        role_list.append(self._new_tenderee_role(web_ree))
+        return prem
+
 
 def getSavedModel():
     #predictor = FormPredictor()

BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 0 - 0
BiddingKG/dl/interface/websource_tenderee.pkl


BIN
BiddingKG/dl/product/data/dev_data.pkl


BIN
BiddingKG/dl/product/data/dev_data2.pkl


BIN
BiddingKG/dl/product/data/train_data.pkl


BIN
BiddingKG/dl/product/data/train_data2.pkl


+ 619 - 0
BiddingKG/dl_dev/role/context_model.py

@@ -0,0 +1,619 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/7/28 0028 11:32 
+
+import os
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import sys
+sys.path.append(os.path.abspath("../../.."))
+import pandas as pd
+
+from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
+from BiddingKG.dl.common.Utils import *
+import tensorflow as tf
+import tensorflow.keras.backend as K
+# from tensorflow.keras import layers, models,optimizers,losses,callbacks
+
+from keras import layers, models,optimizers,losses,callbacks
+# import keras.backend as K
+# from keras.models import Model
+from keras.engine.topology import Layer
+
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
+
+def recall(y_true, y_pred):
+    '''
+    Compute recall as a Keras metric.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        recall, as a backend tensor
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))  # actual positives
+    # A Python ``if c3 == 0`` does not test the value of a symbolic tensor, so
+    # the empty-denominator case is guarded with the backend epsilon instead.
+    return c1 / (c3 + K.epsilon())
+
+
+def f1_score(y_true, y_pred):
+    '''
+    Compute F1 as a Keras metric.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        F1 value, as a backend tensor
+    '''
+
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))  # predicted positives
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))  # actual positives
+    # Every denominator is padded with the backend epsilon: a Python ``if`` on a
+    # symbolic tensor cannot detect a zero count, and an unguarded division
+    # would turn the metric into NaN for batches without positives.
+    prec = c1 / (c2 + K.epsilon())
+    rec = c1 / (c3 + K.epsilon())
+    return 2 * (prec * rec) / (prec + rec + K.epsilon())
+
+
+def precision(y_true, y_pred):
+    '''
+    Compute precision as a Keras metric.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        precision, as a backend tensor
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))  # predicted positives
+    # Epsilon keeps the result finite when nothing is predicted positive.
+    return c1 / (c2 + K.epsilon())
+
+# Number of context characters kept before/after an entity; also the default
+# encoded sequence length used by word2id.
+seq_len = 30 # 20
+sp = 30  # NOTE(review): not referenced in the visible part of this file
+# Role label name -> class id: tenderee, agency, winning bidder,
+# second candidate, third candidate, other role.
+lb2id = {'招标人':0,
+         '代理人':1,
+         '中标人':2,
+         '第二候选人':3,
+         '第三候选人':4,
+         '其他角色':5}
+
+
+def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
+    '''
+    Build and compile the two-branch BiLSTM role classifier.
+
+    :param input_shape: (number of inputs, sequence length, embedding dimension)
+    :param vocab: vocabulary; only its length is used, as the embedding input size
+    :param embedding_weights: initial embedding matrix, or None for no preset weights
+    :param classes: number of softmax output classes
+    :param use_am: not used by this function
+    :return: the compiled Keras model
+    '''
+    # assert len(input_shape)==3
+    # One int32 input of length input_shape[1] per context segment.
+    list_input = []
+    for i in range(input_shape[0]):
+        list_input.append(layers.Input(shape=(input_shape[1],),dtype=tf.int32,name="input%d"%(i)))
+    list_embedding = []
+
+    # A single trainable embedding layer is shared by all inputs.
+    embedding_input = list_input
+    embedding = layers.Embedding(len(vocab),input_shape[2],
+                                 weights=[embedding_weights] if embedding_weights is not None else None,
+                                 mask_zero=True,trainable=True,name="char_embeding")
+    for i in range(len(embedding_input)):
+        list_embedding.append(embedding(embedding_input[i]))
+
+    list_w2v = list_embedding
+    list_lstm = []
+
+    # list_lstm.append(layers.Bidirectional(layers.GRU(60, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[0])) #dropout=0.5, recurrent_dropout=0.5
+    # list_lstm.append(layers.Bidirectional(layers.GRU(60, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[1]))
+
+    # Only inputs 0 and 1 are consumed, each by its own BiLSTM that returns the final state.
+    list_lstm.append(layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[0])) #dropout=0.5, recurrent_dropout=0.5
+    list_lstm.append(layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))(list_w2v[1]))
+
+    # Both branch outputs are joined and classified by one softmax layer.
+    concat = layers.concatenate(list_lstm, axis=1)
+
+    out = layers.Dense(classes,activation="softmax")(concat)
+    model = models.Model(list_input,out)
+    # Metrics are the precision/recall/f1_score functions defined in this module.
+    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
+    model.summary()
+
+    return model
+
+def labeling(label, out_len=6):
+    '''
+    One-hot encode a class id.
+
+    :param label: index of the class to mark
+    :param out_len: length of the returned vector
+    :return: float array of zeros with a 1 at position ``label``
+    '''
+    one_hot = np.zeros((out_len))
+    one_hot[label] = 1
+    return one_hot
+
+def word2id(df, seq_len=seq_len, is_test=False):
+    '''
+    Encode the context before/after every entity row into id sequences.
+
+    :param df: DataFrame with columns ``front20``, ``entity_text``, ``behind20`` and ``new_label``
+    :param seq_len: length passed to ``encodeInput`` as ``word_len``
+    :param is_test: if True the raw label is kept, otherwise it is one-hot encoded with ``labeling``
+    :return: (x, y) where x is the stacked encodings transposed with axes (1, 0, 2)
+             and y is the array of labels
+             (presumably x[0] / x[1] are then the front / behind inputs -- this assumes
+             ``encodeInput`` returns one row per input string; confirm against Utils)
+    '''
+    encoded = []
+    labels = []
+
+    # entity_text is still read so a missing column keeps raising KeyError,
+    # but the entity itself is not fed to the model.
+    for before, _text, after, label in zip(df["front20"], df["entity_text"], df["behind20"], df["new_label"]):
+        before = before if isinstance(before, str) else ""
+        after = after if isinstance(after, str) else ""
+
+        # Cut at sentence ends so that neighbouring sentences are not visible.
+        # NOTE(review): find() locates the FIRST '。' in the front context, so any
+        # later complete sentence in it is kept -- confirm whether rfind() was meant
+        # (changing it would alter the preprocessing the saved weights were trained on).
+        b = before.find('。')
+        if b!=-1:
+            before = before[b+1:]
+        e = after.find('。')
+        if e!=-1:
+            after = after[:e+1]
+
+        encoded.append(encodeInput([before, after], word_len=seq_len, word_flag=True, userFool=False))
+        labels.append(label if is_test else labeling(label))
+    return np.transpose(np.array(encoded), (1, 0, 2)), np.array(labels)
+
+def fix_digit_eng(text):
+    '''
+    Normalise digits, ordinals and alphanumeric codes in a context string.
+
+    The substitutions are applied in the order written; later rules see the
+    output of earlier ones.
+
+    :param text: context text
+    :return: the normalised text
+    '''
+    # Ordinals that belong to a clause/institution name, and "第三方", become 'xxx'.
+    text = re.sub('第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)
+    # "第01...候选人" is rewritten to the first winning candidate wording.
+    text = re.sub('第01(中标|成交)?候选人', '第一中标候选人', text)
+    # Section/package numbers 1-3 are replaced by the placeholder 'd'.
+    text = re.sub('标段[一二三1-3]', '标段d', text)
+    text = re.sub('第[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
+    # A letter followed by 3+ code characters becomes 'abc'.
+    text = re.sub('[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)
+    # Bracketed numbers and separator-joined digit groups become 'd'.
+    text = re.sub(r'[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
+    # Chinese numerals: any run of 2+, or any run made only of 四..十, becomes 'd'.
+    text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
+    # Multi-digit numbers, decimals and the single digits 0 and 4-9 become 'd'.
+    text = re.sub(r'\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
+    # Serial numbers, "N次" counters and "N、" list markers are dropped.
+    text = re.sub(r'序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
+    # NOTE(review): as written both replace() calls map a character to itself
+    # (a no-op); presumably full-width parentheses were intended -- confirm.
+    return text.replace('(', '(').replace(')', ')')
+
+def train():
+    '''
+    Train the context-based role classifier.
+
+    Reads the train/test spreadsheets, appends the supplementary sheet (four
+    copies to the training set, one to the test set), normalises the
+    ``seq_len`` characters of context on each side of every entity, and fits
+    the BiLSTM model for 100 epochs. Weights are written under ``log/``
+    whenever ``val_loss`` improves.
+    '''
+    # Earlier dataset combinations, kept for reference:
+    # df_train = pd.read_excel('traindata/df_train_20230908.xlsx')
+    # df_test = pd.read_excel('traindata/df_test_20230908.xlsx')
+
+    # df_train = pd.read_excel('traindata/df_train_20230912.xlsx')
+    # df_test = pd.read_excel('traindata/df_test_20230912.xlsx')
+
+    # df_train = pd.read_excel('traindata/df_train_20230912_predict.xlsx')
+    # df_test = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+
+    # df_train = pd.read_excel('traindata/df_train_20230912_2.xlsx')
+    # df_test = pd.read_excel('traindata/df_test_20230912_2.xlsx')
+    # df1 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_train.xlsx')
+    # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
+    # df3 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_train.xlsx')
+    # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
+    # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # df_train = df_train.append([df1,df3, df, df, df, df], ignore_index=True)
+    # df_test = df_test.append([df2,df4, df], ignore_index=True)
+
+    df_train = pd.read_excel('traindata/所有训练测试数据_train.xlsx')
+    df_test = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
+    # Raw string: same path value, without the invalid "\角" escape sequence.
+    df = pd.read_excel(r'E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # pd.concat is the equivalent of DataFrame.append, which pandas 2.0 removed.
+    df_train = pd.concat([df_train, df, df, df, df], ignore_index=True)
+    df_test = pd.concat([df_test, df], ignore_index=True)
+
+
+    # Shuffle both sets.
+    df_train = df_train.sample(frac=1)
+    df_test = df_test.sample(frac=1)
+
+    # Keep the seq_len characters adjacent to the entity on each side, normalised.
+    df_train['front20'] = df_train['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
+    df_train['behind20'] = df_train['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
+    df_test['front20'] = df_test['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
+    df_test['behind20'] = df_test['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
+
+    # df_train['front20'] = df_train['front'].apply(lambda x: str(x)[-seq_len:])
+    # df_train['behind20'] = df_train['behind'].apply(lambda x: str(x)[:seq_len])
+    # df_test['front20'] = df_test['front'].apply(lambda x: str(x)[-seq_len:])
+    # df_test['behind20'] = df_test['behind'].apply(lambda x: str(x)[:seq_len])
+
+    df_train.fillna("", inplace=True)
+    df_test.fillna("", inplace=True)
+
+
+
+
+    # A filled-in 'relabel' value overrides 'new_label'.
+    if 'relabel' in df_train.columns:
+        df_train['new_label'] = df_train.apply(lambda x: int(x['relabel']) if x['relabel'] !="" else int(x['new_label']), axis=1)
+    if 'relabel' in df_test.columns:
+        df_test['new_label'] = df_test.apply(lambda x: int(x['relabel']) if x['relabel'] !=""  else int(x['new_label']), axis=1)
+    # 'relabel' is optional, so it is only reported when present; looking it up
+    # unconditionally raised KeyError for sheets without that column.
+    print('df_train', set(df_train['new_label']), set(df_train['relabel']) if 'relabel' in df_train.columns else set())
+    print('df_test', set(df_test['new_label']), set(df_test['relabel']) if 'relabel' in df_test.columns else set())
+
+    # Drop rows whose label is not one of the six known classes.
+    df_train = df_train[df_train['new_label'].isin([0,1,2,3,4,5])]
+    df_test = df_test[df_test['new_label'].isin([0,1,2,3,4,5])]
+    print('训练数据:%d,测试数据:%d'%(len(df_train), len(df_test)))
+
+    # Every class must be represented in the training set.
+    print(set(df_train['new_label']), set(lb2id.values()))
+    assert set(df_train['new_label'])==set(lb2id.values())
+
+    train_x, train_y = word2id(df_train)
+    print('train_x.shape', train_x.shape)
+    print('train_y.shape', train_y.shape)
+    print('train_x: ', train_x[0])
+    test_x, test_y = word2id(df_test)
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5",by_name=True) # variant with one extra dense layer
+        # model.load_weights("log/ep021-loss0.078-val_loss0.104-f1_score0.969.h5",by_name=True) # variant with one extra LSTM joining the front/behind LSTM outputs
+
+        # Only the weights of the best epoch so far (lowest val_loss) are saved.
+        callback = callbacks.ModelCheckpoint(
+            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
+            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
+        model.fit(x=[train_x[0],train_x[1]], y=train_y, batch_size=512, epochs=100, callbacks=[callback],
+                  validation_data=[[test_x[0],test_x[1]], test_y])
+def test():
+    '''
+    Evaluate the saved role classifier on the test spreadsheet.
+
+    Loads the test sheet, applies the same context normalisation as training,
+    predicts with the selected checkpoint and prints per-class acc/recall/f1
+    for the new model and for the values in the ``label`` column (printed
+    under the heading '旧模型', i.e. "old model").
+    '''
+    # df_val = pd.read_excel('traindata/df_test_20230908.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230908_predict.xlsx')
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+    # df_val = pd.read_excel('traindata/df_train_20230912_2.xlsx')
+    # df_val = pd.read_excel('traindata/角色实体分类新旧数据汇总.xlsx')
+    # df_val = pd.read_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果.xlsx')
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000.xlsx')
+    # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx')
+
+    # df_val = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='不确定角色表达')
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_筛选前后文不同的数据.xlsx')
+    # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果.xlsx')
+    # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx')
+    # df_val = pd.read_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx')
+
+    # df_val = pd.read_excel('traindata/所有训练测试数据_add.xlsx')
+    df_val = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
+
+    # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # df_val = df_val.append([df], ignore_index=True)
+    # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
+    #    'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
+    #    'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
+
+    # df_val = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据.xlsx')
+
+    # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx')
+    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx')
+
+    # NOTE(review): this local shadows the module-level lb2id and is only
+    # referenced by the commented-out lines below.
+    lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}
+    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据.xlsx')
+    # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel']!='' else x['label'], axis=1)
+    # df_val['new_label'] = df_val['label'].apply(lambda x: lb2id[x])
+    # df_val['label'] = df_val['label'].apply(lambda x: lb2id[x])
+    # df_val['relabel'] = df_val['relabel'].apply(lambda x: lb2id.get(x, ''))
+
+    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx')
+    # df_val.fillna('', inplace=True)
+    # print('测试公告数量:', len(df_val), set(df_val['new_label']))
+    # df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] != '' else x['new_label'], axis=1)
+
+    # # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
+    # print(df_val.columns)
+    # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
+    # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
+    # # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # print(df2.columns)
+    # df_val = df_val.append([df2, df4], ignore_index=True)
+    # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
+    #    'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
+    #    'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
+
+    df_val.fillna('', inplace=True)
+
+    # df_val = df_val[df_val['relabel']!=6]
+
+    # for i in df_val.index:
+    #     b = df_val.loc[i, 'front']
+    #     e = df_val.loc[i, 'behind']
+    #     if not isinstance(b, str):
+    #         print('异常数据', i, type(b))
+    #     if not isinstance(e, str):
+    #         print('异常数据', i, type(e))
+
+    # Ground truth: a 'relabel' value in 0..5 overrides 'new_label'; without a
+    # 'new_label' column the 'label' column is used instead.
+    if 'new_label' in df_val.columns:
+        if 'relabel' in df_val.columns:
+            df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] in [0,1,2,3,4,5] else x['new_label'], axis=1)
+    else:
+        df_val['new_label'] = df_val['label']
+    # df_val['new_label'] = df_val['new_label'].apply(lambda x: x if x in [0, 1, 2, 3, 4, 5] else 5)
+    # df_val = df_val[df_val['new_label'].isin([0,1,2,3,4,5])]
+    print('测试公告数量:', len(df_val), set(df_val['new_label']))
+    df_val['new_label'] = df_val['new_label'].apply(lambda x: int(x))
+
+    # Same context window and normalisation as in train().
+    df_val['front20'] = df_val['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
+    df_val['behind20'] = df_val['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
+
+    # df_val.drop_duplicates(subset=['front20', 'behind20'], inplace=True)
+    # print('测试公告去重后数量:', len(df_val))
+
+    # df_val['front20'] = df_val['front'].apply(lambda x: str(x)[-seq_len:])
+    # df_val['behind20'] = df_val['behind'].apply(lambda x: str(x)[:seq_len])
+
+    # Last six characters of the front context, reversed.
+    df_val['front_reverse'] = df_val['front20'].apply(lambda x: x[-6:][::-1])
+
+
+    # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] !="" else x['label'], axis=1)
+    # df_val['label'] = df_val['label'].apply(lambda x:lb2id[x] if x in lb2id else x)
+
+    # A fresh 0..n-1 index lets the prediction frames assigned below align by position.
+    df_val.reset_index(drop=True, inplace=True)
+    val_x, val_y = word2id(df_val, seq_len=seq_len, is_test=True)
+    # val_x = np.transpose(np.array(train_x), (1, 0, 2))
+
+    # old_x, old_y = word2id(df_val, seq_len=50)
+    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
+    # NOTE(review): role_old is only used by the commented-out comparison below.
+    role_old = Model_role_classify_word()
+
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep006-loss0.174-val_loss0.234-f1_score0.917.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep014-loss0.091-val_loss0.110-f1_score0.968.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep008-loss0.162-val_loss0.173-f1_score0.947.h5",by_name=True)  # 20230425: entity removed, front/behind merged into one input; poor result, tenderee and agency are confused, especially for "受...委托" (entrusted by ...) phrasing
+        # model.load_weights("log/ep009-loss0.104-val_loss0.115-f1_score0.966.h5",by_name=True)  # 20230425: entity removed, front/behind fed as separate inputs
+        # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5",by_name=True)  # 20230425: entity removed, separate front/behind inputs, one extra dense layer
+        # model.load_weights("log/ep019-loss0.087-val_loss0.106-f1_score0.968.h5",by_name=True)  # 20230425: separate front/behind inputs with the entity replaced by "公司" in the middle; three inputs through LSTM, merged, then another LSTM
+        # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5",by_name=True)  # 20230425: separate front/behind inputs, entity removed; two inputs through LSTM, merged, then another LSTM
+        # model.load_weights("log/20ep045-loss0.140-val_loss0.181-f1_score0.941.h5",by_name=True)  # 20230908: separate front/behind inputs, entity removed; two inputs through LSTM, merged, then output
+        # model.load_weights("log/20912ep038-loss0.123-val_loss0.181-f1_score0.947.h5",by_name=True)  # 20230908: separate front/behind inputs, entity removed; two inputs through LSTM, merged, then output
+        # model.load_weights("log/ep068-loss0.075-val_loss0.190-f1_score0.941.h5",by_name=True)  # 20230908: separate front/behind GRU inputs, entity removed
+        # model.load_weights("log/gruep043-loss0.124-val_loss0.177-f1_score0.947.h5",by_name=True)  # 20230908: separate front/behind GRU inputs, entity removed
+        # model.load_weights("log/ep052-loss0.130-val_loss0.216-f1_score0.931.h5",by_name=True)  # 20230919: separate front/behind GRU inputs, entity removed; newly labelled data plus re-labelled old data
+        model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",by_name=True)  # 20231008: separate front/behind LSTM inputs, entity removed; final selected model
+
+
+        # lg_old = role_old.predict(old_x)
+        # df_val['pred_old'] = pd.DataFrame(np.argmax(lg_old, axis=1))
+        # df_val['prob_old'] = pd.DataFrame(np.amax(lg_old, axis=1))
+
+        # logit = model.predict([val_x[0], val_x[1], val_x[2]])
+        # print('新模型预测结果',logit[:3])
+        # print('旧模型预测结果:',lg_old[:3])
+        # df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
+        # df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
+        # # df_val['new=new3'] = df_val.apply(lambda x: 1 if x['pred_new3'] == x['pred_new2'] else 0, axis=1)
+        # df_val['new=old'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+        # df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_old'] else 0, axis=1)
+        # df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+
+        # logit = model.predict([val_x])
+        logit = model.predict([val_x[0],val_x[1]])
+        print('新模型预测结果', logit[:3])
+        # df_val['pred_new2'] = df_val['pred_new']
+
+        # Predicted class, its probability, and whether it matches the ground truth.
+        df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
+        df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
+        # df_val['new=new2'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_new2'] else 0, axis=1)
+        df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['new_label'] else 0, axis=1)
+
+
+        # Per-class scores of the new model: 'acc' is correct / predicted-as-class
+        # and 'recall' is correct / labelled-as-class.
+        for it in set(df_val['new_label']):
+            df_tmp = df_val[df_val['new_label']==it]
+            lb = len(df_tmp)
+            eq = sum(df_tmp['new=lb'])
+            pr = len(df_val[df_val['pred_new']==it])
+            acc = eq/pr if pr>0 else 0
+            recall = eq/lb if lb>0 else 0
+            f1 = 2*recall*acc/(acc+recall) if (acc+recall)>0 else 0
+            print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(it, acc, recall, f1))
+
+        # Same scores with the 'label' column in the role of the prediction
+        # (presumably the old model's output -- confirm against the data sheet).
+        print('旧模型:')
+        df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['new_label'] else 0, axis=1)
+        for it in set(df_val['label']):
+            df_tmp = df_val[df_val['new_label']==it]
+            lb = len(df_tmp)
+            eq = sum(df_tmp['old=lb'])
+            pr = len(df_val[df_val['label']==it])
+            acc = eq/pr if pr>0 else 0
+            recall = eq/lb if lb>0 else 0
+            f1 = 2*recall*acc/(acc+recall) if (acc+recall)>0 else 0
+            print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(it, acc, recall, f1))
+
+        # df_val.to_excel('traindata/df_val_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test29_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test3_predict.xlsx')
+        # df_val.to_excel('traindata/df_test_20230908_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx', index=False)
+        # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/df_test_20230912_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/df_test_20230912_加补充数据_predict.xlsx', index=False)
+        # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果.xlsx', index=False)
+        # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx', index=False)
+        # df_val.to_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx', index=False)
+
+        # df_val.to_excel('traindata/所有训练测试数据_add_predict.xlsx', index=False)
+
+        # df_val.to_excel('traindata/所有训练测试数据_test_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/df_train_20230912_predict.xlsx', index=False)
+
+        # df_val = df_val[df_val['new=lb']==0]
+        # for i in df_val.index:
+        #     if ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])):
+        #         print('过滤异常数据',i ,ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])))
+        #         df_val.drop(index=i, inplace=True)
+        # print('不一致数量: ', len(df_val))
+        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果_重新不一致结果.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_pred.xlsx', index=False)
+
+        # df_val.to_excel('traindata/角色实体分类新旧数据汇总_predict.xlsx', index=False)
+        # df_val.to_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926_predict.xlsx', index=False)
+        # df_val.to_excel('E:\角色金额数据/易错角色表达_predict.xlsx', index=False)
+        print('df_val.columns', df_val.columns)
+
+
+'''
+类别:0, acc:0.4199, recall:0.6492, f1: 0.5099
+类别:1, acc:0.5126, recall:0.7846, f1: 0.6201
+类别:2, acc:0.4416, recall:0.6632, f1: 0.5301
+类别:3, acc:0.7455, recall:0.7961, f1: 0.7700
+类别:4, acc:0.7471, recall:0.8553, f1: 0.7975
+类别:5, acc:0.9664, recall:0.9100, f1: 0.9373
+
+类别:0, acc:0.9537, recall:0.9777, f1: 0.9655
+类别:1, acc:0.9589, recall:0.9722, f1: 0.9655
+类别:2, acc:0.9227, recall:0.9502, f1: 0.9363
+类别:3, acc:0.8750, recall:0.9333, f1: 0.9032
+类别:4, acc:0.9643, recall:1.0000, f1: 0.9818
+类别:5, acc:0.9476, recall:0.8690, f1: 0.9066
+
+类别:0, acc:0.9393, recall:0.9319, f1: 0.9356
+类别:1, acc:0.9500, recall:0.9620, f1: 0.9560
+类别:2, acc:0.9156, recall:0.9406, f1: 0.9279
+类别:3, acc:0.8857, recall:0.9394, f1: 0.9118
+类别:4, acc:0.9655, recall:0.9333, f1: 0.9492
+类别:5, acc:0.9102, recall:0.8990, f1: 0.9046
+
+类别:0, acc:0.9357, recall:0.9615, f1: 0.9484
+类别:1, acc:0.9538, recall:0.9483, f1: 0.9510
+类别:2, acc:0.9271, recall:0.9366, f1: 0.9318
+类别:3, acc:0.9600, recall:0.9863, f1: 0.9730
+类别:4, acc:0.9429, recall:0.9851, f1: 0.9635
+类别:5, acc:0.9407, recall:0.9098, f1: 0.9250
+
+类别:0, acc:0.9402, recall:0.9556, f1: 0.9478
+类别:1, acc:0.9593, recall:0.9375, f1: 0.9483
+类别:2, acc:0.9243, recall:0.9412, f1: 0.9327
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
+类别:4, acc:0.9452, recall:0.9857, f1: 0.9650
+类别:5, acc:0.9296, recall:0.9058, f1: 0.9176
+
+类别:0, acc:0.9468, recall:0.9568, f1: 0.9518
+类别:1, acc:0.9489, recall:0.9489, f1: 0.9489
+类别:2, acc:0.9388, recall:0.9312, f1: 0.9350
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
+类别:4, acc:0.9324, recall:0.9857, f1: 0.9583
+类别:5, acc:0.9316, recall:0.9202, f1: 0.9258
+
+类别:0, acc:0.9455, recall:0.9478, f1: 0.9467
+类别:1, acc:0.9375, recall:0.9538, f1: 0.9456
+类别:2, acc:0.9275, recall:0.9295, f1: 0.9285
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
+类别:4, acc:0.9583, recall:0.9857, f1: 0.9718
+类别:5, acc:0.9262, recall:0.9159, f1: 0.9210
+
+类别:0, acc:0.9331, recall:0.9516, f1: 0.9423
+类别:1, acc:0.9524, recall:0.9467, f1: 0.9496
+类别:2, acc:0.9437, recall:0.9089, f1: 0.9260
+类别:3, acc:0.9565, recall:0.9565, f1: 0.9565
+类别:4, acc:0.9242, recall:0.9683, f1: 0.9457
+类别:5, acc:0.9270, recall:0.9261, f1: 0.9266
+
+新模型:
+类别:0, acc:0.9336, recall:0.9225, f1: 0.9280
+类别:1, acc:0.9389, recall:0.9762, f1: 0.9572
+类别:2, acc:0.8937, recall:0.9439, f1: 0.9181
+类别:3, acc:0.9130, recall:1.0000, f1: 0.9545
+类别:4, acc:0.9545, recall:0.8936, f1: 0.9231
+类别:5, acc:0.9445, recall:0.9292, f1: 0.9368
+旧模型:
+类别:0, acc:0.8323, recall:0.7694, f1: 0.7996
+类别:1, acc:0.9565, recall:0.8730, f1: 0.9129
+类别:2, acc:0.8800, recall:0.8491, f1: 0.8643
+类别:3, acc:0.8723, recall:0.9762, f1: 0.9213
+类别:4, acc:0.9778, recall:0.9362, f1: 0.9565
+类别:5, acc:0.8402, recall:0.8878, f1: 0.8633
+'''
+
+def get_savedModel():
+    sess = tf.Session(graph=tf.Graph())
+    with sess.as_default():
+        with sess.graph.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+            sess.run(tf.global_variables_initializer())
+            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
+            # model.load_weights(filepath="log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5")  #7月30日训练最优模型20字
+            # model.load_weights(filepath="../../dl_dev/role/log/ep015-loss0.090-val_loss0.113-f1_score0.967.h5") #8月5日调整部分招标人标注后重新训练结果20字
+            # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5", # 20230427
+            # model.load_weights("log/ep059-loss0.096-val_loss0.180-f1_score0.945.h5", # 20231008 重新整理标注数据后结果
+            # model.load_weights("log/ep059-loss0.101-val_loss0.191-f1_score0.940.h5", # 20231012 重新整理标注数据后结果
+            # model.load_weights("log/ep052-loss0.123-val_loss0.194-f1_score0.937.h5", # 20231012 重新整理标注数据后结果
+            model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5", # 20231026 重新整理标注数据后结果
+                               by_name=True)  # 20230425 前后分别输入 去掉实体,2输入lstm后合并再次经过lstm 2023/04/27
+            tf.saved_model.simple_save(session=sess,
+                                       export_dir="role_savedmodel2023-10-26", # role_savedmodel2021-8-5
+                                       inputs={"input0": model.input[0],
+                                               "input1": model.input[1],
+                                               }, #"input2": model.input[2]
+                                       outputs={"outputs": model.output})
+
+def predict_pb():
+    # df_val = pd.read_excel('traindata/df_val.xlsx')
+    df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+    old_x, old_y = word2id(df_val, seq_len=seq_len)
+    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
+
+    sess_role = tf.Session()
+    with sess_role.as_default() as sess:
+        with sess_role.graph.as_default():
+            meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
+                                                        export_dir="role_savedmodel2023-10-08") # role_savedmodel2021-8-5  role_savedmodel2023-04-27
+            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+            signature_def = meta_graph_def.signature_def
+
+            input0 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
+            input1 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
+            # input2 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+            output = sess_role.graph.get_tensor_by_name(
+                signature_def[signature_key].outputs["outputs"].name)
+            model_role = [[input0, input1], output] #, input2
+            lg_old = sess_role.run(output, feed_dict={input0:old_x[0],
+                                                      input1:old_x[1],
+                                                      }) # input2:old_x[2]
+            print(lg_old[:3])
+            pos = neg = 0
+            for i in range(len(lg_old)):
+                if np.argmax(lg_old[i]) !=  np.argmax(old_y[i]):
+                    print(np.argmax(lg_old[i]) , np.argmax(old_y[i]))
+                    neg += 1
+                else:
+                    pos += 1
+            print(pos, neg, pos/(pos+neg))
+
+
+if __name__ == "__main__":
+    # Script entry point. Only one stage is enabled at a time by
+    # (un)commenting the calls below; currently only test() runs.
+    # train()
+    test()
+    # get_savedModel()
+    # predict_pb()
+
+    # Scratch experiment with tf.slice on a small constant, left disabled:
+    # import tensorflow as tf
+    #
+    # # X = tf.constant([[[1, 1, 1], [2, 2, 2]],
+    # #                  [[3, 3, 3], [4, 4, 4]],
+    # #                  [[5, 5, 5], [6, 6, 6]]])
+    # X = tf.constant([[1, 1, 1], [2, 2, 2]]
+    #                  )
+    # print(X.shape)
+    # rs = tf.slice(X, [0, 0], [1, -1])
+    # with tf.Session() as sess:
+    #     print(sess.run(rs))
+
+
+
+
+
+
+
+
+
+

Энэ ялгаанд хэт олон файл өөрчлөгдсөн тул зарим файлыг харуулаагүй болно