Преглед изворни кода

更新角色模型;调整角色规则、表格提取、预处理表头规则等

lsm пре 1 година
родитељ
комит
9b97466121

Разлика између датотека није приказана због своје велике величине
+ 92 - 10
BiddingKG/dl/interface/Preprocessing.py


+ 4 - 3
BiddingKG/dl/interface/extract.py

@@ -340,7 +340,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-09-13'}
+    version_date = {'version_date': '2023-10-30'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''
@@ -399,11 +399,12 @@ def get_role_context(docid, list_sentences, list_entitys):
     for list_entity in list_entitys:
         for _entity in list_entity:
             if _entity.entity_type in ['org', 'company']:
+                idx = _entity.entity_id
                 sentence = sentences[_entity.sentence_index]
                 # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
                 #                    center_include=False, word_flag=True, text=_entity.entity_text)
-                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=20, center_include=False)
-                rs_list.append((docid, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
+                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=40, center_include=False)
+                rs_list.append((docid,idx, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
                 _entity.entity_text, _span[1]))
     return rs_list
 

+ 19 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -81,6 +81,23 @@ class Model_role_classify_word():
         # print(_encode_span)
         return _encode_span
 
+    def fix_digit_eng(self, text):
+        '''
+        Normalise digits, ordinals and alphanumeric codes in a context string.
+
+        The substitutions run in the order listed; later rules rely on the
+        placeholders ('xxx', 'd', 'abc') written by earlier ones, so the
+        order must not be changed.
+
+        :param text: context string (text before or after an entity)
+        :return: the string with the substitutions applied
+        '''
+        # Raw strings keep the regex escapes (\d, \[, \], \.) intact without
+        # relying on Python passing unknown escape sequences through.
+        substitutions = (
+            (r'第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx'),
+            (r'第01(中标|成交)?候选人', '第一中标候选人'),
+            (r'标段[一二三1-3]', '标段d'),
+            (r'第[一二三1-3](标段?|[分子标]?包)', 'd标段'),
+            (r'[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc'),
+            (r'[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd'),
+            (r'[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd'),
+            (r'\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd'),
+            (r'序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', ''),
+        )
+        for pattern, replacement in substitutions:
+            text = re.sub(pattern, replacement, text)
+        # NOTE(review): as written these two replaces are no-ops; presumably the
+        # first argument of each was a full-width parenthesis -- confirm against
+        # the repository before relying on this line.
+        return text.replace('(', '(').replace(')', ')')
+
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
         '''
         上下文数字化,使用字偏移
@@ -93,7 +110,8 @@ class Model_role_classify_word():
         '''
         _span = get_context(sentence_text, begin_index, end_index,size=size, center_include=False)  # size=12 center_include=True
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=20, word_flag=True, userFool=False)  # word_len=20
+        _span = [self.fix_digit_eng(text) for text in _span]
+        _encode_span = encodeInput(_span, word_len=30, word_flag=True, userFool=False)  # word_len=20
         # print(_encode_span)
         return _encode_span
     

+ 59 - 24
BiddingKG/dl/interface/predictor.py

@@ -790,6 +790,14 @@ class PREMPredict():
                 if re.search('拟邀请$', front):
                     label = 2
                     values[label] = 0.501
+                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?)(名称)?[是为:]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
+                    label = 1
+                    values[label] = 0.501
+                elif re.search('采用$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
+                    label = 5
+                elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
+                    label = 2
+                    values[label] = 0.501
             elif label == 2:
                 if re.search('中标单位和.{,25}签订合同', whole):
                     label = 0
@@ -806,6 +814,19 @@ class PREMPredict():
                 elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
+                elif re.search('税费', front) and re.search('^承担', behind):
+                    label = 5
+                elif re.search('第一候补|第一后备|备选', front):
+                    label = 2
+                    values[label] = 0.6
+                elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
+                    values[2] = 0.5
+                    label = 5
+                elif re.search('(承包权人|帐户名称):$', front):
+                    label = 5
+                elif re.search('合同供方:?$', front):
+                    label = 0
+                    values[label] = 0.5
             elif re.search('是否中标:是,供应商', front) and label == 5:
                 label = 2
                 values[label] = 0.9
@@ -821,6 +842,11 @@ class PREMPredict():
                     values[label] = 0.501
                 elif re.search('^:受', behind):  # 354009560 附件格式问题 ,中选中介服务机构通知书,编号:HZ2305120541,中汕项目管理有限公司:受惠东县人民政府大岭街道办事处委托
                     label = 5
+                elif re.search('发布机构', front) and re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站|馆)$', entity.entity_text):
+                    label = 0
+                    values[label] = 0.501
+                elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
+                    label = 5
             elif label in [3,4]:
                 if re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
                     label = 2
@@ -834,6 +860,9 @@ class PREMPredict():
                 elif re.search('\d+\.\d+[,、]?(中标|成交)候选人', front):
                     label = 5
                     values[label] = 0.501
+                elif re.search('第一名:$', front):
+                    label = 2
+                    values[label] = 0.7
             elif re.search('(中标|成交)通知书[,:]$', front) and re.search('^:', behind) and label != 2:
                 label = 2
                 values[label] = 0.8
@@ -1318,10 +1347,10 @@ class RoleRulePredictor():
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
                                         "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\
-                                        "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
+                                        "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
         self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
-                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
@@ -1331,7 +1360,7 @@ class RoleRulePredictor():
         self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_50>" \
                "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
-               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?" \
+               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
                "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)"  # 承办单位:不作为中标 83914772
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
@@ -1356,7 +1385,7 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|盖章)?[::是为]+$)"
+        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?\w{,5})?|如下|:?牵头人)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1531,7 +1560,9 @@ class RoleRulePredictor():
                                                            word_flag=True, use_text=True, text=re.sub(")", ")",
                                                                                                       re.sub("(", "(",
                                                                                                              p_entity.entity_text)))
-                                        if str(_span[1] + _span[2][:len(str(_name))]).find(
+                                        if _span[2].startswith(":"): # 实体后面为冒号的不作为招标人,避免项目名称出错中标变招标  368122675 陇西兴恒建建筑有限责任公司:线路安全保护区内环境治理专项整改(第二标段)项目
+                                            break
+                                        if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
                                                 _name) >= 0:
                                             if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
                                                 find_flag = True
@@ -1698,7 +1729,7 @@ class RoleRulePredictor():
                                     p_entity.label = 0
                                     # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
             if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
-                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
+                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
                     article.title+article.content[:100]):
                 for p_entity in candidates:
                     # print('只有一个候选人的作为中标人', p_entity.entity_text)
@@ -2144,16 +2175,18 @@ class TendereeRuleRecall():
 
 class RoleGrade():
     def __init__(self):
+        # Keyword patterns, one named group per role and grade.
+        # NOTE(review): the trailing digit of each group name (9 / 8 / 6) is
+        # presumably the grade that predict() turns into a probability -- confirm.
-        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
-        self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
-        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包|甲)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
+        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
+        self.tenderee_center_8 = "(?P<tenderee_center_8>受.{5,20}委托)"
+        self.tenderee_left_8 = "(?P<tenderee_left_8>(尊敬的供应商|(需求|最终|发包|征集|甲|转让|出租|处置)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
+        self.tenderee_left_6 = "(?P<tenderee_left_6>(发布|业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方|发布机构)"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]|排名:1)"
+        # Keep 排名:1 (matched by the previous version) alongside the new 排序:1,
+        # mirroring the 排[名序] form used by the second/third candidate patterns.
+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]|排[名序]:1|名次:1)"
         self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
-        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
-        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
-        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
-                             self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
+        self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
+        self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.agency_left_9, self.winTenderer_left_9,
+                             self.winTenderer_left_8,self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9]
     def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
         '''
         根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
@@ -2173,7 +2206,7 @@ class RoleGrade():
             if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
                 text = sentences[entity.sentence_index].sentence_text
                 in_att = sentences[entity.sentence_index].in_attachment
-                pre_prob = entity.values[entity.label]
+                pre_prob = entity.values[entity.label] # 模型预测角色概率
                 b = entity.wordOffset_begin
                 e = entity.wordOffset_end
                 not_found = 1
@@ -2196,9 +2229,11 @@ class RoleGrade():
                         _prob = int(_prob)*0.1
                         # print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values)
                         if in_att:
-                            _prob = _prob - 0.2
-                        if pre_prob < _prob:
+                            _prob = _prob - 0.1 # 0.2
+                        if pre_prob < _prob: # 如果模型预测概率小于关键词概率
                             _prob = 0.65
+                        if len(entity.entity_text) < 6: # 如果实体名称小于6个字,概率再降0.05
+                            _prob -= 0.05
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0
                         # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
@@ -2965,7 +3000,7 @@ class ProductAttributesPredictor():
         total_price_list = []  # 总价列表,拥有判断是否为几行产品合计总价
         # print('表格数:', len(tables))
 
-        for i in range(len(tables)-1, -1, -1):
+        for i in range(len(tables)):  # (len(tables)-1, -1, -1) 由从最后到前改为 前到后
             table = tables[i]
             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
                 table.string = table.get_text()
@@ -5342,7 +5377,7 @@ class TablePremExtractor(object):
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
             "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
-            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因",
+            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
@@ -5356,7 +5391,7 @@ class TablePremExtractor(object):
 
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -5463,7 +5498,7 @@ class TablePremExtractor(object):
             bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
             win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
 
-            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
+            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
                 break
             if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2:  # 内容为空或全部一样 停止匹配
@@ -5483,7 +5518,7 @@ class TablePremExtractor(object):
                 project_name = ''
             previous_package = package_code
 
-            if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
+            if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
                 break
             if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
                 continue
@@ -5694,7 +5729,7 @@ class CandidateExtractor(object):
             self.headerset = pickle.load(f)
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -5785,7 +5820,7 @@ class CandidateExtractor(object):
             second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
 
-            if set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
                 # print('包含表头, 停止匹配')
                 break
             if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配

BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


BIN
BiddingKG/dl/product/data/dev_data.pkl


BIN
BiddingKG/dl/product/data/dev_data2.pkl


BIN
BiddingKG/dl/product/data/train_data.pkl


BIN
BiddingKG/dl/product/data/train_data2.pkl


+ 619 - 0
BiddingKG/dl_dev/role/context_model.py

@@ -0,0 +1,619 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/7/28 0028 11:32 
+
+import os
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import sys
+sys.path.append(os.path.abspath("../../.."))
+import pandas as pd
+
+from BiddingKG.dl.interface.modelFactory import Model_role_classify_word
+from BiddingKG.dl.common.Utils import *
+import tensorflow as tf
+import tensorflow.keras.backend as K
+# from tensorflow.keras import layers, models,optimizers,losses,callbacks
+
+from keras import layers, models,optimizers,losses,callbacks
+# import keras.backend as K
+# from keras.models import Model
+from keras.engine.topology import Layer
+
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
+
+def recall(y_true, y_pred):
+    '''
+    Recall metric for Keras.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        true positives / actual positives, as a backend tensor
+    '''
+    # Products and labels are clipped to [0, 1] and rounded before summing.
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+    # A Python `== 0` test on a symbolic tensor does not inspect its runtime
+    # value, so it cannot protect the division; add epsilon to the denominator
+    # instead so an all-negative batch yields 0 rather than NaN.
+    return true_positives / (actual_positives + K.epsilon())
+
+
+def f1_score(y_true, y_pred):
+    '''
+    F1 metric for Keras.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        harmonic mean of precision and recall, as a backend tensor
+    '''
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+    # Every denominator can be zero (no rounded-positive predictions, no
+    # positive labels, or precision + recall == 0). A Python `== 0` check on a
+    # symbolic tensor does not test its value, so stabilise with epsilon.
+    eps = K.epsilon()
+    precision_value = true_positives / (predicted_positives + eps)
+    recall_value = true_positives / (actual_positives + eps)
+    return 2 * (precision_value * recall_value) / (precision_value + recall_value + eps)
+
+
+def precision(y_true, y_pred):
+    '''
+    Precision metric for Keras.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        true positives / predicted positives, as a backend tensor
+    '''
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    # When no prediction rounds to 1 the plain division is 0/0 (NaN);
+    # epsilon in the denominator makes the metric 0 in that case.
+    return true_positives / (predicted_positives + K.epsilon())
+
+seq_len = 30 # context length per side used by word2id() and train(); the original note `20` presumably records the previous value
+sp = 30 # NOTE(review): not referenced in the visible part of this file -- confirm it is still needed
+lb2id = {'招标人':0, # tenderee
+         '代理人':1, # agency
+         '中标人':2, # winning bidder
+         '第二候选人':3, # second candidate
+         '第三候选人':4, # third candidate
+         '其他角色':5} # other role
+
+
+def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
+    '''
+    Build and compile the two-branch BiLSTM role classifier.
+
+    :param input_shape: (number of inputs, sequence length, embedding size)
+    :param vocab: vocabulary; its length is the embedding input dimension
+    :param embedding_weights: initial embedding matrix, or None for no initial weights
+    :param classes: size of the softmax output
+    :param use_am: accepted for compatibility; not used in this function
+    :return: the compiled Keras model (its summary is printed)
+    '''
+    input_count, steps, embed_dim = input_shape[0], input_shape[1], input_shape[2]
+
+    # One int32 input per context side, named input0, input1, ...
+    list_input = [layers.Input(shape=(steps,),dtype=tf.int32,name="input%d"%(idx))
+                  for idx in range(input_count)]
+
+    # A single embedding layer is shared by all inputs.
+    initial_weights = None if embedding_weights is None else [embedding_weights]
+    shared_embedding = layers.Embedding(len(vocab),embed_dim,
+                                        weights=initial_weights,
+                                        mask_zero=True,trainable=True,name="char_embeding")
+    embedded = [shared_embedding(tensor) for tensor in list_input]
+
+    # Each of the first two inputs gets its own bidirectional LSTM encoder.
+    encoded = []
+    for position in (0, 1):
+        encoder = layers.Bidirectional(layers.LSTM(120, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
+        encoded.append(encoder(embedded[position]))
+
+    merged = layers.concatenate(encoded, axis=1)
+    out = layers.Dense(classes,activation="softmax")(merged)
+
+    model = models.Model(list_input,out)
+    model.compile(optimizer=optimizers.Adam(lr=0.001),loss=losses.categorical_crossentropy,metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+def labeling(label, out_len=6):
+    '''Return a zero vector of length out_len with position `label` set to 1.'''
+    one_hot = np.zeros(out_len)
+    one_hot[label] = 1
+    return one_hot
+
+def word2id(df, seq_len=seq_len, is_test=False):
+    '''
+    Encode labelled context rows into model inputs.
+
+    :param df: DataFrame with columns front20, entity_text, behind20, new_label
+    :param seq_len: word_len passed to encodeInput for each context side
+    :param is_test: True keeps the raw label; False one-hot encodes it with labeling()
+    :return: (x, y); x is the stacked encodings transposed with axes (1, 0, 2)
+             (presumably so x[0] / x[1] are the before / after inputs -- confirm
+             against encodeInput), y is an array of labels
+    '''
+    encoded_x = []
+    encoded_y = []
+
+    for before, _entity_text, after, label in zip(df["front20"], df["entity_text"], df["behind20"], df["new_label"]):
+        # Non-string cells (e.g. NaN) are treated as empty context.
+        before = before if isinstance(before, str) else ""
+        after = after if isinstance(after, str) else ""
+
+        # Sentence splitting hides earlier sentences from the model, so keep
+        # only the text after the LAST full stop (find() would keep every
+        # sentence after the first one).
+        last_stop = before.rfind('。')
+        if last_stop != -1:
+            before = before[last_stop+1:]
+        # Keep the following context up to and including its first full stop.
+        first_stop = after.find('。')
+        if first_stop != -1:
+            after = after[:first_stop+1]
+
+        encoded_x.append(encodeInput([before, after], word_len=seq_len, word_flag=True, userFool=False))
+        encoded_y.append(label if is_test else labeling(label))
+
+    return np.transpose(np.array(encoded_x), (1, 0, 2)), np.array(encoded_y)
+
+def fix_digit_eng(text):
+    '''
+    Normalise digits, ordinals and alphanumeric codes in a context string.
+
+    The substitutions run in the order listed; later rules rely on the
+    placeholders ('xxx', 'd', 'abc') written by earlier ones, so the order
+    must not be changed.
+
+    :param text: context string (text before or after an entity)
+    :return: the string with the substitutions applied
+    '''
+    # Raw strings keep the regex escapes (\d, \[, \], \.) intact without
+    # relying on Python passing unknown escape sequences through.
+    substitutions = (
+        (r'第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx'),
+        (r'第01(中标|成交)?候选人', '第一中标候选人'),
+        (r'标段[一二三1-3]', '标段d'),
+        (r'第[一二三1-3](标段?|[分子标]?包)', 'd标段'),
+        (r'[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc'),
+        (r'[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd'),
+        (r'[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd'),
+        (r'\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd'),
+        (r'序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', ''),
+    )
+    for pattern, replacement in substitutions:
+        text = re.sub(pattern, replacement, text)
+    # NOTE(review): as written these two replaces are no-ops; presumably the
+    # first argument of each was a full-width parenthesis -- confirm against
+    # the repository before relying on this line.
+    return text.replace('(', '(').replace(')', ')')
+
+def train():
+    '''
+    Train the role-context BiLSTM classifier and checkpoint the best weights.
+
+    Reads the train/test spreadsheets plus a sheet of supplementary examples,
+    builds the normalised before/after context columns, resolves manual
+    relabels, encodes the rows with word2id() and fits the model. Weights are
+    written under log/ whenever val_loss improves.
+
+    :return: None
+    '''
+    df_train = pd.read_excel('traindata/所有训练测试数据_train.xlsx')
+    df_test = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
+    # Raw string: same path value, without the invalid `\角` escape sequence.
+    df = pd.read_excel(r'E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # The supplementary sheet is added four times to train and once to test.
+    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
+    df_train = pd.concat([df_train, df, df, df, df], ignore_index=True)
+    df_test = pd.concat([df_test, df], ignore_index=True)
+
+    # Shuffle the rows.
+    df_train = df_train.sample(frac=1)
+    df_test = df_test.sample(frac=1)
+
+    # Fill missing cells BEFORE building the context columns; otherwise
+    # str(NaN) would put the literal text 'nan' into the model input.
+    df_train.fillna("", inplace=True)
+    df_test.fillna("", inplace=True)
+
+    for frame in (df_train, df_test):
+        frame['front20'] = frame['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
+        frame['behind20'] = frame['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
+
+    # A non-empty manual relabel overrides the stored label.
+    if 'relabel' in df_train.columns:
+        df_train['new_label'] = df_train.apply(lambda x: int(x['relabel']) if x['relabel'] !="" else int(x['new_label']), axis=1)
+    if 'relabel' in df_test.columns:
+        df_test['new_label'] = df_test.apply(lambda x: int(x['relabel']) if x['relabel'] !=""  else int(x['new_label']), axis=1)
+    # The relabel column is optional above, so do not index it unconditionally.
+    train_relabels = set(df_train['relabel']) if 'relabel' in df_train.columns else set()
+    test_relabels = set(df_test['relabel']) if 'relabel' in df_test.columns else set()
+    print('df_train', set(df_train['new_label']), train_relabels)
+    print('df_test', set(df_test['new_label']), test_relabels)
+
+    # Keep only rows whose label is one of the six known classes.
+    df_train = df_train[df_train['new_label'].isin([0,1,2,3,4,5])]
+    df_test = df_test[df_test['new_label'].isin([0,1,2,3,4,5])]
+    print('训练数据:%d,测试数据:%d'%(len(df_train), len(df_test)))
+
+    # Every class must be present in the training data.
+    print(set(df_train['new_label']), set(lb2id.values()))
+    assert set(df_train['new_label'])==set(lb2id.values())
+
+    train_x, train_y = word2id(df_train)
+    print('train_x.shape', train_x.shape)
+    print('train_y.shape', train_y.shape)
+    print('train_x: ', train_x[0])
+    test_x, test_y = word2id(df_test)
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        # No checkpoint is loaded here; the message is kept as-is.
+        print("loading weights")
+
+        callback = callbacks.ModelCheckpoint(
+            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
+            monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
+        model.fit(x=[train_x[0],train_x[1]], y=train_y, batch_size=512, epochs=100, callbacks=[callback],
+                  validation_data=[[test_x[0],test_x[1]], test_y])
+def test():
+    # df_val = pd.read_excel('traindata/df_test_20230908.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230908_predict.xlsx')
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+    # df_val = pd.read_excel('traindata/df_train_20230912_2.xlsx')
+    # df_val = pd.read_excel('traindata/角色实体分类新旧数据汇总.xlsx')
+    # df_val = pd.read_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果.xlsx')
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000.xlsx')
+    # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx')
+
+    # df_val = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='不确定角色表达')
+    # df_val = pd.read_excel('E:\实体识别数据/2023-08-24所有公告_筛选前后文不同的数据.xlsx')
+    # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果.xlsx')
+    # df_val = pd.read_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx')
+    # df_val = pd.read_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx')
+
+    # df_val = pd.read_excel('traindata/所有训练测试数据_add.xlsx')
+    df_val = pd.read_excel('traindata/所有训练测试数据_test.xlsx')
+
+    # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # df_val = df_val.append([df], ignore_index=True)
+    # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
+    #    'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
+    #    'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
+
+    # df_val = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据.xlsx')
+
+    # df_val = pd.read_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx')
+    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx')
+
+    lb2id = {'招标人': 0, '代理人': 1, '中标人': 2, '第二候选人': 3, '第三候选人': 4, '其他角色': 5}
+    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据.xlsx')
+    # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel']!='' else x['label'], axis=1)
+    # df_val['new_label'] = df_val['label'].apply(lambda x: lb2id[x])
+    # df_val['label'] = df_val['label'].apply(lambda x: lb2id[x])
+    # df_val['relabel'] = df_val['relabel'].apply(lambda x: lb2id.get(x, ''))
+
+    # df_val = pd.read_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx')
+    # df_val.fillna('', inplace=True)
+    # print('测试公告数量:', len(df_val), set(df_val['new_label']))
+    # df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] != '' else x['new_label'], axis=1)
+
+    # # df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+    # df_val = pd.read_excel('traindata/df_test_20230912_2.xlsx')
+    # print(df_val.columns)
+    # df2 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_补充训练数据_test.xlsx')
+    # df4 = pd.read_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_类别5的数据_补充数据_test.xlsx')
+    # # df = pd.read_excel('E:\角色金额数据/易错角色表达.xlsx', sheet_name='需补充数据')
+    # print(df2.columns)
+    # df_val = df_val.append([df2, df4], ignore_index=True)
+    # df_val = df_val[['entity_id', 'docid', 'label', 'front', 'entity_text', 'behind',
+    #    'new_label', 'relabel', 'kws', 'new_old', 'front20', 'behind20',
+    #    'front_reverse', 'pred_new', 'prob_new', 'new=lb']]
+
+    df_val.fillna('', inplace=True)
+
+    # df_val = df_val[df_val['relabel']!=6]
+
+    # for i in df_val.index:
+    #     b = df_val.loc[i, 'front']
+    #     e = df_val.loc[i, 'behind']
+    #     if not isinstance(b, str):
+    #         print('异常数据', i, type(b))
+    #     if not isinstance(e, str):
+    #         print('异常数据', i, type(e))
+
+    if 'new_label' in df_val.columns:
+        if 'relabel' in df_val.columns:
+            df_val['new_label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] in [0,1,2,3,4,5] else x['new_label'], axis=1)
+    else:
+        df_val['new_label'] = df_val['label']
+    # df_val['new_label'] = df_val['new_label'].apply(lambda x: x if x in [0, 1, 2, 3, 4, 5] else 5)
+    # df_val = df_val[df_val['new_label'].isin([0,1,2,3,4,5])]
+    print('测试公告数量:', len(df_val), set(df_val['new_label']))
+    df_val['new_label'] = df_val['new_label'].apply(lambda x: int(x))
+
+    df_val['front20'] = df_val['front'].apply(lambda x: fix_digit_eng(str(x)[-seq_len:]))
+    df_val['behind20'] = df_val['behind'].apply(lambda x: fix_digit_eng(str(x)[:seq_len]))
+
+    # df_val.drop_duplicates(subset=['front20', 'behind20'], inplace=True)
+    # print('测试公告去重后数量:', len(df_val))
+
+    # df_val['front20'] = df_val['front'].apply(lambda x: str(x)[-seq_len:])
+    # df_val['behind20'] = df_val['behind'].apply(lambda x: str(x)[:seq_len])
+
+    df_val['front_reverse'] = df_val['front20'].apply(lambda x: x[-6:][::-1])
+
+
+    # df_val['label'] = df_val.apply(lambda x: x['relabel'] if x['relabel'] !="" else x['label'], axis=1)
+    # df_val['label'] = df_val['label'].apply(lambda x:lb2id[x] if x in lb2id else x)
+
+    df_val.reset_index(drop=True, inplace=True)
+    val_x, val_y = word2id(df_val, seq_len=seq_len, is_test=True)
+    # val_x = np.transpose(np.array(train_x), (1, 0, 2))
+
+    # old_x, old_y = word2id(df_val, seq_len=50)
+    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
+    role_old = Model_role_classify_word()
+
+    with tf.Session() as sess:
+        vocab, matrix = getVocabAndMatrix(getModel_word())
+        model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+        print("loading weights")
+        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep006-loss0.174-val_loss0.234-f1_score0.917.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep014-loss0.091-val_loss0.110-f1_score0.968.h5",by_name=True, skip_mismatch=True)
+        # model.load_weights("log/ep008-loss0.162-val_loss0.173-f1_score0.947.h5",by_name=True)  # 20230425 取消实体,合并前后输入  效果不佳,招标代理分不清,特别是 受。。。委托这种
+        # model.load_weights("log/ep009-loss0.104-val_loss0.115-f1_score0.966.h5",by_name=True)  # 20230425 取消实体,前后分别输入
+        # model.load_weights("log/ep008-loss0.103-val_loss0.109-f1_score0.970.h5",by_name=True)  # 20230425 取消实体,前后分别输入 多加一个danse
+        # model.load_weights("log/ep019-loss0.087-val_loss0.106-f1_score0.968.h5",by_name=True)  # 20230425 前后分别输入 中间用公司代替,三输入lstm后合并再次经过lstm
+        # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5",by_name=True)  # 20230425 前后分别输入 去掉实体,2输入lstm后合并再次经过lstm
+        # model.load_weights("log/20ep045-loss0.140-val_loss0.181-f1_score0.941.h5",by_name=True)  # 20230908 前后分别输入 去掉实体,2输入lstm后合并输出
+        # model.load_weights("log/20912ep038-loss0.123-val_loss0.181-f1_score0.947.h5",by_name=True)  # 20230908 前后分别输入 去掉实体,2输入lstm后合并输出
+        # model.load_weights("log/ep068-loss0.075-val_loss0.190-f1_score0.941.h5",by_name=True)  # 20230908 前后分别输入gru 去掉实体
+        # model.load_weights("log/gruep043-loss0.124-val_loss0.177-f1_score0.947.h5",by_name=True)  # 20230908 前后分别输入gru 去掉实体
+        # model.load_weights("log/ep052-loss0.130-val_loss0.216-f1_score0.931.h5",by_name=True)  # 20230919 前后分别输入gru 去掉实体 新标注数据+旧数据重新标注
+        model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5",by_name=True)  # 20231008 前后分别输入lstm 去掉实体 最终选择结果
+
+
+        # lg_old = role_old.predict(old_x)
+        # df_val['pred_old'] = pd.DataFrame(np.argmax(lg_old, axis=1))
+        # df_val['prob_old'] = pd.DataFrame(np.amax(lg_old, axis=1))
+
+        # logit = model.predict([val_x[0], val_x[1], val_x[2]])
+        # print('新模型预测结果',logit[:3])
+        # print('旧模型预测结果:',lg_old[:3])
+        # df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
+        # df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
+        # # df_val['new=new3'] = df_val.apply(lambda x: 1 if x['pred_new3'] == x['pred_new2'] else 0, axis=1)
+        # df_val['new=old'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+        # df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['pred_old'] else 0, axis=1)
+        # df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+
+        # logit = model.predict([val_x])
+        logit = model.predict([val_x[0],val_x[1]])
+        print('新模型预测结果', logit[:3])
+        # df_val['pred_new2'] = df_val['pred_new']
+
+        df_val['pred_new'] = pd.DataFrame(np.argmax(logit, axis=-1))
+        df_val['prob_new'] = pd.DataFrame(np.amax(logit, axis=1))
+        # df_val['new=new2'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['pred_new2'] else 0, axis=1)
+        df_val['new=lb'] = df_val.apply(lambda x: 1 if x['pred_new'] == x['new_label'] else 0, axis=1)
+
+
+        for it in set(df_val['new_label']):
+            df_tmp = df_val[df_val['new_label']==it]
+            lb = len(df_tmp)
+            eq = sum(df_tmp['new=lb'])
+            pr = len(df_val[df_val['pred_new']==it])
+            acc = eq/pr if pr>0 else 0
+            recall = eq/lb if lb>0 else 0
+            f1 = 2*recall*acc/(acc+recall) if (acc+recall)>0 else 0
+            print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(it, acc, recall, f1))
+
+        print('旧模型:')
+        df_val['old=lb'] = df_val.apply(lambda x: 1 if x['label'] == x['new_label'] else 0, axis=1)
+        for it in set(df_val['label']):
+            df_tmp = df_val[df_val['new_label']==it]
+            lb = len(df_tmp)
+            eq = sum(df_tmp['old=lb'])
+            pr = len(df_val[df_val['label']==it])
+            acc = eq/pr if pr>0 else 0
+            recall = eq/lb if lb>0 else 0
+            f1 = 2*recall*acc/(acc+recall) if (acc+recall)>0 else 0
+            print('类别:%d, acc:%.4f, recall:%.4f, f1: %.4f'%(it, acc, recall, f1))
+
+        # df_val.to_excel('traindata/df_val_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test29_predict.xlsx')
+        # df_val.to_excel('traindata/兼职标注数据_test3_predict.xlsx')
+        # df_val.to_excel('traindata/df_test_20230908_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果all_所有筛选训练测试数据_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict_重新标注数据20230919.xlsx', index=False)
+        # df_val.to_excel('traindata/旧训练测试数据_筛选数据_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/df_test_20230912_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/df_test_20230912_加补充数据_predict.xlsx', index=False)
+        # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果.xlsx', index=False)
+        # df_val.to_excel('E:\角色金额数据/新旧角色模型不一致数据_原模型识别结果及新模型预测结果_re.xlsx', index=False)
+        # df_val.to_excel('E:\实体识别数据/少于10条关键词补充数据.xlsx', index=False)
+
+        # df_val.to_excel('traindata/所有训练测试数据_add_predict.xlsx', index=False)
+
+        # df_val.to_excel('traindata/所有训练测试数据_test_predict.xlsx', index=False)
+        # df_val.to_excel('traindata/df_train_20230912_predict.xlsx', index=False)
+
+        # df_val = df_val[df_val['new=lb']==0]
+        # for i in df_val.index:
+        #     if ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])):
+        #         print('过滤异常数据',i ,ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'front'])) or ILLEGAL_CHARACTERS_RE.search(str(df_val.loc[i, 'behind'])))
+        #         df_val.drop(index=i, inplace=True)
+        # print('不一致数量: ', len(df_val))
+        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果_重新不一致结果.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_重新预测结果40000-60000_重新不一致结果.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据.xlsx', index=False)
+        # df_val.to_excel('traindata/2023-08-24所有公告_筛选前后文不同的数据_新旧预测不一致_新为中标数据_pred.xlsx', index=False)
+
+        # df_val.to_excel('traindata/角色实体分类新旧数据汇总_predict.xlsx', index=False)
+        # df_val.to_excel('E:/角色金额数据/数据库验证数据原模型识别结果20230926_predict.xlsx', index=False)
+        # df_val.to_excel('E:\角色金额数据/易错角色表达_predict.xlsx', index=False)
+        print('df_val.columns', df_val.columns)
+
+
+'''
+类别:0, acc:0.4199, recall:0.6492, f1: 0.5099
+类别:1, acc:0.5126, recall:0.7846, f1: 0.6201
+类别:2, acc:0.4416, recall:0.6632, f1: 0.5301
+类别:3, acc:0.7455, recall:0.7961, f1: 0.7700
+类别:4, acc:0.7471, recall:0.8553, f1: 0.7975
+类别:5, acc:0.9664, recall:0.9100, f1: 0.9373
+
+类别:0, acc:0.9537, recall:0.9777, f1: 0.9655
+类别:1, acc:0.9589, recall:0.9722, f1: 0.9655
+类别:2, acc:0.9227, recall:0.9502, f1: 0.9363
+类别:3, acc:0.8750, recall:0.9333, f1: 0.9032
+类别:4, acc:0.9643, recall:1.0000, f1: 0.9818
+类别:5, acc:0.9476, recall:0.8690, f1: 0.9066
+
+类别:0, acc:0.9393, recall:0.9319, f1: 0.9356
+类别:1, acc:0.9500, recall:0.9620, f1: 0.9560
+类别:2, acc:0.9156, recall:0.9406, f1: 0.9279
+类别:3, acc:0.8857, recall:0.9394, f1: 0.9118
+类别:4, acc:0.9655, recall:0.9333, f1: 0.9492
+类别:5, acc:0.9102, recall:0.8990, f1: 0.9046
+
+类别:0, acc:0.9357, recall:0.9615, f1: 0.9484
+类别:1, acc:0.9538, recall:0.9483, f1: 0.9510
+类别:2, acc:0.9271, recall:0.9366, f1: 0.9318
+类别:3, acc:0.9600, recall:0.9863, f1: 0.9730
+类别:4, acc:0.9429, recall:0.9851, f1: 0.9635
+类别:5, acc:0.9407, recall:0.9098, f1: 0.9250
+
+类别:0, acc:0.9402, recall:0.9556, f1: 0.9478
+类别:1, acc:0.9593, recall:0.9375, f1: 0.9483
+类别:2, acc:0.9243, recall:0.9412, f1: 0.9327
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
+类别:4, acc:0.9452, recall:0.9857, f1: 0.9650
+类别:5, acc:0.9296, recall:0.9058, f1: 0.9176
+
+类别:0, acc:0.9468, recall:0.9568, f1: 0.9518
+类别:1, acc:0.9489, recall:0.9489, f1: 0.9489
+类别:2, acc:0.9388, recall:0.9312, f1: 0.9350
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
+类别:4, acc:0.9324, recall:0.9857, f1: 0.9583
+类别:5, acc:0.9316, recall:0.9202, f1: 0.9258
+
+类别:0, acc:0.9455, recall:0.9478, f1: 0.9467
+类别:1, acc:0.9375, recall:0.9538, f1: 0.9456
+类别:2, acc:0.9275, recall:0.9295, f1: 0.9285
+类别:3, acc:0.9500, recall:0.9870, f1: 0.9682
+类别:4, acc:0.9583, recall:0.9857, f1: 0.9718
+类别:5, acc:0.9262, recall:0.9159, f1: 0.9210
+
+类别:0, acc:0.9331, recall:0.9516, f1: 0.9423
+类别:1, acc:0.9524, recall:0.9467, f1: 0.9496
+类别:2, acc:0.9437, recall:0.9089, f1: 0.9260
+类别:3, acc:0.9565, recall:0.9565, f1: 0.9565
+类别:4, acc:0.9242, recall:0.9683, f1: 0.9457
+类别:5, acc:0.9270, recall:0.9261, f1: 0.9266
+
+新模型:
+类别:0, acc:0.9336, recall:0.9225, f1: 0.9280
+类别:1, acc:0.9389, recall:0.9762, f1: 0.9572
+类别:2, acc:0.8937, recall:0.9439, f1: 0.9181
+类别:3, acc:0.9130, recall:1.0000, f1: 0.9545
+类别:4, acc:0.9545, recall:0.8936, f1: 0.9231
+类别:5, acc:0.9445, recall:0.9292, f1: 0.9368
+旧模型:
+类别:0, acc:0.8323, recall:0.7694, f1: 0.7996
+类别:1, acc:0.9565, recall:0.8730, f1: 0.9129
+类别:2, acc:0.8800, recall:0.8491, f1: 0.8643
+类别:3, acc:0.8723, recall:0.9762, f1: 0.9213
+类别:4, acc:0.9778, recall:0.9362, f1: 0.9565
+类别:5, acc:0.8402, recall:0.8878, f1: 0.8633
+'''
+
+def get_savedModel():
+    sess = tf.Session(graph=tf.Graph())
+    with sess.as_default():
+        with sess.graph.as_default():
+            vocab, matrix = getVocabAndMatrix(getModel_word())
+            model = getBiLSTMModel(input_shape=(2, seq_len, 60), vocab=vocab, embedding_weights=matrix, classes=6)
+            sess.run(tf.global_variables_initializer())
+            # model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
+            # model.load_weights(filepath="log/ep010-loss0.107-val_loss0.114-f1_score0.966.h5")  #7月30日训练最优模型20字
+            # model.load_weights(filepath="../../dl_dev/role/log/ep015-loss0.090-val_loss0.113-f1_score0.967.h5") #8月5日调整部分招标人标注后重新训练结果20字
+            # model.load_weights("log/ep004-loss0.069-val_loss0.103-f1_score0.971.h5", # 20230427
+            # model.load_weights("log/ep059-loss0.096-val_loss0.180-f1_score0.945.h5", # 20231008 重新整理标注数据后结果
+            # model.load_weights("log/ep059-loss0.101-val_loss0.191-f1_score0.940.h5", # 20231012 重新整理标注数据后结果
+            # model.load_weights("log/ep052-loss0.123-val_loss0.194-f1_score0.937.h5", # 20231012 重新整理标注数据后结果
+            model.load_weights("log/ep049-loss0.108-val_loss0.185-f1_score0.938.h5", # 20231026 重新整理标注数据后结果
+                               by_name=True)  # 20230425 前后分别输入 去掉实体,2输入lstm后合并再次经过lstm 2023/04/27
+            tf.saved_model.simple_save(session=sess,
+                                       export_dir="role_savedmodel2023-10-26", # role_savedmodel2021-8-5
+                                       inputs={"input0": model.input[0],
+                                               "input1": model.input[1],
+                                               }, #"input2": model.input[2]
+                                       outputs={"outputs": model.output})
+
+def predict_pb():
+    # df_val = pd.read_excel('traindata/df_val.xlsx')
+    df_val = pd.read_excel('traindata/df_test_20230912_predict.xlsx')
+    old_x, old_y = word2id(df_val, seq_len=seq_len)
+    # old_x = np.transpose(np.array(old_x), (1, 0, 2))
+
+    sess_role = tf.Session()
+    with sess_role.as_default() as sess:
+        with sess_role.graph.as_default():
+            meta_graph_def = tf.saved_model.loader.load(sess=sess_role, tags=["serve"],
+                                                        export_dir="role_savedmodel2023-10-08") # role_savedmodel2021-8-5  role_savedmodel2023-04-27
+            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+            signature_def = meta_graph_def.signature_def
+
+            input0 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
+            input1 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
+            # input2 = sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+            output = sess_role.graph.get_tensor_by_name(
+                signature_def[signature_key].outputs["outputs"].name)
+            model_role = [[input0, input1], output] #, input2
+            lg_old = sess_role.run(output, feed_dict={input0:old_x[0],
+                                                      input1:old_x[1],
+                                                      }) # input2:old_x[2]
+            print(lg_old[:3])
+            pos = neg = 0
+            for i in range(len(lg_old)):
+                if np.argmax(lg_old[i]) !=  np.argmax(old_y[i]):
+                    print(np.argmax(lg_old[i]) , np.argmax(old_y[i]))
+                    neg += 1
+                else:
+                    pos += 1
+            print(pos, neg, pos/(pos+neg))
+
+
+if __name__ == "__main__":
+    # Script entry point. One stage is enabled at a time by (un)commenting
+    # the calls below; currently only test() runs.
+    # train()
+    test()
+    # get_savedModel()
+    # predict_pb()
+
+    # Disabled scratch snippet: tf.slice on a small constant tensor.
+    # import tensorflow as tf
+    #
+    # # X = tf.constant([[[1, 1, 1], [2, 2, 2]],
+    # #                  [[3, 3, 3], [4, 4, 4]],
+    # #                  [[5, 5, 5], [6, 6, 6]]])
+    # X = tf.constant([[1, 1, 1], [2, 2, 2]]
+    #                  )
+    # print(X.shape)
+    # rs = tf.slice(X, [0, 0], [1, -1])
+    # with tf.Session() as sess:
+    #     print(sess.run(rs))
+
+
+
+
+
+
+
+
+
+

Неке датотеке нису приказане због велике количине промена