瀏覽代碼

调整角色召回正则、概率分级、去除距离选择角色;补充废标公告产品数据

lsm 3 年之前
父節點
當前提交
cb44fdde45
共有 2 個文件被更改,包括 167 次插入和 200 次删除
  1. 1 1
      BiddingKG/dl/entityLink/entityLink.py
  2. 166 199
      BiddingKG/dl/interface/predictor.py

+ 1 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -124,7 +124,7 @@ def link_entitys(list_entitys,on_value=0.81):
 
 
 def getEnterprisePath():
-    filename = "../LEGAL_ENTERPRISE.txt"
+    filename = "LEGAL_ENTERPRISE.txt"
     real_path = getFileFromSysPath(filename)
     if real_path is None:
         real_path = filename

+ 166 - 199
BiddingKG/dl/interface/predictor.py

@@ -1094,21 +1094,23 @@ class FormPredictor():
 class RoleRulePredictor():
     
     def __init__(self):
-        # self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)"
-        # self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价|评选|挂牌|出租|出让|谈判|邀标|邀请|洽谈|约谈|买受|选取|抽取|抽选|出售|标卖|比价)(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人)[))]?(名称|信息)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
-        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价|评选|挂牌|出租|出让|谈判|邀标|邀请|洽谈|约谈|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
+        # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
+        self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                 "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                 "[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
+        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
+                                     "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
+                                     "(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^拟对|^现就|^现委托)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
 
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
-        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|受.{,15}委托|^受托)"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
+        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托)"  # |^受托  会与 受托生产等冲突,暂时未发现用“受托”表达代理的方式
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
-        # self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
-        self.pattern_winTenderer_left = "(?P<winTenderer_left>(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为,]+$|" \
+        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为,]+$|" \
                                         "(选定单位|指定的中介服务机构|实施主体|承制单位)[::是为,]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::,]*$|" \
                                         "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))(是|为|:|:)$|(供应|供货|供|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为,]$)"
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为,]+$)"
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
@@ -1122,26 +1124,23 @@ class RoleRulePredictor():
         
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
-        
-        self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left],
-                                      ["C",self.pattern_tenderee_center],
-                                      ["R",self.pattern_tenderee_right]],
-                                 "1":[["L",self.pattern_agency_left],
-                                      ["R",self.pattern_agency_right]],
-                                 "2":[["L",self.pattern_winTenderer_left],
-                                      # ["C",self.pattern_winTenderer_center],
-                                      ["R",self.pattern_winTenderer_right],
-                                      ["W",self.pattern_winTenderer_whole]],
-                                 "3":[["L",self.pattern_secondTenderer_left],
-                                      ["R",self.pattern_secondTenderer_right]],
-                                 "4":[["L",self.pattern_thirdTenderer_left],
-                                      ["R",self.pattern_thirdTenderer_right]]}
-        self.pattern_whole = []
-        for _k,_v in self.dict_list_pattern.items():
-            for _d,_p in _v:
-                self.pattern_whole.append(_p)
-        # self.pattern_whole = "|".join(list_pattern)
-        
+
+        self.pattern_whole = [self.pattern_tenderee_left,
+                              self.pattern_tenderee_left_w1,
+                              self.pattern_tenderee_center,
+                              self.pattern_tenderee_right,
+                              self.pattern_agency_left,
+                              self.pattern_agency_right,
+                              self.pattern_winTenderer_left,
+                              self.pattern_winTenderer_left_w1,
+                              self.pattern_winTenderer_whole,
+                              self.pattern_winTenderer_right,
+                              self.pattern_secondTenderer_left,
+                              self.pattern_secondTenderer_right,
+                              self.pattern_thirdTenderer_left,
+                              self.pattern_thirdTenderer_right
+                              ]  # 需按顺序排列, 第二、三中标要在中标正则后面
+
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
         self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
@@ -1162,59 +1161,66 @@ class RoleRulePredictor():
             raise Exception("null text in input ")
         
         return text
-        
-    def predict(self,list_articles,list_sentences,list_entitys,list_codenames,on_value = 0.5):
 
-        for article,list_entity,list_sentence,list_codename in zip(list_articles,list_entitys,list_sentences,list_codenames):
+
+    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
+
+        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
+                                                                      list_codenames):
             list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 按句子顺序排序
             # list_name = list_codename["name"]
             list_name = []  # 2022/1/5  改为实体列表内所有项目名称
             for entity in list_entity:
                 if entity.entity_type == 'name':
                     list_name.append(entity.entity_text)
-            list_name = self._check_input(list_name)+[article.title]
+            list_name = self._check_input(list_name) + [article.title]
             for p_entity in list_entity:
 
-
-                if p_entity.entity_type in ["org","company"]:
-                    #将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
-                    if str(p_entity.label)=="0":
+                if p_entity.entity_type in ["org", "company"]:
+                    # 只解析角色为无的或者概率低于阈值的
+                    if p_entity.label is None:
+                        continue
+                    # 将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
+                    if str(p_entity.label) == "0":
                         find_flag = False
                         for _sentence in list_sentence:
-                            if _sentence.sentence_index==p_entity.sentence_index:
-                                _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,use_text=True,text=re.sub(")",")",re.sub("(","(",p_entity.entity_text)))
+                            if _sentence.sentence_index == p_entity.sentence_index:
+                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                                   end_index=p_entity.end_index, size=20, center_include=True,
+                                                   word_flag=True, use_text=True,
+                                                   text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                 for _name in list_name:
-                                    if _name!="" and str(_span[1]+_span[2][:len(str(_name))]).find(_name)>=0:
+                                    if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
                                         find_flag = True
-                                        if p_entity.values[0]>on_value:
-                                            p_entity.values[0] = 0.6+(p_entity.values[0]-0.6)/10
+                                        if p_entity.values[0] > on_value:
+                                            p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
                         if find_flag:
                             continue
-                    
-                    
-                    #只解析角色为无的或者概率低于阈值的
-                    if p_entity.label is None:
-                        continue
+
+                    # 正则从概率低于阈值或其他类别中召回角色
                     role_prob = float(p_entity.values[int(p_entity.label)])
-                    if role_prob<on_value or str(p_entity.label)=="5":
-                        #将标题中的实体置为招标人
-                        _list_name = self._check_input(list_name,ignore=True)
+                    if role_prob < on_value or str(p_entity.label) == "5":
+                        # 将标题中的实体置为招标人
+                        _list_name = self._check_input(list_name, ignore=True)
                         find_flag = False
-                        for _name in _list_name: #2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
-                            if str(_name).find(re.sub(")",")",re.sub("(","(",p_entity.entity_text))) >= 0 and p_entity.sentence_index<4:
+                        for _name in _list_name:  # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
+                            if str(_name).find(re.sub(")", ")", re.sub("(", "(",
+                                                                       p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
                                 for _sentence in list_sentence:
                                     if _sentence.sentence_index == p_entity.sentence_index:
                                         _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                            end_index=p_entity.end_index, size=20, center_include=True,
-                                                           word_flag=True, use_text=True,text=re.sub(")",")",re.sub("(","(",p_entity.entity_text)))
+                                                           word_flag=True, use_text=True, text=re.sub(")", ")",
+                                                                                                      re.sub("(", "(",
+                                                                                                             p_entity.entity_text)))
                                         if str(_span[1] + _span[2][:len(str(_name))]).find(
-                                            _name) >= 0:
+                                                _name) >= 0:
                                             find_flag = True
                                             _label = 0
                                             p_entity.label = _label
                                             p_entity.values[int(_label)] = on_value
                                             break
-                                    if p_entity.sentence_index>=4:
+                                    if p_entity.sentence_index >= 4:
                                         break
                             if find_flag:
                                 break
@@ -1224,194 +1230,142 @@ class RoleRulePredictor():
                             #     p_entity.label = _label
                             #     p_entity.values[int(_label)] = on_value
                             #     break
-                        #若是实体在标题中,默认为招标人,不进行以下的规则匹配
+                        # 若是实体在标题中,默认为招标人,不进行以下的规则匹配
                         if find_flag:
                             continue
-                        
+
                         for s_index in range(len(list_sentence)):
-                            if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
+                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
+                                    list_sentence[s_index].sentence_index:
                                 tokens = list_sentence[s_index].tokens
                                 begin_index = p_entity.begin_index
                                 end_index = p_entity.end_index
                                 size = 15
-                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
-                                #距离
-                                list_distance = [100,100,100,100,100]
-                                _flag = False
-
+                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
+                                                   word_flag=True, use_text=False)
+                                # _flag = False
 
-                                #使用正则+距离解决冲突
+                                # 使用正则解决冲突(已去除按距离选择角色)
                                 # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
-                                list_spans = [spans[0][-30:],spans[0][-10:]+spans[1]+spans[2][:10],spans[2]]
+                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]] # 实体左、中、右 信息
                                 for _i_span in range(len(list_spans)):
+                                    _flag = False
+                                    _prob_weight = 1
+
                                     # print(list_spans[_i_span],p_entity.entity_text)
                                     for _pattern in self.pattern_whole:
-                                        for _iter in re.finditer(_pattern,list_spans[_i_span]):
-                                            for _group,_v_group in _iter.groupdict().items():
-                                                if _v_group is not None and _v_group!="":
+                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
+                                            for _group, _v_group in _iter.groupdict().items():
+                                                if _v_group is not None and _v_group != "":
                                                     _role = _group.split("_")[0]
                                                     _direct = _group.split("_")[1]
-                                                    _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role)
-                                                    if _i_span==0 and _direct=="left" and re.search('各供应商|尊敬的供应商', list_spans[0])==None: #2021/12/22 修正错误中标召回 例子208668937
+                                                    _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
+                                                    # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                    #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商',
+                                                                                                        list_spans[
+                                                                                                            0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
-                                                        _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
-                                                        list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
-                                                    if _i_span==1 and _direct=="center":
+                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                        _prob_weight = 1.2 if _weight=='w1' else 1
+                                                        # print('_v_group:',_group, _v_group, p_entity.entity_text)
+
+                                                    if _i_span == 1 and _direct == "center":
                                                         _flag = True
-                                                        _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
-                                                        list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
-                                                    if _i_span==2 and _direct=="right":
+                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
+                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
+
+                                                    if _i_span == 2 and _direct == "right":
                                                         _flag = True
-                                                        _distance = _iter.span()[0]
-                                                        list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
-
-
-                                # print(list_distance)
-
-                                # for _key in self.dict_list_pattern.keys():
-                                #
-                                #     for pattern in self.dict_list_pattern[_key]:
-                                #         if pattern[0]=="L":
-                                #             for _iter in re.finditer(pattern[1], spans[0][-30:]):
-                                #                 _flag = True
-                                #                 if len(spans[0])-_iter.span()[1]<list_distance[int(_key)]:
-                                #                     list_distance[int(_key)] = len(spans[0])-_iter.span()[1]-(_iter.span()[1]-_iter.span()[0])
-                                #
-                                #         if pattern[0]=="C":
-                                #             if re.search(pattern[1],spans[0]) is None and re.search(pattern[1],spans[2]) is None and re.search(pattern[1],spans[0]+spans[1]+spans[2]) is not None:
-                                #                 _flag = True
-                                #                 list_distance[int(_key)] = 0
-                                #
-                                #         if pattern[0]=="R":
-                                #             for _iter in re.finditer(pattern[1], spans[2][:30]):
-                                #                 _flag = True
-                                #                 if _iter.span()[0]<list_distance[int(_key)]:
-                                #                     list_distance[int(_key)] = _iter.span()[0]
-                                #         if pattern[0]=="W":
-                                #             spans = spanWindow(tokens, begin_index, end_index, size=20, center_include=True, word_flag=True, use_text=False)
-                                #             for _iter in re.finditer(pattern[1], "".join(spans)):
-                                #                 _flag = True
-                                #                 if _iter.span()[0]<list_distance[int(_key)]:
-                                #                     list_distance[int(_key)] = _iter.span()[0]
-
-
-                                # print("==",list_distance)
-                                #得到结果
-                                _label = np.argmin(list_distance)
-                                if _flag:
-                                    # if _label==2 and min(list_distance[3:])<100:
-                                    #     _label += np.argmin(list_distance[3:])+1
-                                    if _label in [2,3,4]:
-                                        if p_entity.entity_type in ["company","org"]:
-                                            p_entity.label = _label
-                                            p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
-                                    else:
-                                        p_entity.label = _label
-                                        p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
-                # if p_entity.entity_type=="location":
-                #     for _sentence in list_sentence:
-                #         if _sentence.sentence_index==p_entity.sentence_index:
-                #             _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=5,center_include=True,word_flag=True,text=p_entity.entity_text)
-                #             if re.search(self.pattern_winTenderer_location,_span[0][-10:]) is not None and re.search("地址|地点",_span[0]) is None:
-                #                 p_entity.entity_type="company"
-                #                 _label = "2"
-                #                 p_entity.label = _label
-                #                 p_entity.values = [0]*6
-                #                 p_entity.values[int(_label)] = on_value
-
-
-                #确定性强的特殊修改
-                if p_entity.entity_type in ["company","org"]:
-                    for s_index in range(len(list_sentence)):
-                        if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
-                            tokens = list_sentence[s_index].tokens
-                            begin_index = p_entity.begin_index
-                            end_index = p_entity.end_index
-                            size = 15
-                            spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
-                            #距离
-                            list_distance = [100,100,100,100,100]
-                            _flag = False
-                            for _key in self.dict_list_pattern.keys():
-                                for pattern in self.dict_list_pattern[_key]:
-                                    if pattern[0]=="W":
-                                        spans = spanWindow(tokens, begin_index, end_index, size=30, center_include=True, word_flag=True, use_text=False)
-                                        for _iter in re.finditer(pattern[1], spans[0][-10:]+spans[1]+spans[2]):
-                                            _flag = True
-                                            if _iter.span()[0]<list_distance[int(_key)]:
-                                                list_distance[int(_key)] = _iter.span()[0]
-                            #得到结果
-                            _label = np.argmin(list_distance)
-                            if _flag:
-                                if _label==2 and min(list_distance[3:])<100:
-                                    _label += np.argmin(list_distance[3:])+1
-                                if _label in [2,3,4]:
+                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
+                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
+
+                                    # 得到结果
+                                    if _flag:
                                         p_entity.label = _label
-                                        p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
-                                else:
-                                    p_entity.label = _label
-                                    p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
+                                        p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
+                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
+                                        break
+
+                # 其他金额通过正则召回可能是招标或中投标的金额
                 if p_entity.entity_type in ["money"]:
-                    if str(p_entity.label)=="2":
+                    if str(p_entity.label) == "2":
                         for _sentence in list_sentence:
-                            if _sentence.sentence_index==p_entity.sentence_index:
-                                _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
-                                if re.search(self.pattern_money_tenderee,_span[0]) is not None and re.search(self.pattern_money_other,_span[0]) is None:
-                                    p_entity.values[0] = 0.8+p_entity.values[0]/10
+                            if _sentence.sentence_index == p_entity.sentence_index:
+                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                                   end_index=p_entity.end_index, size=20, center_include=True,
+                                                   word_flag=True, text=p_entity.entity_text)
+                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
+                                        self.pattern_money_other, _span[0]) is None:
+                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                     p_entity.label = 0
-                                if re.search(self.pattern_money_tenderer,_span[0]) is not None:
-                                    if re.search(self.pattern_money_other,_span[0]) is not None:
-                                        if re.search(self.pattern_money_tenderer,_span[0]).span()[1]>re.search(self.pattern_money_other,_span[0]).span()[1]:
-                                            p_entity.values[1] = 0.8+p_entity.values[1]/10
+                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
+                                    if re.search(self.pattern_money_other, _span[0]) is not None:
+                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
+                                                re.search(self.pattern_money_other, _span[0]).span()[1]:
+                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                             p_entity.label = 1
                                     else:
-                                        p_entity.values[1] = 0.8+p_entity.values[1]/10
+                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                         p_entity.label = 1
-                                if re.search(self.pattern_money_tenderer_whole,"".join(_span)) is not None and re.search(self.pattern_money_other,_span[0]) is None:
-                                    p_entity.values[1] = 0.8+p_entity.values[1]/10
+                                if re.search(self.pattern_money_tenderer_whole,
+                                             "".join(_span)) is not None and re.search(self.pattern_money_other,
+                                                                                       _span[0]) is None:
+                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                     p_entity.label = 1
-                                    
-            #增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
+
+            # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将未识别的金额设置为招标金额
             list_p = []
             state = 0
             for p_entity in list_entity:
                 for _sentence in list_sentence:
-                    if _sentence.sentence_index==p_entity.sentence_index:
-                        _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
-                        
-                        if state==2:
+                    if _sentence.sentence_index == p_entity.sentence_index:
+                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                           end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
+                                           text=p_entity.entity_text)
+
+                        if state == 2:
                             for _p in list_p[1:]:
-                                
-                                _p.values[0] = 0.8+_p.values[0]/10
+                                _p.values[0] = 0.8 + _p.values[0] / 10
                                 _p.label = 0
                             state = 0
                             list_p = []
-                        
-                        if state==0:
+
+                        if state == 0:
                             if p_entity.entity_type in ["money"]:
-                                if str(p_entity.label)=="0" and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None:
+                                if str(p_entity.label) == "0" and re.search(self.pattern_pack,
+                                                                            _span[0] + "-" + _span[2]) is not None:
                                     state = 1
                                     list_p.append(p_entity)
-                        elif state==1:
+                        elif state == 1:
                             if p_entity.entity_type in ["money"]:
-                                if str(p_entity.label) in ["0","2"] and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None and re.search(self.pattern_money_other,_span[0]+"-"+_span[2]) is None and p_entity.sentence_index==list_p[0].sentence_index:
+                                if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
+                                                                                   _span[0] + "-" + _span[
+                                                                                       2]) is not None and re.search(
+                                        self.pattern_money_other,
+                                        _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
+                                    0].sentence_index:
                                     list_p.append(p_entity)
                                 else:
                                     state = 2
-                        
-            if len(list_p)>1:
+
+            if len(list_p) > 1:
                 for _p in list_p[1:]:
-                    #print("==",_p.entity_text,_p.sentence_index,_p.label)
-                    _p.values[0] = 0.8+_p.values[0]/10
+                    # print("==",_p.entity_text,_p.sentence_index,_p.label)
+                    _p.values[0] = 0.8 + _p.values[0] / 10
                     _p.label = 0
                 state = 0
                 list_p = []
-                    
-                
+
             for p_entity in list_entity:
-                #将属于集合中的不可能是中标人的标签置为无
+                # 将属于集合中的不可能是中标人的标签置为无
                 if p_entity.entity_text in self.SET_NOT_TENDERER:
-                    p_entity.label=5
+                    p_entity.label = 5
 
 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
 class RoleRuleFinalAdd():
@@ -1420,14 +1374,15 @@ class RoleRuleFinalAdd():
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
         sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}年.{1,2}月.{1,2}日', text_end)
         sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_ent3 = re.search('报名咨询,([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent3 = re.search('(报名咨询|收货地点|送货地点)[:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
         if sear_ent or sear_ent2 or sear_ent3:
             if sear_ent3:
-                ent_re = sear_ent3.group(1).replace("(","(").replace(")",")")
+                ent_re = sear_ent3.group(2)
             elif sear_ent2:
-                ent_re = sear_ent2.group(2).replace("(","(").replace(")",")")
+                ent_re = sear_ent2.group(2)
             else:
-                ent_re = sear_ent.group(1).replace(',', '').replace("(","(").replace(")",")")
+                ent_re = sear_ent.group(1)
+            ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
             tenderee_notfound = True
             agency_notfound = True
             ents = []
@@ -1448,6 +1403,7 @@ class RoleRuleFinalAdd():
                     if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
                         ents[i].label = 1
                         ents[i].values[1] = 0.5
+                        # log('正则最后补充实体: %s'%(ent_re))
                         break
 
             elif tenderee_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None:
@@ -1459,6 +1415,7 @@ class RoleRuleFinalAdd():
                     if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
                         ents[i].label = 0
                         ents[i].values[0] = 0.5
+                        # log('正则最后补充实体: %s'%(ent_re))
                         break
 
 
@@ -1628,6 +1585,16 @@ class ProductPredictor():
                     batch_paths = self.decode(scores, lengths, tran_)
                     for text, path, length in zip(text_list, batch_paths, lengths):
                         tags = ''.join([str(it) for it in path[:length]])
+                        for it in re.finditer("12*3", tags):
+                            start = it.start()
+                            end = it.end()
+                            _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % (
+                                list_articles[0].doc_id, 0, start, end),
+                                             entity_text=text[start:end],
+                                             entity_type="product", sentence_index=0,
+                                             begin_index=0, end_index=0, wordOffset_begin=start,
+                                             wordOffset_end=end)
+                            list_entitys[0].append(_entity)
                         for it in re.finditer("45*6", tags):
                             start = it.start()
                             end = it.end()