瀏覽代碼

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

znj 2 年之前
父節點
當前提交
576a80d0e3
共有 2 個文件被更改,包括 37 次插入和 8 次删除
  1. 1 1
      BiddingKG/dl/interface/extract.py
  2. 36 7
      BiddingKG/dl/interface/predictor.py

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -257,7 +257,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-02-07'}
+    version_date = {'version_date': '2023-02-20'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 36 - 7
BiddingKG/dl/interface/predictor.py

@@ -1291,6 +1291,10 @@ class RoleRulePredictor():
                                                    end_index=p_entity.end_index, size=20, center_include=True,
                                                    word_flag=True, use_text=True,
                                                    text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
+                                if re.search(self.pattern_tenderee_left, _span[0]) or re.search(self.pattern_tenderee_left_w0, _span[0]): # 前面有关键词的实体不判断是否在项目名称中出现
+                                    find_flag = True
+                                    break
+
                                 for _name in list_name:
                                     if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  #加上前面一些信息,修复公司不在项目名称开头的,检测不到
                                         find_flag = True
@@ -1880,17 +1884,17 @@ class TendereeRuleRecall():
 
 class RoleGrade():
     def __init__(self):
-        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|甲)(人|方|单位))"
+        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
         self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
-        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
+        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包|甲)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一]|排名:1)"
-        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商))"
+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]|排名:1)"
+        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
         self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
         self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
                              self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
-    def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
+    def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
         '''
         根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
         :param list_articles:
@@ -1901,8 +1905,11 @@ class RoleGrade():
         '''
         sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
         role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
+        org_winner = []
+        company_winner = []
+        org_tenderee = []
         for entity in list_entitys[0]:
-            if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.5:
+            if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> min_prob:
                 text = sentences[entity.sentence_index].sentence_text
                 in_att = sentences[entity.sentence_index].in_attachment
                 pre_prob = entity.values[entity.label]
@@ -1940,6 +1947,28 @@ class RoleGrade():
                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
                     # print('找不到规则修改角色概率:', entity.entity_text, entity.label, entity.values)
 
+                if entity.label == 2 and entity.values[entity.label]> min_prob:
+                    if entity.entity_type == 'org':
+                        org_winner.append(entity)
+                    elif entity.entity_type == 'company':
+                        company_winner.append(entity)  # 保存中标人实体
+                if entity.label == 0 and entity.values[entity.label]> min_prob:
+                    org_tenderee.append(entity.entity_text)  # 保存所有招标人名称
+
+        if org_winner != []:
+            flag = 0
+            if org_tenderee != []:
+                for ent in org_winner:
+                    if ent.entity_text in org_tenderee:
+                        # log('如果org中标人同时为招标人角色,降低中标概率:%s, %s' % (ent.entity_text, ent.label))
+                        ent.values[2] = 0.6
+                        flag = 1
+            if flag == 0 and company_winner != []:
+                for ent in org_winner:
+                    if ent.label == 2 and ent.values[2] > 0.6:
+                        # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2]))
+                        ent.values[2] = 0.6
+
 
 class MoneyGrade():
     def __init__(self):
@@ -2750,7 +2779,7 @@ class ProductAttributesPredictor():
                         product = tds[id1]
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
-                                _quantity = tds[id2]
+                                quantity = tds[id2]
                                 # quantity = re.sub('[()(),,约]', '', quantity)
                                 # quantity = re.sub('[一壹]', '1', quantity)
                                 # ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)