Forráskód Böngészése

优化项目名称提取;优化招标人代理人提取_大数据统计代理人集合;优化招标人/代理人斜杠分割提取;处理特殊中标表达名次在后面

lsm 2 éve
szülő
commit
c17352ef64

+ 10 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -2201,10 +2201,17 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
         article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
-        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>\w{4,15})/(?P<agency>\w{4,15})[,。]', article_processed)
+        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
-            article_processed = article_processed.replace(ser.group(0), '采购人名称: %s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
+            article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
 
+        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25})[,。]', article_processed)
+        if ser2:
+            article_processed = article_processed.replace(ser2.group(0), '采购人名称:%s,采购代理机构名称:,' % (
+            ser2.group('tenderee')))
+        if re.search('中标单位名称:[\w()]{5,25},中标候选人名次:\d,', article_processed) and re.search('中标候选人名次:\d,中标单位名称:[\w()]{5,25},', article_processed)==None:  # 处理类似 304706608 此篇的数据源正文特殊表达
+            for it in re.finditer('(?P<tenderer>(中标单位名称:[\w()]{5,25},))(?P<rank>(中标候选人名次:\d,))', article_processed):
+                article_processed = article_processed.replace(it.group(0), it.group('rank')+it.group('tenderer'))
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -2425,7 +2432,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
     list_entitys = []
-    not_extract_roles = ['黄埔军校'] # 需要过滤掉的企业单位
+    not_extract_roles = ['黄埔军校', '国有资产管理处'] # 需要过滤掉的企业单位
     for list_sentence in list_sentences:
         sentences = []
         list_entitys_temp = []

BIN
BiddingKG/dl/interface/agency_set.pkl


+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -257,7 +257,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-02-20'}
+    version_date = {'version_date': '2023-03-09'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 36 - 9
BiddingKG/dl/interface/predictor.py

@@ -35,6 +35,10 @@ sess_config = tf.ConfigProto(
                         log_device_placement=True)
 sess_config = None
 
+file = os.path.dirname(__file__) + '/agency_set.pkl'
+with open(file, 'rb') as f:
+    agency_set = pickle.load(f)
+
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
@@ -418,6 +422,8 @@ class CodeNamePredict():
                                 #     item['code'] = list(code_set)
                     for iter in re.finditer(self.PN_pattern,join_predict):
                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
+                        if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
+                            continue
 
                         #add name to entitys
                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
@@ -426,7 +432,8 @@ class CodeNamePredict():
                         w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                         if _name not in dict_name_freq_score:
                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w+(5-sentence.sentence_index)*0.2]
+                            len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
                         else:
                             dict_name_freq_score[_name][0] += 1
                     '''
@@ -454,6 +461,9 @@ class CodeNamePredict():
                     othername = re.search(name_re1, sentence.sentence_text)
                     if othername != None:
                         project_name = othername.group('name')
+                        if re.search('[\u4e00-\u9fa5]+', project_name) == None:  # 没有中文的项目名称去除
+                            # log('没有中文的项目名称去除')
+                            continue
                         beg = find_index([project_name], sentence.sentence_text)[0]
                         end = beg + len(project_name)
                         _name = self.fitDataByRule(sentence.sentence_text[beg:end])
@@ -1351,11 +1361,18 @@ class RoleRulePredictor():
                                                                                                              p_entity.entity_text)))
                                         if str(_span[1] + _span[2][:len(str(_name))]).find(
                                                 _name) >= 0:
-                                            find_flag = True
-                                            _label = 0
-                                            p_entity.label = _label
-                                            p_entity.values[int(_label)] = on_value
-                                            break
+                                            if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
+                                                find_flag = True
+                                                _label = 1
+                                                p_entity.label = _label
+                                                p_entity.values[int(_label)] = on_value
+                                                break
+                                            else:
+                                                find_flag = True
+                                                _label = 0
+                                                p_entity.label = _label
+                                                p_entity.values[int(_label)] = on_value
+                                                break
                                     if p_entity.sentence_index >= 4:
                                         break
                             if find_flag:
@@ -1549,6 +1566,7 @@ class RoleRuleFinalAdd():
         for sentence in main_sentences[-5:]:
             end_tokens.extend(sentence.tokens)
         text_end = "".join(end_tokens[-30:])
+        text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处)', '', text_end)  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
         # print(text_end)
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
         sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
@@ -1561,6 +1579,7 @@ class RoleRuleFinalAdd():
         tenderee_notfound = True
         agency_notfound = True
         tenderee_list = []
+        agency_list = []
         ents = []
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
@@ -1571,6 +1590,7 @@ class RoleRuleFinalAdd():
                     tenderee_list.append(ent.entity_text)
                     tenderee_notfound = False
                 elif ent.label == 1:
+                    agency_list.append(ent.entity_text)
                     agency_notfound = False
                 elif ent.label == 5:
                     if '公共资源交易中心' in ent.entity_text:
@@ -1581,8 +1601,9 @@ class RoleRuleFinalAdd():
                 ent_re = _sear_ent.group('entity')
                 ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
 
-                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
-                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
+                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
+                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None) \
+                        and ent_re not in agency_list and ent_re not in agency_set:
                     n = 0
                     for i in range(len(ents) - 1, -1, -1):
                         if not ents[i].in_attachment:
@@ -1595,7 +1616,8 @@ class RoleRuleFinalAdd():
                             tenderee_notfound = False
                             # log('正则最后补充实体: %s'%(ent_re))
                             break
-                elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) and ent_re not in tenderee_list:
+                elif agency_notfound == True and ent_re not in tenderee_list and (
+                        re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
                     n = 0
                     for i in range(len(ents) - 1, -1, -1):
                         if not ents[i].in_attachment:
@@ -1927,6 +1949,7 @@ class RoleGrade():
     def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
         '''
         根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
+        修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
         :param list_articles:
         :param list_sentences:
         :param list_entitys:
@@ -1984,6 +2007,10 @@ class RoleGrade():
                         company_winner.append(entity)  # 保存中标人实体
                 if entity.label == 0 and entity.values[entity.label]> min_prob:
                     org_tenderee.append(entity.entity_text)  # 保存所有招标人名称
+            if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6:  # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
+                # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
+                entity.label = 1
+                entity.values[entity.label] = 0.5
 
         if org_winner != []:
             flag = 0