2 éve · c17352ef64
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2201,10 +2201,17 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = re.sub('(招标|采购)人(概况|信息)：?[，。]', '采购人信息：', article_processed)  # 2022/8/10统一表达
			
 
				         article_processed = article_processed.replace('\（%）', '')    # 中标（成交）金额（元）\（%）：498888.00， 处理 江西省政府采购网  金额特殊问题
			
 
				         article_processed = re.sub('金额：?（(可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元（）\d]{8,20})）：?', '金额：', article_processed)    # 中标（成交）金额：（可填写下浮率、折扣率或费率）：29.3万元  金额特殊问题
			
 
				-        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?：(?P<tenderee>\w{4,15})/(?P<agency>\w{4,15})[，。]', article_processed)
			
 
				+        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?：(?P<tenderee>[\w（）]{4,25}(/[\w（）]{4,25})?)/(?P<agency>[\w（）]{4,25})[，。]', article_processed)
			
 
				         if ser:
			
 
				-            article_processed = article_processed.replace(ser.group(0), '采购人名称： %s，采购代理机构名称：%s，' % (ser.group('tenderee'), ser.group('agency')))
			
 
				+            article_processed = article_processed.replace(ser.group(0), '采购人名称：%s，采购代理机构名称：%s，' % (ser.group('tenderee'), ser.group('agency')))
			
 
				 
			
 
				+        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?：(?P<tenderee>[\w（）]{4,25})[，。]', article_processed)
			
 
				+        if ser2:
			
 
				+            article_processed = article_processed.replace(ser2.group(0), '采购人名称：%s，采购代理机构名称：，' % (
			
 
				+            ser2.group('tenderee')))
			
 
				+        if re.search('中标单位名称：[\w（）]{5,25}，中标候选人名次：\d，', article_processed) and re.search('中标候选人名次：\d，中标单位名称：[\w（）]{5,25}，', article_processed)==None:  # 处理类似 304706608 此篇的数据源正文特殊表达
			
 
				+            for it in re.finditer('(?P<tenderer>(中标单位名称：[\w（）]{5,25}，))(?P<rank>(中标候选人名次：\d，))', article_processed):
			
 
				+                article_processed = article_processed.replace(it.group(0), it.group('rank')+it.group('tenderer'))
			
 
				 
			
 
				         '''去除业绩内容'''
			
 
				         article_processed = del_achievement(article_processed)
			
@@ -2425,7 +2432,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				     '''
			
 
				 
			
 
				     list_entitys = []
			
 
				-    not_extract_roles = ['黄埔军校'] # 需要过滤掉的企业单位
			
 
				+    not_extract_roles = ['黄埔军校', '国有资产管理处'] # 需要过滤掉的企业单位
			
 
				     for list_sentence in list_sentences:
			
 
				         sentences = []
			
 
				         list_entitys_temp = []
			
--- a/BiddingKG/dl/interface/agency_set.pkl
+++ b/BiddingKG/dl/interface/agency_set.pkl
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -257,7 +257,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2023-02-20'}
			
 
				+    version_date = {'version_date': '2023-03-09'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
			
 
				     data_res["doctitle_refine"] = doctitle_refine
			
 
				     data_res["nlp_enterprise"] = nlp_enterprise
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -35,6 +35,10 @@ sess_config = tf.ConfigProto(
 
				                         log_device_placement=True)
			
 
				 sess_config = None
			
 
				 
			
 
				+file = os.path.dirname(__file__) + '/agency_set.pkl'
			
 
				+with open(file, 'rb') as f:
			
 
				+    agency_set = pickle.load(f)
			
 
				+
			
 
				 from threading import RLock
			
 
				 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
			
 
				               "prem":{"predictor":None,"Lock":RLock()},
			
@@ -418,6 +422,8 @@ class CodeNamePredict():
 
				                                 #     item['code'] = list(code_set)
			
 
				                     for iter in re.finditer(self.PN_pattern,join_predict):
			
 
				                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
			
 
				+                        if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
			
 
				+                            continue
			
 
				 
			
 
				                         #add name to entitys
			
 
				                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
			
@@ -426,7 +432,8 @@ class CodeNamePredict():
 
				                         w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[:：\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
			
 
				                         if _name not in dict_name_freq_score:
			
 
				                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
			
 
				-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w+(5-sentence.sentence_index)*0.2]
			
 
				+                            len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
			
 
				+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
			
 
				                         else:
			
 
				                             dict_name_freq_score[_name][0] += 1
			
 
				                     '''
			
@@ -454,6 +461,9 @@ class CodeNamePredict():
 
				                     othername = re.search(name_re1, sentence.sentence_text)
			
 
				                     if othername != None:
			
 
				                         project_name = othername.group('name')
			
 
				+                        if re.search('[\u4e00-\u9fa5]+', project_name) == None:  # 没有中文的项目名称去除
			
 
				+                            # log('没有中文的项目名称去除')
			
 
				+                            continue
			
 
				                         beg = find_index([project_name], sentence.sentence_text)[0]
			
 
				                         end = beg + len(project_name)
			
 
				                         _name = self.fitDataByRule(sentence.sentence_text[beg:end])
			
@@ -1351,11 +1361,18 @@ class RoleRulePredictor():
 
				                                                                                                              p_entity.entity_text)))
			
 
				                                         if str(_span[1] + _span[2][:len(str(_name))]).find(
			
 
				                                                 _name) >= 0:
			
 
				-                                            find_flag = True
			
 
				-                                            _label = 0
			
 
				-                                            p_entity.label = _label
			
 
				-                                            p_entity.values[int(_label)] = on_value
			
 
				-                                            break
			
 
				+                                            if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
			
 
				+                                                find_flag = True
			
 
				+                                                _label = 1
			
 
				+                                                p_entity.label = _label
			
 
				+                                                p_entity.values[int(_label)] = on_value
			
 
				+                                                break
			
 
				+                                            else:
			
 
				+                                                find_flag = True
			
 
				+                                                _label = 0
			
 
				+                                                p_entity.label = _label
			
 
				+                                                p_entity.values[int(_label)] = on_value
			
 
				+                                                break
			
 
				                                     if p_entity.sentence_index >= 4:
			
 
				                                         break
			
 
				                             if find_flag:
			
@@ -1549,6 +1566,7 @@ class RoleRuleFinalAdd():
 
				         for sentence in main_sentences[-5:]:
			
 
				             end_tokens.extend(sentence.tokens)
			
 
				         text_end = "".join(end_tokens[-30:])
			
 
				+        text_end = re.sub('，?(招标办|招投标管理中心|国有资产管理处)', '', text_end)  # 处理 类似 285264698 传真：0512-62690315，苏州卫生职业技术学院，国有资产管理处，2022年11月24日， 这种情况
			
 
				         # print(text_end)
			
 
				         # sear_ent = re.search('[，。]([\u4e00-\u9fa5()（）]{5,20})，?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
			
 
				         sear_ent = re.search('[，。；](?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,6}(分公司|部))?)，?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
			
@@ -1561,6 +1579,7 @@ class RoleRuleFinalAdd():
 
				         tenderee_notfound = True
			
 
				         agency_notfound = True
			
 
				         tenderee_list = []
			
 
				+        agency_list = []
			
 
				         ents = []
			
 
				         for ent in list_entitys[0]:
			
 
				             if ent.entity_type in ['org', 'company']:
			
@@ -1571,6 +1590,7 @@ class RoleRuleFinalAdd():
 
				                     tenderee_list.append(ent.entity_text)
			
 
				                     tenderee_notfound = False
			
 
				                 elif ent.label == 1:
			
 
				+                    agency_list.append(ent.entity_text)
			
 
				                     agency_notfound = False
			
 
				                 elif ent.label == 5:
			
 
				                     if '公共资源交易中心' in ent.entity_text:
			
@@ -1581,8 +1601,9 @@ class RoleRuleFinalAdd():
 
				                 ent_re = _sear_ent.group('entity')
			
 
				                 ent_re = ent_re.replace('，', '').replace("(","（").replace(")","）")
			
 
				 
			
 
				-                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
			
 
				-                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
			
 
				+                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
			
 
				+                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None) \
			
 
				+                        and ent_re not in agency_list and ent_re not in agency_set:
			
 
				                     n = 0
			
 
				                     for i in range(len(ents) - 1, -1, -1):
			
 
				                         if not ents[i].in_attachment:
			
@@ -1595,7 +1616,8 @@ class RoleRuleFinalAdd():
 
				                             tenderee_notfound = False
			
 
				                             # log('正则最后补充实体： %s'%(ent_re))
			
 
				                             break
			
 
				-                elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) and ent_re not in tenderee_list:
			
 
				+                elif agency_notfound == True and ent_re not in tenderee_list and (
			
 
				+                        re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
			
 
				                     n = 0
			
 
				                     for i in range(len(ents) - 1, -1, -1):
			
 
				                         if not ents[i].in_attachment:
			
@@ -1927,6 +1949,7 @@ class RoleGrade():
 
				     def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
			
 
				         '''
			
 
				         根据规则给角色分配不同等级概率；分三级：0.9-1，0.8-0.9，0.7-0.8；附件0.7-0.8，0.6-0.7，0.5-0.6
			
 
				+        修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
			
 
				         :param list_articles:
			
 
				         :param list_sentences:
			
 
				         :param list_entitys:
			
@@ -1984,6 +2007,10 @@ class RoleGrade():
 
				                         company_winner.append(entity)  # 保存中标人实体
			
 
				                 if entity.label == 0 and entity.values[entity.label]> min_prob:
			
 
				                     org_tenderee.append(entity.entity_text)  # 保存所有招标人名称
			
 
				+            if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6:  # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
			
 
				+                # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
			
 
				+                entity.label = 1
			
 
				+                entity.values[entity.label] = 0.5
			
 
				 
			
 
				         if org_winner != []:
			
 
				             flag = 0