Explorar el Código

fingerprint输入纠正;连续重复名称去重;角色概率微调

lsm hace 1 año
padre
commit
4e87486982

+ 18 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -2224,6 +2224,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     for article in articles:
         doc_id = article[0]
         sourceContent = article[1]
+        sourceContent_raw = article[1] # 原始html数据,fingerprint计算MD5用
         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
         sourceContent = re.sub("##attachment##","",sourceContent)
         sourceContent = sourceContent.replace('<br/>', '<br>')
@@ -2237,6 +2238,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         #         sourceContent = sourceContent.replace(br_match,_new,1)
         _send_doc_id = article[3]
         _title = article[4]
+        _title_raw = article[4]
         page_time = article[5]
         web_source_no = article[6]
         '''特别数据源对 html 做特别修改'''
@@ -2364,7 +2366,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         #article_processed = article[1]
         _article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title,
                            bidway=bidway)
-        _article.fingerprint = getFingerprint(_title+sourceContent)
+        _article.fingerprint = getFingerprint(_title_raw+sourceContent_raw)
         _article.page_time = page_time
         list_articles.append(_article)
     return list_articles
@@ -2719,7 +2721,19 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                 continue
             money_list.append((entity_text, start_index, end_index, unit, notes))
     return money_list, found_yeji
-
def cut_repeat_name(s, min_unit_len=4):
    """Collapse a name made of one unit repeated back-to-back into that unit.

    Example: a hospital name written three times in a row is reduced to a
    single copy of the name.

    The shortest prefix of ``s`` (at least ``min_unit_len`` characters long)
    that tiles the whole string two or more times is returned.  Searching
    every candidate length, instead of anchoring on the first occurrence of
    the last four characters, also handles units whose own suffix appears
    more than once inside the unit (e.g. ``"abcdXabcd" * 2``).

    :param s: name text to normalise; falsy values (``None``, ``""``) are
        returned unchanged.
    :param min_unit_len: minimum length of the repeated unit.  The default
        of 4 matches the previous behaviour, which never collapsed strings
        shorter than 8 characters or units shorter than 4 characters.
    :return: the single repeated unit when ``s`` is an exact repetition of
        it, otherwise ``s`` itself.
    """
    if not s:
        return s

    total = len(s)
    # A unit must fit at least twice, so its length is bounded by total // 2.
    for unit_len in range(max(1, min_unit_len), total // 2 + 1):
        if total % unit_len:
            # Only lengths that divide the string evenly can tile it exactly.
            continue
        unit = s[:unit_len]
        if unit * (total // unit_len) == s:
            return unit
    return s
 def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
@@ -2913,6 +2927,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         # print('公司实体不符合规范:', entity_text)
                         continue
 
+                entity_text = cut_repeat_name(entity_text) # 20231201 重复名称去重 如:中山大学附属第一医院中山大学附属第一医院中山大学附属第一医院
+
                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1 or sentence_index==doctextcon_sentence_len-1:

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -342,7 +342,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-11-20'}
+    version_date = {'version_date': '2023-12-01'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 12 - 4
BiddingKG/dl/interface/predictor.py

@@ -1530,13 +1530,19 @@ class RoleRulePredictor():
 
                                 if re.search('(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题):$', _span[0]):
                                     find_flag = True
-                                    p_entity.values[0] = on_value  # 项目名称里面实体修改为最低概率
+                                    if re.search('(局|院|府|学|处|站|会|所|校|馆|队|厅|室|司|心|园|厂)$', p_entity.entity_text):
+                                        p_entity.values[0] = 0.6 if p_entity.values[0]>0.6 else 0.55
+                                    else:
+                                        p_entity.values[0] = on_value  # 项目名称里面实体修改为最低概率
                                     break
 
                                 for _name in name_entitys:
                                     if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
                                         find_flag = True
-                                        p_entity.values[0] = on_value # 项目名称里面实体修改为最低概率
+                                        if re.search('(局|院|府|学|处|站|会|所|校|馆|队|厅|室|司|心|园|厂)$', p_entity.entity_text):
+                                            p_entity.values[0] = 0.6 if p_entity.values[0] > 0.6 else 0.55
+                                        else:
+                                            p_entity.values[0] = on_value # 项目名称里面实体修改为最低概率
                                         break
                                         # if p_entity.values[0] > on_value:
                                         #     p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
@@ -1835,7 +1841,7 @@ class RoleRuleFinalAdd():
                 if ent.label == 0 and ent.values[ent.label]>0.5:
                     if '公共资源交易中心' in ent.entity_text:  # 公共资源交易中心不算招标或代理,只算平台
                         # ent.label = 5
-                        ent.values[ent.label] = 0.5 # 改为降低概率,不改类别,防止 336220759 明显招标人表达不提取
+                        ent.values[ent.label] = 0.6 if ent.values[ent.label]>0.6 else 0.5 # 改为降低概率,不改类别,防止 382573066 明显招标人表达不提取
                         continue
                     tenderee_list.append(ent.entity_text)
                     tenderee_notfound = False
@@ -2224,7 +2230,9 @@ class RoleGrade():
                             _prob = _prob - 0.1 # 0.2
                         if pre_prob < _prob: # 如果模型预测概率小于关键词概率
                             _prob = 0.65
-                        if len(entity.entity_text) < 6: # 如果实体名称小于6个字,概率再降0.05
+                        if len(entity.entity_text) < 6 and re.search('大学|医院', entity.entity_text)==None: # 如果实体名称小于6个字,概率再降0.05
+                            _prob -= 0.05
+                        if re.search('(地址|联系方式):$', context): # 地址结尾的概率 概率降低
                             _prob -= 0.05
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0