3 年之前 · 07e9a9878b
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -62,6 +62,15 @@ def link_entitys(list_entitys,on_value=0.8):
 
				                         if len(_ent.entity_text)>len(_entity.entity_text):
			
 
				                             _entity.entity_text = _ent.entity_text
			
 
				 
			
 
				+        # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
			
 
				+        for _entity in range_entity:
			
 
				+            for _ent in _entity.linked_entitys:
			
 
				+                print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
			
 
				+                if re.search("公司$", _ent.entity_text) is not None \
			
 
				+                        and _ent.if_dict_match == 1:
			
 
				+                    if len(_ent.entity_text) > len(_entity.entity_text):
			
 
				+                        _entity.entity_text = _ent.entity_text
			
 
				+
			
 
				 
			
 
				 def getEnterprisePath():
			
 
				     filename = "LEGAL_ENTERPRISE.txt"
			
@@ -146,6 +155,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				         for p_sentence in list_sentence:
			
 
				             sentence = p_sentence.sentence_text
			
 
				             list_match = match_enterprise_max_first(sentence)
			
 
				+            print("list_match", list_match)
			
 
				 
			
 
				             doc_id = p_sentence.doc_id
			
 
				             sentence_index = p_sentence.sentence_index
			
@@ -164,10 +174,14 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                     if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
			
 
				                         find_flag = True
			
 
				                         p_entity.entity_type = "company"
			
 
				+                        p_entity.if_dict_match = 1
			
 
				 
			
 
				                     if p_entity.entity_type not in ["location","org","company"]:
			
 
				                         continue
			
 
				 
			
 
				+                    if _match["entity_text"] == p_entity.entity_text:
			
 
				+                        p_entity.if_dict_match = 1
			
 
				+
			
 
				                     #有重叠
			
 
				                     #match部分被包含则不处理
			
 
				                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
			
@@ -189,6 +203,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                             p_entity.wordOffset_end = _match["end_index"]
			
 
				                             p_entity.begin_index = begin_index
			
 
				                             p_entity.end_index = end_index
			
 
				+                            # 该公司实体是字典识别的
			
 
				+                            p_entity.if_dict_match = 1
			
 
				 
			
 
				                             for _match_h in range(_match_index+1,_match_j+1):
			
 
				                                 entity_text = list_match[_match_h]["entity_text"]
			
@@ -198,6 +214,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                                 end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
			
 
				                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
			
 
				                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
			
 
				+                                add_entity.if_dict_match = 1
			
 
				                                 list_entity.append(add_entity)
			
 
				 
			
 
				                                 range_entity.append(add_entity)
			
@@ -225,6 +242,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                                     p_entity.wordOffset_end = _match["end_index"]
			
 
				                                     p_entity.begin_index = begin_index
			
 
				                                     p_entity.end_index = end_index
			
 
				+                                    p_entity.if_dict_match = 1
			
 
				                         elif _match["end_index"]>=p_entity.wordOffset_end:
			
 
				                             match_replace = True
			
 
				                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
@@ -236,6 +254,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                             p_entity.begin_index = begin_index
			
 
				                             p_entity.end_index = end_index
			
 
				                             p_entity.entity_type = "company"
			
 
				+                            p_entity.if_dict_match = 1
			
 
				                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
			
 
				                         find_flag = True
			
 
				                         if p_entity.entity_type in ("org","company"):
			
@@ -248,12 +267,12 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
				                             p_entity.wordOffset_end = _match["end_index"]
			
 
				                             p_entity.begin_index = begin_index
			
 
				                             p_entity.end_index = end_index
			
 
				+                            p_entity.if_dict_match = 1
			
 
				                 if not find_flag:
			
 
				                     match_add = True
			
 
				                     entity_text = _match["entity_text"]
			
 
				                     entity_type = "company"
			
 
				 
			
 
				-
			
 
				                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
			
 
				                     end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
			
 
				                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
			
--- a/BiddingKG/dl/interface/Entitys.py
+++ b/BiddingKG/dl/interface/Entitys.py
@@ -169,6 +169,7 @@ class Entity():
 
				         self.is_tail = False
			
 
				         self.notes = ''  # 2021/7/20 新增，保存金额大小写，单位等备注
			
 
				         self.money_unit = '' #2021/8/17 新增，保存金额单位 元、万元 、亿元
			
 
				+        self.if_dict_match = 0 # 2021/12/21 新增，判断公司实体是否由字典识别得到
			
 
				 
			
 
				     def set_Role(self,role_label,role_values):
			
 
				         self.label = int(role_label)
			
--- a/BiddingKG/dl/money/re_money_total_unit.py
+++ b/BiddingKG/dl/money/re_money_total_unit.py
@@ -36,7 +36,10 @@ def re_standard_unit(_str):
 
				             keyword_index = [m_span[0], m_span[1]]
			
 
				             keyword = m_dict.get("value")
			
 
				             # unit_money_list.append([keyword, keyword_index])
			
 
				-            unit_money_list.append([keyword, keyword_index, _str])
			
 
				+
			
 
				+            # 上下文有招标文件的不算
			
 
				+            if '文件' not in _str:
			
 
				+                unit_money_list.append([keyword, keyword_index, _str])
			
 
				 
			
 
				     return unit_money_list
			
 
				 
			
--- a/BiddingKG/dl/ratio/re_ratio.py
+++ b/BiddingKG/dl/ratio/re_ratio.py
@@ -1,6 +1,13 @@
 
				 import re
			
 
				 
			
 
				-ratio = '((上浮|下浮)(率|).{0,2}[0-9.]+%)'
			
 
				+ratio = '([（(]?(上浮|下浮)(率|)(报价|)([(（]?%[）)]?|)[)）]?[：: ，]{0,3}[0-9]+.?[0-9]*[(（]?%?[）)]?)'
			
 
				+
			
 
				+# 基准利率上浮率）：大写：百分之叁拾点零零，小写：30.00%，
			
 
				+# 基准利率上浮率：百分之三十（30%）
			
 
				+# 租金上浮率
			
 
				+# 上浮率活期20%
			
 
				+# 上浮率：活期20%、一年定期35%
			
 
				+# 下浮率报价0.5%
			
 
				 
			
 
				 
			
 
				 def re_standard_ratio(_str):
			
@@ -37,7 +44,8 @@ def extract_ratio(text):
 
				 
			
 
				 def test_str():
			
 
				     s = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
			
 
				-    s = '年利率较基准利率的上浮率：30% 活期存款年利率：0.455% 协定存'
			
 
				+    s = '年利率较基准利率的上浮率（%）： 30 活期存款下浮率：0.455% 协定存的下浮率，（1-下浮率）' \
			
 
				+        ' 上浮率....  上浮率30（%）  (下浮率%):43  下浮率报价0.5%'
			
 
				     print(extract_ratio(s))
			
 
				 
			
 
				 
			
--- a/BiddingKG/dl/ratio/test_re_ratio.py
+++ b/BiddingKG/dl/ratio/test_re_ratio.py
@@ -22,8 +22,8 @@ def test_csv(_path):
 
				     predict_list_2 = []
			
 
				     predict_list_3 = []
			
 
				     for index, row in df.iterrows():
			
 
				-        if index >= 1000:
			
 
				-            break
			
 
				+        # if index >= 1000:
			
 
				+        #     break
			
 
				 
			
 
				         if index % 50 == 0:
			
 
				             print("="*30, "Loop", index, time.time()-start_time, "="*30)
			
@@ -58,4 +58,5 @@ def test_csv(_path):
 
				 if __name__ == "__main__":
			
 
				     # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
			
 
				     path = '比率_result.csv'
			
 
				+    # path = '总价单价_result.csv'
			
 
				     test_csv(path)