Explorar o código

优化招标人字典识别规则,优化比率单价总价

fangjiasheng hai 3 anos
pai
achega
07e9a9878b

+ 20 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -62,6 +62,15 @@ def link_entitys(list_entitys,on_value=0.8):
                         if len(_ent.entity_text)>len(_entity.entity_text):
                             _entity.entity_text = _ent.entity_text
 
+        # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
+        for _entity in range_entity:
+            for _ent in _entity.linked_entitys:
+                print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
+                if re.search("公司$", _ent.entity_text) is not None \
+                        and _ent.if_dict_match == 1:
+                    if len(_ent.entity_text) > len(_entity.entity_text):
+                        _entity.entity_text = _ent.entity_text
+
 
 def getEnterprisePath():
     filename = "LEGAL_ENTERPRISE.txt"
@@ -146,6 +155,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
         for p_sentence in list_sentence:
             sentence = p_sentence.sentence_text
             list_match = match_enterprise_max_first(sentence)
+            print("list_match", list_match)
 
             doc_id = p_sentence.doc_id
             sentence_index = p_sentence.sentence_index
@@ -164,10 +174,14 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                         find_flag = True
                         p_entity.entity_type = "company"
+                        p_entity.if_dict_match = 1
 
                     if p_entity.entity_type not in ["location","org","company"]:
                         continue
 
+                    if _match["entity_text"] == p_entity.entity_text:
+                        p_entity.if_dict_match = 1
+
                     #有重叠
                     #match部分被包含则不处理
                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
@@ -189,6 +203,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.wordOffset_end = _match["end_index"]
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
+                            # 该公司实体是字典识别的
+                            p_entity.if_dict_match = 1
 
                             for _match_h in range(_match_index+1,_match_j+1):
                                 entity_text = list_match[_match_h]["entity_text"]
@@ -198,6 +214,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                 add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
+                                add_entity.if_dict_match = 1
                                 list_entity.append(add_entity)
 
                                 range_entity.append(add_entity)
@@ -225,6 +242,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                     p_entity.wordOffset_end = _match["end_index"]
                                     p_entity.begin_index = begin_index
                                     p_entity.end_index = end_index
+                                    p_entity.if_dict_match = 1
                         elif _match["end_index"]>=p_entity.wordOffset_end:
                             match_replace = True
                             begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
@@ -236,6 +254,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
                             p_entity.entity_type = "company"
+                            p_entity.if_dict_match = 1
                     elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                         find_flag = True
                         if p_entity.entity_type in ("org","company"):
@@ -248,12 +267,12 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                             p_entity.wordOffset_end = _match["end_index"]
                             p_entity.begin_index = begin_index
                             p_entity.end_index = end_index
+                            p_entity.if_dict_match = 1
                 if not find_flag:
                     match_add = True
                     entity_text = _match["entity_text"]
                     entity_type = "company"
 
-
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                     end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)

+ 1 - 0
BiddingKG/dl/interface/Entitys.py

@@ -169,6 +169,7 @@ class Entity():
         self.is_tail = False
         self.notes = ''  # 2021/7/20 新增,保存金额大小写,单位等备注
         self.money_unit = '' #2021/8/17 新增,保存金额单位 元、万元 、亿元
+        self.if_dict_match = 0 # 2021/12/21 新增,判断公司实体是否由字典识别得到
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 4 - 1
BiddingKG/dl/money/re_money_total_unit.py

@@ -36,7 +36,10 @@ def re_standard_unit(_str):
             keyword_index = [m_span[0], m_span[1]]
             keyword = m_dict.get("value")
             # unit_money_list.append([keyword, keyword_index])
-            unit_money_list.append([keyword, keyword_index, _str])
+
+            # 上下文有招标文件的不算
+            if '文件' not in _str:
+                unit_money_list.append([keyword, keyword_index, _str])
 
     return unit_money_list
 

+ 10 - 2
BiddingKG/dl/ratio/re_ratio.py

@@ -1,6 +1,13 @@
 import re
 
-ratio = '((上浮|下浮)(率|).{0,2}[0-9.]+%)'
+ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
+
+# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
+# 基准利率上浮率:百分之三十(30%)
+# 租金上浮率
+# 上浮率活期20%
+# 上浮率:活期20%、一年定期35%
+# 下浮率报价0.5%
 
 
 def re_standard_ratio(_str):
@@ -37,7 +44,8 @@ def extract_ratio(text):
 
 def test_str():
     s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
-    s = '年利率较基准利率的上浮率:30% 活期存款年利率:0.455% 协定存'
+    s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
+        ' 上浮率....  上浮率30(%)  (下浮率%):43  下浮率报价0.5%'
     print(extract_ratio(s))
 
 

+ 3 - 2
BiddingKG/dl/ratio/test_re_ratio.py

@@ -22,8 +22,8 @@ def test_csv(_path):
     predict_list_2 = []
     predict_list_3 = []
     for index, row in df.iterrows():
-        if index >= 1000:
-            break
+        # if index >= 1000:
+        #     break
 
         if index % 50 == 0:
             print("="*30, "Loop", index, time.time()-start_time, "="*30)
@@ -58,4 +58,5 @@ def test_csv(_path):
 if __name__ == "__main__":
     # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
     path = '比率_result.csv'
+    # path = '总价单价_result.csv'
     test_csv(path)