Ver código fonte

更新服务期限概率

fangjiasheng 3 anos atrás
pai
commit
517dd4d3a7

+ 2 - 1
BiddingKG/dl/interface/Entitys.py

@@ -155,7 +155,7 @@ class Entity():
     @summary:实体类
     '''
     
-    def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None,in_attachment=False):
+    def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None,in_attachment=False, prob=0):
         '''
         @param:
             doc_id:文章的uuid
@@ -203,6 +203,7 @@ class Entity():
         self.pointer_ratio = None  # 2022/01/05 新增,中标人对应链接"中投标金额->费率、下浮率"
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
         self.in_attachment = in_attachment  # 2022/02/10添加,实体是否在附件中
+        self.prob = prob  # 2022/06/20添加,实体的概率
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)

+ 1 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -2600,7 +2600,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = servicetime['body']
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment, prob=servicetime["prob"]))
 
             # 招标方式提取 2020/12/30 新增
             # list_bidway = extract_bidway(sentence_text, )

+ 19 - 60
BiddingKG/dl/time/re_servicetime.py

@@ -156,7 +156,7 @@ reg_right_unit = re.compile(u'[-.年月日号天~~至到—/]')
 reg_error = re.compile(u'公告|发布|中')
 
 
-def re_serviceTime(text):
+def re_service_time(text):
     if TEST_MODE:
         # print(chardet.detect(text))
         text = re.sub("\s*", "", text)
@@ -186,46 +186,53 @@ def re_serviceTime(text):
             print("input_str", input_str)
 
         # 匹配
-        output_list, text_index_list = re_findAllResult(reg_wuye, input_str)
+        output_list, text_index_list = re_find_all_result(reg_wuye, input_str)
         if TEST_MODE:
             print("output_str, text_index reg_wuye", output_list, text_index_list)
         output_list, text_index_list = filter_service_time(output_list, text_index_list)
+        prob = 0.9
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_findAllResult(reg2, input_str)
+            output_list, text_index_list = re_find_all_result(reg2, input_str)
             if TEST_MODE:
                 print("output_str, text_index reg2", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_findAllResult(reg, input_str)
+            output_list, text_index_list = re_find_all_result(reg, input_str)
             if TEST_MODE:
                 print("output_str, text_index reg", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_findAllResult(reg1, input_str)
+            output_list, text_index_list = re_find_all_result(reg1, input_str)
             if TEST_MODE:
                 print("output_str, text_index reg1", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_findAllResult(reg3, input_str)
+            output_list, text_index_list = re_find_all_result(reg3, input_str)
             if TEST_MODE:
                 print("output_str, text_index reg3", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_findAllResult(reg4, input_str)
+            output_list, text_index_list = re_find_all_result(reg4, input_str)
             if TEST_MODE:
                 print("output_str, text_index reg4", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            prob = 0.5
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_findAllResult(reg5, input_str)
+            output_list, text_index_list = re_find_all_result(reg5, input_str)
             if TEST_MODE:
                 print("output_str, text_index reg5", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
+            prob = 0.8
 
         # 添加
         all_output_list += output_list
@@ -240,7 +247,7 @@ def re_serviceTime(text):
 
     if TEST_MODE:
         print("index2word all_text_index_list", index2word, all_text_index_list)
-    return index2word, all_text_index_list
+    return index2word, all_text_index_list, prob
 
 
 def filter_service_time(output_list, text_index_list):
@@ -282,7 +289,7 @@ def filter_service_time(output_list, text_index_list):
     return output_list, text_index_list
 
 
-def re_findAllResult(reg, input, unit="", index=0):
+def re_find_all_result(reg, input, unit="", index=0):
     """
 
     :param reg: 正则表达式
@@ -296,61 +303,13 @@ def re_findAllResult(reg, input, unit="", index=0):
     match1 = re.finditer(reg, input)
     output_list = []
     for i in match1:
-        output = ""
         d = i.groupdict()
-        if d.get("before"):
-            output += d.get("before")
-        if d.get("before3"):
-            output += d.get("before3")
-        if d.get("before7"):
-            output += d.get("before7")
-        if d.get("charac"):
-            output += d.get("charac")
-        if d.get("before2"):
-            output += d.get("before2")
-        if d.get("before4"):
-            output += d.get("before4")
-        if d.get("before5"):
-            output += d.get("before5")
-        if d.get("before6"):
-            output += d.get("before6")
-        if d.get("center"):
-            output += d.get("center")
-        if d.get("number"):
-            output += d.get("number")
-        if d.get("after"):
-            output += d.get("after")
-        if d.get("after1"):
-            output += d.get("after1")
-        if d.get("after2"):
-            output += d.get("after2")
-        if d.get("after4"):
-            output += d.get("after4")
-        if d.get("after3"):
-            output += d.get("after3")
 
         if TEST_MODE:
             for key in d.keys():
                 if d.get(key):
                     print('d.get("' + key + '")', d.get(key))
 
-        # if d.get("before") is not None:
-        #     if d.get("before3") is None or d.get("before3") != "":
-        #         front_len = len(d.get("before"))
-        #         # print("1-", len(d.get("before")))
-        #     else:
-        #         front_len = len(d.get("before")) + len(d.get("charac"))
-        #         # print("2-", len(d.get("before")), len(d.get("charac")))
-        #         if d.get("before2") is not None:
-        #             front_len += len(d.get("before2"))
-        #     if d.get("before4") is not None:
-        #         front_len += len(d.get("before4"))
-        # else:
-        #     if d.get("before2") is not None:
-        #         front_len = len(d.get("before2"))
-        #     else:
-        #         front_len = 0
-
         front_len = 0
         for key in d.keys():
             if d.get(key) and key in ["before", "before2", "before4",
@@ -379,10 +338,10 @@ def calculateLen(ss, i):
 
 def extract_servicetime(text):
     list_servicetime = []
-    word_list, text_index_list = re_serviceTime(text)
+    word_list, text_index_list, prob = re_service_time(text)
     # print(word, text_index_list)
     for i in range(len(text_index_list)):
-        d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1]}
+        d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1], "prob": prob}
         if len(word_list[i]) <= 35:
             list_servicetime.append(d)
     if TEST_MODE: