
Integrate the time-classification, funding-source, bidding-method, and service-term elements into the entity classes

bidi, 4 years ago
parent commit 100a9c905a

+ 7 - 5
BiddingKG/dl/bidway/re_bidway.py

@@ -215,13 +215,15 @@ def calculateLen(ss, i):
 def extract_bidway(text):
     list_bidway = []
     word, text_index_list = re_bidway(text)
-    d = {"body": word, "begin_index": text_index_list[0], "end_index": text_index_list[1]}
-    list_bidway.append(d)
-    print(d)
+    if word is not None:
+        d = {"body": word, "begin_index": text_index_list[0], "end_index": text_index_list[1]}
+        list_bidway.append(d)
+    # print(d)
     return list_bidway
 
 
 if __name__ == "__main__":
-    df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
-    s = df["text"].iloc[1]
+    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
+    # s = df["text"].iloc[1]
+    s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
     extract_bidway(s)
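
With this guard, text that contains no bidding-method phrase now yields an empty list instead of crashing on text_index_list[0]. A minimal sketch of the expected return shape for the new sample sentence (offsets are illustrative, assuming re_bidway matches 公开招标):

    >>> extract_bidway('政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区')
    [{'body': '公开招标', 'begin_index': 11, 'end_index': 15}]   # offsets illustrative
    >>> extract_bidway('文中没有招标方式')
    []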

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -12,7 +12,7 @@ import os
 
 from threading import RLock
 
-from pai_tf_predict_proto import tf_predict_pb2
+# from pai_tf_predict_proto import tf_predict_pb2
 import requests
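
Disabling the import outright will raise NameError in any Utils code path that still references tf_predict_pb2. If the PAI proto package is only needed for remote prediction, a guarded import is a safer pattern (a sketch, not part of this commit):

    try:
        from pai_tf_predict_proto import tf_predict_pb2  # optional, only for PAI online prediction
    except ImportError:
        tf_predict_pb2 = None  # remote-prediction callers must check for None first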
 
 

+ 59 - 22
BiddingKG/dl/complaint/punish_rule.py

@@ -3,6 +3,7 @@
 # @Author  : bidikeji
 # @Time    : 2020/12/24 0024 15:23
 import re
+import os
 import time
 import tensorflow as tf
 from BiddingKG.dl.common.Utils import *
@@ -85,7 +86,8 @@ def decode(logits, trans, sequence_lengths, tag_num):
     return viterbi_sequences
 
 class Punish_Extract():
-    def __init__(self, model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"):
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/21-0.9990081295021194-0.3647936/model.ckpt"):
+        print('model_file_path:',model_file)
         self.sess = tf.Session(graph=tf.Graph())
         self.code = ""
         self.punish_dicition = ""
@@ -325,7 +327,7 @@ class Punish_Extract():
         ins = ""
         ptime = ""
         # if the earlier steps found no punishing institution, look for entities in the title and regex-check for keywords
-        if institutions == []:
+        if institutions == [] and len(title)>10:
             title_ners = getNers([title], useselffool=True)
             if title_ners[0]:
                 for title_ner in title_ners[0]:
@@ -426,22 +428,56 @@ class Punish_Extract():
         punishPeople = set([it.entity_text for l in punishPeople for it in l])
         return ';'.join(complainants), ';'.join(punishPeople)
 
-def get_punish_extracts(doc_id=' ', title=' ', text=' '):
-    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
-                                                                                    useselffool=True)
-    punish_code = punish.predict_punishCode(list_sentences)
-    # print('处罚编号: ',punish_code)
-    institutions, punishTimes = punish.get_institution(title, list_sentences[0], list_entitys[0])
-    # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-    keyword, punishType = punish.get_punishType(title, text)
-    # print('处罚类型:',punishType)
-    punishDecision = punish.get_punishDecision(text, punishType)
-    # print('处罚决定:',punishDecision)
-    punishWhether= punish.get_punishWhether(punishDecision, text, punishType)
-    # print('投诉是否成立:',punishWhether)
-    complainants, punishPeople = punish.get_complainant(punishType, list_sentences[0], list_entitys[0])
-    # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-    return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether,institutions, punishTimes
+    def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
+        list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
+                                                                                        useselffool=True)
+        punish_code = self.predict_punishCode(list_sentences)
+        # print('处罚编号: ',punish_code)
+        institutions, punishTimes = self.get_institution(title, list_sentences[0], list_entitys[0])
+        # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+        keyword, punishType = self.get_punishType(title, text)
+        # print('处罚类型:',punishType)
+        punishDecision = self.get_punishDecision(text, punishType)
+        # print('处罚决定:',punishDecision)
+        punishWhether = self.get_punishWhether(punishDecision, text, punishType)
+        # print('投诉是否成立:',punishWhether)
+        complainants, punishPeople = self.get_complainant(punishType, list_sentences[0], list_entitys[0])
+        # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+        punish_dic = {'punish_code': punish_code,
+                      'punishType': punishType,
+                      'punishDecision': punishDecision,
+                      'complainants': complainants,
+                      'punishPeople': punishPeople,
+                      'punishWhether': punishWhether,
+                      'institutions': institutions,
+                      'punishTimes': punishTimes}
+        return punish_dic
+        # return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether,institutions, punishTimes
+
+    def get_punish_extracts(self,list_sentences, list_entitys, title=' ', text=' '):
+        keyword, punishType = self.get_punishType(title, text)
+        if punishType == "未知类别":
+            return {'punishType':punishType}
+        # print('处罚类型:',punishType)
+        punish_code = self.predict_punishCode(list_sentences)
+        # print('处罚编号: ',punish_code)
+        institutions, punishTimes = self.get_institution(title, list_sentences[0], list_entitys[0])
+        # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+        punishDecision = self.get_punishDecision(text, punishType)
+        # print('处罚决定:',punishDecision)
+        punishWhether = self.get_punishWhether(punishDecision, text, punishType)
+        # print('投诉是否成立:',punishWhether)
+        complainants, punishPeople = self.get_complainant(punishType, list_sentences[0], list_entitys[0])
+        # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+        punish_dic = {'punish_code': punish_code,
+                      'punishType': punishType,
+                      'punishDecision': punishDecision,
+                      'complainants': complainants,
+                      'punishPeople': punishPeople,
+                      'punishWhether': punishWhether,
+                      'institutions': institutions,
+                      'punishTimes': punishTimes}
+        return punish_dic
 
 if __name__ == "__main__":
     punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
@@ -478,11 +514,12 @@ if __name__ == "__main__":
     #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
     # t3 = time.time()
     # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '厦财企〔2020〕12号,各有关单位:341号。厦财企〔2020〕12号,各有关单位:行政处罚厦建招诉决【2019】342号。行政处罚厦建招诉决【2019】343号。行政处罚厦建招诉决【2019】344号,'
+    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
     # list_sentences = [s.split('。')]
     # punish_code= punish.predict_punishCode( list_sentences)
     # print(punish_code)
 
-    punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-                get_punish_extracts(text=s)
-    print(punish_code)
+    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #             get_punish_extracts(text=s)
+    punish_dic = punish.get_punish_extracts_backup(text=s)
+    print(punish_dic)
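
A minimal usage sketch of the refactored entry point (doc_id, title and text are assumed to be provided by the caller; the preprocessing call mirrors get_punish_extracts_backup above):

    punish = Punish_Extract()
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=text)
    # returns only {'punishType': '未知类别'} when the text is not a punishment notice
    print(punish_dic)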

+ 64 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -13,6 +13,9 @@ from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import *
 from BiddingKG.dl.interface.predictor import *
 from BiddingKG.dl.foolnltk import selffool
+from BiddingKG.dl.money.moneySource.ruleExtra import extract_moneySource
+from BiddingKG.dl.time.re_servicetime import extract_servicetime
+from BiddingKG.dl.bidway.re_bidway import extract_bidway
 
 
 
@@ -1678,6 +1681,67 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 else:
                     index += 1
 
+            # funding-source extraction, added 2020/12/30
+            list_moneySource = extract_moneySource(sentence_text)
+            entity_type = "moneySource"
+            for moneySource in list_moneySource:
+                begin_index_temp = moneySource['begin_index']
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                index = moneySource['end_index']
+                end_index_temp = index
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= index:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = moneySource['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
+
+            # service-term extraction, added 2020/12/30
+            list_servicetime = extract_servicetime(sentence_text)
+            entity_type = "servicetime"
+            for servicetime in list_servicetime:
+                begin_index_temp = servicetime['begin_index']
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j] == begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j] > begin_index_temp:
+                        begin_index = j - 1
+                        break
+                index = servicetime['end_index']
+                end_index_temp = index
+                for j in range(begin_index, len(list_tokenbegin)):
+                    if list_tokenbegin[j] >= index:
+                        end_index = j - 1
+                        break
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = servicetime['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
+
+            # bidding-method extraction, added 2020/12/30
+            list_bidway = extract_bidway(sentence_text)
+            entity_type = "bidway"
+            for bidway in list_bidway:
+                begin_index_temp = bidway['begin_index']
+                end_index_temp = bidway['end_index']
+                begin_index = changeIndexFromWordToWords(tokens, begin_index_temp)
+                end_index = changeIndexFromWordToWords(tokens, end_index_temp)
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                entity_text = bidway['body']
+                list_sentence_entitys.append(
+                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp))
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys
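
The money-source and service-term blocks above duplicate the same character-offset to token-index mapping, and both leave begin_index or end_index unbound when neither loop breaks. A sketch of the shared logic as a single helper with safe defaults (a hypothetical refactor, not part of this commit):

    def char_span_to_token_span(list_tokenbegin, char_begin, char_end):
        # map a character span onto token indices via the tokens' start offsets
        begin_index, end_index = 0, len(list_tokenbegin) - 1   # safe defaults
        for j in range(len(list_tokenbegin)):
            if list_tokenbegin[j] == char_begin:
                begin_index = j
                break
            elif list_tokenbegin[j] > char_begin:
                begin_index = j - 1
                break
        for j in range(begin_index, len(list_tokenbegin)):
            if list_tokenbegin[j] >= char_end:
                end_index = j - 1
                break
        return begin_index, end_index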

+ 85 - 2
BiddingKG/dl/interface/predictor.py

@@ -1101,8 +1101,77 @@ class RoleRulePredictor():
                 # entities in the set that can never be the winning bidder get their label cleared
                 if p_entity.entity_text in self.SET_NOT_TENDERER:
                     p_entity.label=5
-    
-    
+
+# time-category classifier
+class TimePredictor():
+    def __init__(self):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.inputs_code = None
+        self.outputs_code = None
+        self.input_shape = (2,30,60)
+        self.load_model()
+
+    def load_model(self):
+        model_path = os.path.dirname(__file__)+'/timesplit_model'
+        if self.inputs_code is None:
+            log("get model of time")
+            with self.sess.as_default():
+                with self.sess.graph.as_default():
+                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
+                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+                    signature_def = meta_graph_def.signature_def
+                    self.inputs_code = []
+                    self.inputs_code.append(
+                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
+                    self.inputs_code.append(
+                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
+                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
+                    return self.inputs_code, self.outputs_code
+        else:
+            return self.inputs_code, self.outputs_code
+
+    def search_time_data(self,list_sentences,list_entitys):
+        data_x = []
+        points_entitys = []
+        for list_sentence, list_entity in zip(list_sentences, list_entitys):
+            p_entitys = 0
+            p_sentences = 0
+            while(p_entitys<len(list_entity)):
+                entity = list_entity[p_entitys]
+                if entity.entity_type in ['time']:
+                    while(p_sentences<len(list_sentence)):
+                        sentence = list_sentence[p_sentences]
+                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
+                            left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
+                            right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            context = [left, right]
+                            x = embedding_word(context, shape=self.input_shape)
+                            data_x.append(x)
+                            points_entitys.append(entity)
+                            break
+                        p_sentences += 1
+                p_entitys += 1
+        if len(points_entitys)==0:
+            return None
+        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
+        return [data_x, points_entitys]
+
+    def predict(self, list_sentences,list_entitys):
+        datas = self.search_time_data(list_sentences, list_entitys)
+        if datas is None:
+            return
+        points_entitys = datas[1]
+        with self.sess.as_default():
+            predict_y = self.sess.run(self.outputs_code, feed_dict={self.inputs_code[0]:datas[0][0]
+                ,self.inputs_code[1]:datas[0][1]})
+            for i in range(len(predict_y)):
+                entity = points_entitys[i]
+                label = np.argmax(predict_y[i])
+                values = [item for item in predict_y[i]]
+                # attach the label once per entity, after all probabilities are collected
+                entity.set_Role(label, values)
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1367,6 +1436,19 @@ def save_codesplit_model():
                                            "input2":model_code.input[2]},
                                    outputs={"outputs":model_code.output})
 
+def save_timesplit_model():
+    filepath = '../time/model_label_time_classify.model.hdf5'
+    with tf.Graph().as_default() as graph:
+        time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+        with tf.Session() as sess:
+            sess.run(tf.global_variables_initializer())
+            h5_to_graph(sess, graph, filepath)
+            tf.saved_model.simple_save(sess,
+                                       "./timesplit_model/",
+                                       inputs={"input0":time_model.input[0],
+                                               "input1":time_model.input[1]},
+                                       outputs={"outputs":time_model.output})
+
 if __name__=="__main__":
     #save_role_model()
     #save_codename_model()
@@ -1374,6 +1456,7 @@ if __name__=="__main__":
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
+    save_timesplit_model()
     '''
     with tf.Session(graph=tf.Graph()) as sess:
         from tensorflow.python.saved_model import tag_constants
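
A minimal driving sketch for the new TimePredictor (list_sentences and list_entitys are assumed to come from Preprocessing.get_preprocessed; predict() annotates each time entity in place through set_Role rather than returning results):

    predictor = TimePredictor()   # loads ./timesplit_model via tf.saved_model
    predictor.predict(list_sentences, list_entitys)
    for list_entity in list_entitys:
        for entity in list_entity:
            if entity.entity_type == 'time':
                print(entity.entity_text)   # label/probabilities were attached by set_Role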

+ 1 - 1
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -154,7 +154,7 @@ def extract_moneySource(text):
         _moneySource['body'] = entity_text
         _moneySource['begin_index'] = wordOffset_begin
         _moneySource['end_index'] = wordOffset_end
-        print(_moneySource)
+        # print(_moneySource)
         list_moneySource.append(_moneySource)
     return list_moneySource
 

+ 3 - 3
BiddingKG/dl/time/re_servicetime.py

@@ -142,13 +142,13 @@ def extract_servicetime(text):
         word_list = word.split(" ")
         d = {"body": word_list[i], "begin_index": text_index_list[i][0], "end_index": text_index_list[i][1]}
         list_servicetime.append(d)
-    print(list_servicetime)
+    # print(list_servicetime)
     return list_servicetime
 
 
 if __name__ == '__main__':
-    df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
-    s = df["text"].iloc[5]
+    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
+    # s = df["text"].iloc[5]
     s = u'''
     ,大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
     中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:

+ 14 - 1
BiddingKG/dl/time/train_2.py

@@ -258,8 +258,21 @@ def plot_loss(history):
 if __name__ == '__main__':
     # getModel()
     # getModel_center()
-    training()
+    # training()
     # training_center()
     # predict()
     # predict_center()
+    model1 = models.load_model("model_label_time_classify.model.hdf5",
+                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+    test_x = []
+    test_y = []
+    left = '8675.20元人民币,(3)服务期限:'
+    right = '(4)质量:符合竞争性磋商文件规定的质'
+    context = [left, right]
+    x = embedding_word(context, shape=input_shape)
+    test_x.append(x)
+    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+    pre_y = model1.predict([test_x[0],test_x[1]])
+    rs = [np.argmax(item) for item in pre_y]
+    print(pre_y, rs)
     pass