Files changed

Merge branch 'master' of http://192.168.2.65:3000/BIDI-ML/BIDI_ML_INFO_EXTRACTION into master

 Conflicts:
	BiddingKG.iml
	BiddingKG/dl/interface/modelFactory.py
	BiddingKG/dl/test/test_model_fjs.py
Jiasheng 4 years ago
parent
commit
5f26efa5b8

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.5 (dl_nlp)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (py3.5)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 3 - 2
BiddingKG.iml

@@ -2,12 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="" />
+      <configuration sdkName="Python 3.5 (dl_nlp)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.5 (py35)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.5 (py3.5)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>
 </module>

+ 40 - 0
BiddingKG/dl/common/Utils.py

@@ -9,6 +9,7 @@ import re
 import gensim
 from keras import backend as K
 import os
+import time
 
 from threading import RLock
 
@@ -569,6 +570,45 @@ def fitDataByRule(data):
     result = re.sub("[。]","",result)
     return  result
 
+time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+def timeFormat(_time):
+    current_year = time.strftime("%Y",time.localtime())
+    all_match = re.finditer(time_format_pattern,_time)
+    for _match in all_match:
+        if len(_match.group())>0:
+            legal = True
+            year = ""
+            month = ""
+            day = ""
+            for k,v in _match.groupdict().items():
+                if k=="year":
+                    year = v
+                if k=="month":
+                    month = v
+                if k=="day":
+                    day = v
+            if year!="":
+                if len(year)==2:
+                    year = "20"+year
+                if int(year)>int(current_year):
+                    legal = False
+            else:
+                legal = False
+            if month!="":
+                if int(month)>12:
+                    legal = False
+            else:
+                legal = False
+            if day!="":
+                if int(day)>31:
+                    legal = False
+            else:
+                legal = False
+            if legal:
+                return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
+    return ""
+
+
 def embedding(datas,shape):
     '''
     @summary: look up the word vectors for the given tokens

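The new timeFormat helper normalizes free-form dates (2- or 4-digit years; -, /, ., 年/月 separators) to YYYY-MM-DD and returns "" for illegal or future dates. A standalone sanity check of that contract (it assumes only that the BiddingKG package is importable; the expected values follow from the hunk above):

from BiddingKG.dl.common.Utils import timeFormat  # module path as committed above

assert timeFormat("开标时间:2021年3月5日") == "2021-03-05"   # month and day zero-padded
assert timeFormat("2020/12/30采购") == "2020-12-30"           # -, /, . and 年/月 all accepted
assert timeFormat("截止 20.1.9") == "2020-01-09"              # two-digit years get a "20" prefix
assert timeFormat("时间:2020年13月1日") == ""                 # month > 12 (or day > 31) is rejected
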
+ 30 - 24
BiddingKG/dl/complaint/punish_rule.py

@@ -454,30 +454,36 @@ class Punish_Extract():
         return punish_dic
         # return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether,institutions, punishTimes
 
-    def get_punish_extracts(self,list_sentences, list_entitys, title=' ', text=' '):
-        keyword, punishType = self.get_punishType(title, text)
-        if punishType == "未知类别":
-            return {'punishType':punishType}
-        # print('处罚类型:',punishType)
-        punish_code = self.predict_punishCode(list_sentences)
-        # print('处罚编号: ',punish_code)
-        institutions, punishTimes = self.get_institution(title, list_sentences[0], list_entitys[0])
-        # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-        punishDecision = self.get_punishDecision(text, punishType)
-        # print('处罚决定:',punishDecision)
-        punishWhether= self.get_punishWhether(punishDecision, text, punishType)
-        # print('投诉是否成立:',punishWhether)
-        complainants, punishPeople = self.get_complainant(punishType, list_sentences[0], list_entitys[0])
-        # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-        punish_dic = {'punish_code':punish_code,
-                      'punishType':punishType,
-                      'punishDecision':punishDecision,
-                     'complainants':complainants,
-                     'punishPeople':punishPeople,
-                     'punishWhether':punishWhether,
-                     'institutions':institutions,
-                     'punishTimes':punishTimes}
-        return punish_dic
+    def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
+        list_result = []
+        for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+            title = article.title
+            text=article.content
+            keyword, punishType = self.get_punishType(title, text)
+            if punishType == "未知类别":
+                list_result.append({"punish":{}})
+            else:
+                # print('处罚类型:',punishType)
+                punish_code = self.predict_punishCode(list_sentences)
+                # print('处罚编号: ',punish_code)
+                institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+                # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+                punishDecision = self.get_punishDecision(text, punishType)
+                # print('处罚决定:',punishDecision)
+                punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+                # print('投诉是否成立:',punishWhether)
+                complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+                # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+                punish_dic = {'punish_code':punish_code,
+                              'punishType':punishType,
+                              'punishDecision':punishDecision,
+                             'complainants':complainants,
+                             'punishPeople':punishPeople,
+                             'punishWhether':punishWhether,
+                             'institutions':institutions,
+                             'punishTimes':punishTimes}
+                list_result.append({"punish":punish_dic})
+        return list_result
 
 if __name__ == "__main__":
     punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")

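get_punish_extracts switches from one document plus title/text keyword arguments to the three aligned per-article lists, reading title and content from each Article; it now returns a list of {"punish": ...} dicts, with an empty inner dict when the classifier answers 未知类别 (unknown category). A sketch of the new call shape and return value, with placeholder strings lifted from the old test text in test4.py:

# Signature change (old per-document call shown for contrast; running either
# needs the models and the preprocessing pipeline, as in test4.py below):
#   old: punish.get_punish_extracts(list_sentences, list_entitys, title='投诉处理', text=text)
#   new: punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
# One entry per article comes back; a placeholder result and its handling:
list_result = [{"punish": {"punishType": "投诉处理", "punish_code": "京财采投字(2018)第42号"}},
               {"punish": {}}]  # second article classified 未知类别
for doc in list_result:
    print(doc["punish"].get("punishType", "未知类别"))
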
+ 3 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1722,7 +1722,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             # money-source extraction, added 2020/12/30
             list_moneySource = extract_moneySource(sentence_text)
-            entity_type = "moneySource"
+            entity_type = "moneysource"
             for moneySource in list_moneySource:
                 begin_index_temp = moneySource['begin_index']
                 for j in range(len(list_tokenbegin)):
@@ -1746,7 +1746,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             # service-period extraction, added 2020/12/30
             list_servicetime = extract_servicetime(sentence_text)
-            entity_type = "servicetime"
+            entity_type = "serviceTime"
             for servicetime in list_servicetime:
                 begin_index_temp = servicetime['begin_index']
                 for j in range(len(list_tokenbegin)):
@@ -1798,8 +1798,7 @@ def union_result(codeName,prem):
     result = []
     assert len(codeName)==len(prem)
     for item_code,item_prem in zip(codeName,prem):
-        if item_code[0]==item_prem[0]:
-            result.append([item_code[0],dict(item_code[1],**item_prem[1])])
+        result.append(dict(item_code,**item_prem))
     return result
 
 def persistenceData(data):

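union_result now assumes both inputs are flat per-document dicts and merges them directly, instead of matching [docid, payload] pairs; dict(a, **b) performs a shallow union in which the right-hand keys win on collision. A minimal runnable illustration (field names taken from elsewhere in this diff):

import json

item_code = {"code": ["CG-202011-030-001"], "name": "服务器硬件设备更新"}
item_prem = {"docid": "12", "prem": {}}
print(json.dumps(dict(item_code, **item_prem), ensure_ascii=False))
# {"code": ["CG-202011-030-001"], "name": "服务器硬件设备更新", "docid": "12", "prem": {}}
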
BIN
BiddingKG/dl/interface/codename_classlabels.pk


BIN
BiddingKG/dl/interface/codename_savedmodel_tf/saved_model.pb


BIN
BiddingKG/dl/interface/codename_savedmodel_tf/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/codename_savedmodel_tf/variables/variables.index


+ 30 - 6
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,6 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 import re
 import copy
@@ -1137,8 +1137,6 @@ def getPackageRoleMoney(list_sentence,list_entity):
         list_entity: the article's entity list
     @return: the article's package - section number - role - entity name - money - contact - phone
     '''
-    from BiddingKG.dl.common.Utils import save
-    save([list_sentence,list_entity],"list_sentence_entity.pk")
     # print("=1")
     theRole = getRoleList(list_sentence,list_entity)
     if not theRole:
@@ -1154,7 +1152,34 @@ def getPackageRoleMoney(list_sentence,list_entity):
     PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity)
     # print("=4")
     return PackDict
-       
+
+def getOtherAttributes(list_entity):
+    dict_other = {"bidway":"",
+                  "moneysource":"",
+                  "person_review":[],
+                  "time_release":"",
+                  "time_bidopen":"",
+                  "time_bidclose":"",
+                  "serviceTime":"",
+                  "product":[]}
+    for entity in list_entity:
+        if entity.entity_type == 'bidway':
+            dict_other["bidway"] = entity.entity_text
+        elif entity.entity_type=='moneysource':
+            dict_other["moneysource"] = entity.entity_text
+        elif entity.entity_type=='serviceTime':
+            dict_other["serviceTime"] = entity.entity_text
+        elif entity.entity_type == 'time' and entity.label==1:
+            dict_other["time_release"] = timeFormat(entity.entity_text)
+        elif entity.entity_type == 'time' and entity.label==2:
+            dict_other["time_bidopen"] = timeFormat(entity.entity_text)
+        elif entity.entity_type == 'time' and entity.label == 3:
+            dict_other["time_bidclose"] = timeFormat(entity.entity_text)
+        elif entity.entity_type=="person" and entity.label ==4:
+            dict_other["person_review"].append(entity.entity_text)
+    return dict_other
+
+
 def getPREMs(list_sentences,list_entitys,list_articles):
     '''
     @param:
@@ -1164,9 +1189,8 @@ def getPREMs(list_sentences,list_entitys,list_articles):
     '''
     result = []
     for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
-        
         RoleList = getPackageRoleMoney(list_sentence,list_entity)
-        result.append([list_article.id,{"prem":RoleList}])
+        result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity)))
     return result
 
 

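getOtherAttributes reduces the typed entity stream to one flat dict per article; time entities are routed by label (1 = time_release, 2 = time_bidopen, 3 = time_bidclose) through the new timeFormat. A sketch with stand-in objects carrying only the three attributes the function reads (real Entity objects come from Preprocessing; running this assumes the package is importable):

from types import SimpleNamespace as E
from BiddingKG.dl.interface.getAttributes import getOtherAttributes

entities = [E(entity_type="bidway", label=0, entity_text="公开招标"),
            E(entity_type="time",   label=2, entity_text="2020年11月30日"),
            E(entity_type="person", label=4, entity_text="殷志超")]
print(getOtherAttributes(entities))
# -> {'bidway': '公开招标', 'time_bidopen': '2020-11-30', 'person_review': ['殷志超'], ...}
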
+ 8 - 7
BiddingKG/dl/interface/modelFactory.py

@@ -159,7 +159,7 @@ class Model_person_classify():
     def __init__(self,lazyLoad=getLazyLoad()):
         if USE_PAI_EAS:
             lazyLoad = True
-        self.model_person_file = os.path.dirname(__file__)+"/../person/models/model_person_classify_fjs.model.hdf5"
+        self.model_person_file = os.path.dirname(__file__)+"/../person/models/model_person.model.hdf5"
         self.model_person = None
         self.sess_person = tf.Session(graph=tf.Graph())
         if not lazyLoad:
@@ -183,11 +183,11 @@ class Model_person_classify():
       return self.model_person
             
       
-    '''
-      if self.model_person is None:
-          self.model_person = models.load_model(self.model_person_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-      return self.model_person
-    '''
+      '''
+        if self.model_person is None:
+            self.model_person = models.load_model(self.model_person_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+        return self.model_person
+      '''
     '''
     def load_weights(self):
         model = self.getModel()
@@ -195,8 +195,9 @@ class Model_person_classify():
     '''
     
     def encode(self,tokens,begin_index,end_index,**kwargs):
+        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=10),shape=(2,10,128))
         return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
-    
+
     def predict(self,x):
         x = np.transpose(np.array(x),(1,0,2,3))
         

BIN
BiddingKG/dl/interface/person_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/person_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/person_savedmodel/variables/variables.index


BIN
BiddingKG/dl/interface/person_savedmodel_backup/saved_model.pb


BIN
BiddingKG/dl/interface/person_savedmodel_backup/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/person_savedmodel_backup/variables/variables.index


+ 164 - 63
BiddingKG/dl/interface/predictor.py

@@ -74,8 +74,8 @@ class CodeNamePredict():
         id_PN_B = self.class_labels.index("PN_B")
         id_PN_M = self.class_labels.index("PN_M")
         id_PN_E = self.class_labels.index("PN_E")
-        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"+"+str(id_PC_E)+"?")
-        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"+"+str(id_PN_E)+"?")
+        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
+        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
         print("pc",self.PC_pattern)
         print("pn",self.PN_pattern)
         self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
@@ -100,14 +100,18 @@ class CodeNamePredict():
             log("get model of codename")
             with self.sess_codename.as_default():
                 with self.sess_codename.graph.as_default():
-                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel")
+                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
                     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                     signature_def = meta_graph_def.signature_def
                     self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
-                    self.outputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
-                return self.inputs,self.outputs
+                    self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
+                    self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
+                    self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
+                    self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
+
+                return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
         else:
-            return self.inputs,self.outputs
+            return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
         '''    
         if self.model is None:
             self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
@@ -198,7 +202,14 @@ class CodeNamePredict():
                 else:
                     result = symbol_dict.get(rightfinds[0])+data
         return  result
-    
+
+    def decode(self,logits, trans, sequence_lengths, tag_num):
+        viterbi_sequences = []
+        for logit, length in zip(logits, sequence_lengths):
+            score = logit[:length]
+            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+            viterbi_sequences.append(viterbi_seq)
+        return viterbi_sequences
     
     def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
         #@summary: get the code and name of each article
@@ -228,7 +239,7 @@ class CodeNamePredict():
             list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
             _begin_index = 0
             
-            item = [doc_id,{"code":[],"name":""}]
+            item = {"code":[],"name":""}
             code_set = set()
             dict_name_freq_score = dict()
             while(True):
@@ -237,10 +248,13 @@ class CodeNamePredict():
                     MAX_LEN = MAX_AREA
                 _LEN = MAX_AREA//MAX_LEN
                 #predict
-                x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
+
+                # x = [[self.word2index.get(word,index_pad)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
+                x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
+                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                 x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
+
                 if USE_PAI_EAS:
-                    
                     request = tf_predict_pb2.PredictRequest()
                     request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
                     request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
@@ -256,15 +270,20 @@ class CodeNamePredict():
                             predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
                 else:
                     with self.sess_codename.as_default():
-                        t_input,t_output = self.getModel()
-                        predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
+                        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
+                        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
+                                                                                              t_input_length:x_len,
+                                                                                              t_keepprob:1.0})
+                        predict_y = self.decode(_logits,_trans,x_len,7)
+                        # print('==========',_logits)
+
                         '''
                         for item11 in np.argmax(predict_y,-1):
                             print(item11)
                         print(predict_y)
                         '''
                 # print(predict_y)
-                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.argmax(predict_y,-1)):
+                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
                     pad_sentence = sentence.sentence_text[:MAX_LEN]
                     join_predict = "".join([str(s) for s in predict])
                     # print(pad_sentence)
@@ -323,7 +342,7 @@ class CodeNamePredict():
 
                                 if the_code not in code_set:
                                     code_set.add(the_code)
-                                    item[1]['code'] = list(code_set)
+                                    item['code'] = list(code_set)
                     for iter in re.finditer(self.PN_pattern,join_predict):
                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
 
@@ -352,27 +371,28 @@ class CodeNamePredict():
             list_name_freq_score = []
 
             # 2020/11/23 rule adjustment for major sites
-            name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
-            for sentence in list_sentence:
-                # pad_sentence = sentence.sentence_text
-                othername = re.search(name_re1, sentence.sentence_text)
-                if othername != None:
-                    project_name = othername.group(3)
-                    beg = find_index([project_name], sentence.sentence_text)[0]
-                    end = beg + len(project_name)
-                    _name = self.fitDataByRule(sentence.sentence_text[beg:end])
-                    # add name to entitys
-                    _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
-                    sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
-                                     entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
-                                     end_index=0, wordOffset_begin=beg, wordOffset_end=end)
-                    list_entity.append(_entity)
-                    w = 1
-                    if _name not in dict_name_freq_score:
-                        # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
-                        dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
-                    else:
-                        dict_name_freq_score[_name][0] += 1
+            if len(dict_name_freq_score) == 0:
+                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
+                for sentence in list_sentence:
+                    # pad_sentence = sentence.sentence_text
+                    othername = re.search(name_re1, sentence.sentence_text)
+                    if othername != None:
+                        project_name = othername.group(3)
+                        beg = find_index([project_name], sentence.sentence_text)[0]
+                        end = beg + len(project_name)
+                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
+                        # add name to entitys
+                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                        sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
+                                         entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
+                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end)
+                        list_entity.append(_entity)
+                        w = 1
+                        if _name not in dict_name_freq_score:
+                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
+                        else:
+                            dict_name_freq_score[_name][0] += 1
                 # othername = re.search(name_re1, sentence.sentence_text)
                 # if othername != None:
                 #     _name = othername.group(3)
@@ -386,7 +406,7 @@ class CodeNamePredict():
             # print(list_name_freq_score)
             if len(list_name_freq_score)>0:
                 list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
-                item[1]['name'] = list_name_freq_score[0][0]
+                item['name'] = list_name_freq_score[0][0]
                 # if list_name_freq_score[0][1][0]>1:
                 #     item[1]['name'] = list_name_freq_score[0][0]
                 # else:
@@ -394,7 +414,7 @@ class CodeNamePredict():
                 #     item[1]["name"] = list_name_freq_score[0][0]
                 
             #the regexes below are added to catch project codes the model fails to recognize
-            if item[1]['code'] == []:
+            if item['code'] == []:
                 for sentence in list_sentence:
                     # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
                     # if othercode != None:
@@ -402,7 +422,7 @@ class CodeNamePredict():
                     # 2020/11/23 rule adjustment for major sites
                     othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                     if othercode != None:
-                        item[1]['code'].append(othercode.group(3))
+                        item['code'].append(othercode.group(3))
             result.append(item)
 
             list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
@@ -884,7 +904,7 @@ class RoleRulePredictor():
         
 
         for article,list_entity,list_sentence,list_codename in zip(list_articles,list_entitys,list_sentences,list_codenames):
-            list_name = list_codename[1]["name"]
+            list_name = list_codename["name"]
             list_name = self._check_input(list_name)+[article.title]
             for p_entity in list_entity:
 
@@ -1168,8 +1188,8 @@ class TimePredictor():
             return
         points_entitys = datas[1]
         with self.sess.as_default():
-            predict_y = self.sess.run(self.outputs_code, feed_dict={self.inputs_code[0]:datas[0][0]
-                ,self.inputs_code[1]:datas[0][1]})
+            predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0]
+                ,self.inputs_code[1]:datas[0][1]})[0]
             for i in range(len(predict_y)):
                 entity = points_entitys[i]
                 label = np.argmax(predict_y[i])
@@ -1217,6 +1237,78 @@ def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
     model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
     return model
 
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.layers.python.layers import initializers
+def BiLSTM_CRF_tfmodel(sess,weights):
+    BiRNN_Units = 200
+    chunk_tags = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3,
+        'PC_B': 4,
+        'PC_M': 5,
+        'PC_E': 6,
+    }
+
+    def embedding_layer(input,keepprob):
+        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
+        embedding = tf.nn.embedding_lookup(params=embedding,ids=input)
+        embedding = tf.nn.dropout(embedding,keepprob)
+        return embedding
+
+    def BiLSTM_Layer(input,length):
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
+        output = tf.concat(output,2)
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Units,time_step,keepprob):
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+                hidden = tf.nn.dropout(hidden,keepprob)
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
+        keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
+
+        _embedding = embedding_layer(char_input,keepprob)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]
+        step_size = _shape[-1]
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size,keepprob=keepprob)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            return char_input,_logits,target,keepprob,length,crf_loss,trans,train_op
+
 import h5py
 def h5_to_graph(sess,graph,h5file):
     
@@ -1310,40 +1402,48 @@ def initialize_uninitialized(sess):
     
       
 def save_codename_model():
-    filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
+    # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
+    filepath = "models_tf/32-L0.565985563055-F0.8640033553528363-P0.85770792130738-R0.8703918876095912/model.ckpt"
     vocabpath = "../projectCode/models/vocab.pk"
     classlabelspath = "../projectCode/models/classlabels.pk"
-    vocab = load(vocabpath)
-    class_labels = load(classlabelspath)
+    # vocab = load(vocabpath)
+    # class_labels = load(classlabelspath)
+    vocab_model = getModel_word()
+    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
     graph = tf.get_default_graph()
     with graph.as_default() as g:
         ''''''
-        model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
+        # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
         #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
         
-        #sess = tf.Session(graph=g)
-        sess = tf.keras.backend.get_session()
-        
+        sess = tf.Session(graph=g)
+        # sess = tf.keras.backend.get_session()
+        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
         #with sess.as_default():
         sess.run(tf.global_variables_initializer())
-        print(sess.run("time_distributed_1/kernel:0"))
-        model.load_weights(filepath)
-        
-        
-        
-        print("#",sess.run("time_distributed_1/kernel:0"))
+        # print(sess.run("time_distributed_1/kernel:0"))
+        # model.load_weights(filepath)
+        saver = tf.train.Saver()
+        saver.restore(sess, filepath)
+
+        print("logits",sess.run(logits))
         
-        x = load("codename_x.pk")
+        # print("#",sess.run("time_distributed_1/kernel:0"))
+
+        # x = load("codename_x.pk")
         #y = model.predict(x)
-        y = sess.run(model.output,feed_dict={model.input:x})
+        # y = sess.run(model.output,feed_dict={model.input:x})
         
-        for item in np.argmax(y,-1):
-            print(item)
+        # for item in np.argmax(y,-1):
+        #     print(item)
         tf.saved_model.simple_save(
                                     sess,
-                                    "./codename_savedmodel/",
-                                    inputs={"inputs": model.input},
-                                    outputs={"outputs": model.output}
+                                    "./codename_savedmodel_tf/",
+                                    inputs={"inputs": char_input,
+                                            "inputs_length":length,
+                                            'keepprob':keepprob},
+                                    outputs={"logits": logits,
+                                             "trans":trans}
         )
         
     
@@ -1457,12 +1557,13 @@ def save_timesplit_model():
 
 if __name__=="__main__":
     #save_role_model()
-    #save_codename_model()
+    # save_codename_model()
+    save_codename_model()
     #save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
-    save_timesplit_model()
+    # save_timesplit_model()
     '''
     with tf.Session(graph=tf.Graph()) as sess:
         from tensorflow.python.saved_model import tag_constants

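The codename model is now exported with raw CRF outputs (per-step logits plus the learned transition matrix), and the client decodes with Viterbi in decode() instead of the old per-step argmax over softmax scores. A pure-numpy sketch of what viterbi_decode contributes there (an illustrative re-implementation of its contract, not the tf.contrib.crf code):

import numpy as np

def viterbi(score, trans):
    # score: [seq_len, num_tags] emission logits; trans: [num_tags, num_tags] transitions
    dp = score[0]                                  # best score ending in each tag so far
    back = []
    for emit in score[1:]:
        cand = dp[:, None] + trans + emit[None, :] # cand[i, j]: come from tag i, land on j
        back.append(cand.argmax(axis=0))           # best predecessor for each current tag
        dp = cand.max(axis=0)
    path = [int(dp.argmax())]                      # best final tag, then walk pointers back
    for ptr in reversed(back):
        path.append(int(ptr[path[-1]]))
    return path[::-1]

logits = np.random.randn(6, 7)                     # one sentence, 7 tags (O, PN_*, PC_*)
trans = np.random.randn(7, 7)
print(viterbi(logits, trans))                      # e.g. [0, 4, 5, 6, 0, 0]
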
+ 14 - 10
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -147,15 +147,19 @@ def extract_moneySource(text):
     list_moneySource = []
     for result in first:
         entity_text = sub.sub("",result['moneySource'])
-        wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
-        wordOffset_end = wordOffset_begin + len(entity_text)
-        # print(entity_text,wordOffset_begin,wordOffset_end)
-        _moneySource = dict()
-        _moneySource['body'] = entity_text
-        _moneySource['begin_index'] = wordOffset_begin
-        _moneySource['end_index'] = wordOffset_end
-        # print(_moneySource)
-        list_moneySource.append(_moneySource)
+        # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
+        if entity_text is None:
+            continue
+        else:
+            wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
+            wordOffset_end = wordOffset_begin + len(entity_text)
+            print(entity_text,wordOffset_begin,wordOffset_end)
+            _moneySource = dict()
+            _moneySource['body'] = entity_text
+            _moneySource['begin_index'] = wordOffset_begin
+            _moneySource['end_index'] = wordOffset_end
+            # print(_moneySource)
+            list_moneySource.append(_moneySource)
     return list_moneySource
 
 
@@ -163,6 +167,6 @@ def extract_moneySource(text):
 if __name__ == '__main__':
     # re_rule()
     test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。"
-    # 11,15 11
+    # 11,23 35,37
     extract_moneySource(test)
     pass

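The switch from re.search(entity_text, ...) to plain substring lookup is a bug fix, not a style change: the extracted text is used verbatim as a pattern, so any regex metacharacter in it ('+', '(', '*', ...) changes its meaning or breaks compilation. A small demonstration with made-up strings:

import re

entity_text = "资本金40%+自筹60%"  # extracted text may contain regex metacharacters
hay = "来源:" + entity_text
print(re.search(entity_text, hay))                     # None: "+" is parsed as a quantifier
print(hay.find(entity_text))                           # 3: plain substring search is safe
print(re.search(re.escape(entity_text), hay).start())  # 3: the regex-safe alternative
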
+ 3 - 1
BiddingKG/dl/test/10.py

@@ -1 +1,3 @@
-print(0x5f)
+
+a = ""
+print(a.rjust(2,"20"))

BIN
BiddingKG/dl/test/list_sentence_entity.pk


+ 10 - 42
BiddingKG/dl/test/test4.py

@@ -89,49 +89,14 @@ def predict(doc_id,text):
     print("getPREMs")
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
     print("getPREMs")
-    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title='投诉处理 ', text=text)
-    print(punish_dic)
-    prem[0][1]['punish'] = punish_dic
-
-    bidway = [] # 招标方式
-    moneySource = [] # 资金来源
-    servicetime = [] # 服务时间
-    time_release = [] # 发布时间
-    time_bidopen = [] # 开标时间
-    time_bidclose = [] # 截标时间
-    for entity in list_entitys[0]:
-        if entity.entity_type == 'bidway':
-            bidway.append(entity.entity_text)
-        elif entity.entity_type=='moneySource':
-            moneySource.append(entity.entity_text)
-        elif entity.entity_type=='servicetime':
-            servicetime.append(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label==1:
-            time_release.append(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label==2:
-            time_bidopen.append(entity.entity_text)
-        elif entity.entity_type == 'time' and entity.label == 3:
-            time_bidclose.append(entity.entity_text)
-
-    prem[0][1]['bidway'] = ';'.join(set(bidway))
-    prem[0][1]['moneySource'] = ';'.join(set(moneySource))
-    prem[0][1]['servicetime'] = ';'.join(set(servicetime))
-    prem[0][1]['time_release'] = ';'.join(set(time_release))
-    prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
-    prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
-
-
-
-    
-    ''''''
-    
+    list_punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
+
 
     for entitys in list_entitys:
         for entity in entitys:
             print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index,entity.wordOffset_begin,entity.wordOffset_end)
-
     #print(prem)
-    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
 
          
 def test(name,content):
@@ -164,11 +129,14 @@ if __name__=="__main__":
     # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
     # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
     # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
-    text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
-    投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
-    建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
-    二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
+    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
+    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
+    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
+    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
+    text = '''
+    ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
+    '''
     print("start")
     # print(predict("12",content))
    # review experts 100005322