
Big-data document merge: fix deduplication for repeated bidding rounds

luojiehua, 3 years ago
parent commit dd332fe2a3

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -841,7 +841,7 @@ def precision(y_true, y_pred):
 
 if __name__=="__main__":
     # print(fool_char_to_id[">"])
-    print(getUnifyMoney('陆万捌仟柒佰贰拾元'))
+    print(getUnifyMoney('壹拾柒万元'))
     # model = getModel_w2v()
     # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
     # save([vocab,matrix],"vocabMatrix_words.pk")
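
The new test input exercises getUnifyMoney on a different capital-numeral amount. For reference, both amounts convert as shown by this minimal standalone sketch of capital-numeral parsing (an assumption for illustration, not the project's getUnifyMoney; it only handles whole-yuan amounts with no decimals):

    # Minimal sketch of Chinese capital-numeral parsing (illustrative only).
    DIGITS = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5,
              "陆": 6, "柒": 7, "捌": 8, "玖": 9}
    UNITS = {"拾": 10, "佰": 100, "仟": 1000}
    SECTIONS = {"万": 10 ** 4, "亿": 10 ** 8}

    def chinese_money_to_int(text):
        total, section, digit = 0, 0, 0
        for ch in text:
            if ch in DIGITS:
                digit = DIGITS[ch]
            elif ch in UNITS:                    # 拾/佰/仟 scale the pending digit
                section += (digit or 1) * UNITS[ch]
                digit = 0
            elif ch in SECTIONS:                 # 万/亿 close out a section
                total += (section + digit) * SECTIONS[ch]
                section, digit = 0, 0
        return total + section + digit           # the trailing 元 is simply ignored

    print(chinese_money_to_int("壹拾柒万元"))           # 170000
    print(chinese_money_to_int("陆万捌仟柒佰贰拾元"))    # 68720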

+ 3 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -323,4 +323,6 @@ if __name__=="__main__":
     # print(match_enterprise_max_first(sentences))
     #
     # print("takes %d s"%(time.time()-_time))
-    fix_LEGAL_ENTERPRISE()
+    # fix_LEGAL_ENTERPRISE()
+    # print(jaccard_score("中国南方航空股份有限公司上海分公司","南方航空上海分公司"))
+    print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))
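
The commented-out jaccard_score call compares a full registered company name with an abbreviated mention. One plausible reading of such a score is character-level Jaccard similarity (an assumption for illustration; the project's jaccard_score may be computed differently):

    # Hypothetical character-level Jaccard similarity between two company names.
    def jaccard_demo(a, b):
        sa, sb = set(a), set(b)
        return len(sa & sb) / len(sa | sb) if (sa | sb) else 0.0

    print(jaccard_demo("中国南方航空股份有限公司上海分公司", "南方航空上海分公司"))  # 0.6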

+ 2 - 2
BiddingKG/dl/form/train.py

@@ -265,5 +265,5 @@ if __name__ == "__main__":
     # train1()
     # vali()
     # save_form_model()
-    # train_context()
-    predict_context()
+    train_context()
+    # predict_context()

+ 10 - 8
BiddingKG/dl/interface/extract.py

@@ -51,6 +51,8 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
+
+
     # relies on sentence order
     start_time = time.time()
     list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
@@ -128,14 +130,14 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
-    # for _article in list_articles:
-    #     log(_article.content)
-    #
-    # for list_entity in list_entitys:
-    #     for _entity in list_entity:
-    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
-    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
-    #                str(_entity.begin_index),str(_entity.end_index)))
+    for _article in list_articles:
+        log(_article.content)
+
+    for list_entity in list_entitys:
+        for _entity in list_entity:
+            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
+                  (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
+                   str(_entity.begin_index),str(_entity.end_index)))
 
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
 

+ 28 - 8
BiddingKG/maxcompute/documentMerge.py

@@ -106,7 +106,7 @@ def split_with_time(list_dict,sort_key,timedelta=86400*120):
                 _group = []
                 for j in range(_begin,len(list_dict)):
                     _group.append(list_dict[j])
-                if len(_group)>1:
+                if len(_group)>0:
                     list_group.append(_group)
             return list_group
     return [list_dict]
@@ -531,18 +531,28 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
                     _channel = otherChannel
                 if _channel not in dict_channel_id:
                     dict_channel_id[_channel] = []
-                dict_channel_id[_channel].append([_docid,dict_docid_doc[_docid].get("page_time_stamp"),dict_docid_doc[_docid].get("extract_count")])
-            channel_dict = {}
+                dict_channel_id[_channel].append({"docid":_docid,"page_time_stamp":dict_docid_doc[_docid].get("page_time_stamp"),"extract_count":dict_docid_doc[_docid].get("extract_count")})
+
+            # split each channel's documents by date
+            new_dict_channel_id = {}
+            print(dict_channel_id)
             for k,v in dict_channel_id.items():
-                v.sort(key=lambda x:x[1])
-                v.sort(key=lambda x:x[2],reverse=True)
-                channel_dict[v[0][0]] = []
+                list_time_docids = split_with_time(v,"page_time_stamp",86400*5)
+                print(list_time_docids)
+                for _l in list_time_docids:
+                    otherChannel += 1
+                    new_dict_channel_id[otherChannel] = _l
+            print(new_dict_channel_id)
+            channel_dict = {}
+            for k,v in new_dict_channel_id.items():
+                v.sort(key=lambda x:x["page_time_stamp"])
+                v.sort(key=lambda x:x["extract_count"],reverse=True)
+                channel_dict[v[0]["docid"]] = []
                 for _docs in v[1:]:
-                    channel_dict[v[0][0]].append(_docs[0])
+                    channel_dict[v[0]["docid"]].append(_docs["docid"])
             _d = {"data":channel_dict,"process_time":getCurrent_date()}
             final_group_channel.append(_d)
 
-
         return json.dumps(final_group_channel)
 
 @annotate('string -> string')
@@ -1153,3 +1163,13 @@ class f_get_merge_docids(BaseUDAF):
         for _docid in list_docid:
             list_docid_str.append(str(_docid))
         return ",".join(list_docid_str)
+
+
+if __name__ == '__main__':
+    a = f_remege_limit_num_contain_bychannel()
+    buffer = a.new_buffer()
+    a.iterate(buffer,1,1,86400*1,"1","1","1","1","1","1","1",5,5)
+    a.iterate(buffer,3,1,86400*4,"1","1","1","1","1","1","1",5,5)
+    a.iterate(buffer,5,1,86400*10,"1","1","1","1","1","1","1",5,5)
+    print(a.terminate(buffer))
+    print(1)
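
The reworked f_remege_limit_num_contain_bychannel now re-splits each channel's candidates into time windows (split_with_time with a 5-day window; singleton groups are kept now that len(_group)>0 is accepted) before choosing one representative docid per window. A simplified standalone sketch of that regrouping, not the MaxCompute UDAF itself, using the same docids and timestamps as the __main__ test above:

    # Simplified sketch of the per-channel regrouping introduced in this commit.
    def split_with_time(docs, key="page_time_stamp", timedelta=86400 * 5):
        docs = sorted(docs, key=lambda d: d[key])
        groups, current = [], []
        for d in docs:
            if current and d[key] - current[-1][key] > timedelta:
                groups.append(current)        # a gap wider than the window starts a new group
                current = []
            current.append(d)
        if current:
            groups.append(current)            # singleton groups are kept as well
        return groups

    def dedup_channel(docs):
        merged = {}
        for group in split_with_time(docs):
            group.sort(key=lambda d: d["page_time_stamp"])
            group.sort(key=lambda d: d["extract_count"], reverse=True)
            keep = group[0]["docid"]          # highest extract_count wins, earliest date on ties
            merged[keep] = [d["docid"] for d in group[1:]]
        return merged

    docs = [{"docid": 1, "page_time_stamp": 86400 * 1, "extract_count": 5},
            {"docid": 3, "page_time_stamp": 86400 * 4, "extract_count": 5},
            {"docid": 5, "page_time_stamp": 86400 * 10, "extract_count": 5}]
    print(dedup_channel(docs))                # {1: [3], 5: []}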

+ 17 - 0
BiddingKG/maxcompute/extract_check.py

@@ -596,7 +596,24 @@ class f_turnPageattachments(object):
                 new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
         return new_page_attachments
 
+@annotate("string->string")
+class f_getRoles(BaseUDTF):
+
+    def __init__(self):
+        self.columns = ["win_tenderer","second_tenderer","third_tenderer"]
+        pass
+
+    # forward the win/second/third tenderer values found in sub_docs_json
+    def bidway_integrate(self,sub_docs_json):
+        if sub_docs_json is not None:
+            _docs = json.loads(sub_docs_json)
+            for _doc in _docs:
+                for _c in self.columns:
+                    if _doc.get(_c) is not None:
+                        self.forward(_doc.get(_c))
 
+    def process(self,sub_docs_json):
+        self.bidway_integrate(sub_docs_json)
 
 @annotate("string->string")
 class turn_bidway(BaseUDTF):
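
Outside MaxCompute, the effect of the new f_getRoles UDTF can be imitated as below; the sample sub_docs_json is made up for illustration, and each print corresponds to one row the UDTF would forward:

    import json

    # Made-up sample; real sub_docs_json comes from the extraction results.
    sub_docs_json = json.dumps([{"win_tenderer": "公司A", "second_tenderer": "公司B"},
                                {"win_tenderer": "公司C"}], ensure_ascii=False)

    columns = ["win_tenderer", "second_tenderer", "third_tenderer"]
    for doc in json.loads(sub_docs_json):
        for c in columns:
            if doc.get(c) is not None:
                print(doc[c])                 # f_getRoles would emit this via self.forward()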