
Fix issues with merge deduplication and cross deduplication

luojiehua committed 3 years ago
commit 8d09b5c09b

+ 11 - 9
.idea/sonarlint/issuestore/index.pb

@@ -15,14 +15,10 @@ S
 #BiddingKG/dl/time/re_servicetime.py,4\4\4454e65be42efdd433a1de3147c6f3cb69cf116b
 O
 BiddingKG/dl/common/nerUtils.py,8\2\82c3c87116c1da9281790ac9c71f57821e9207cf
-I
-BiddingKG/maxcompute/1.py,5\8\58fa6fe30194ad773c47ea70f4e48401242a1a88
 H
 BiddingKG/dl/__init__.py,a\c\ac12bb80834a26e34df8eaf4c762410dfcfc0a27
 U
 %BiddingKG/dl/metrics/extractMetric.py,f\e\fed725bbe7e61499dcc542a2cd6279850a62cb79
-L
-BiddingKG/dl/common/Utils.py,f\4\f4c35e30342829a2fc89108259e28edc0a425cce
 W
 'BiddingKG/maxcompute/article_extract.py,1\d\1d533d48614eebe6b6e03d0bf64b381cdf4beca0
 ]
@@ -33,9 +29,15 @@ P
  BiddingKG/dl/test/t2/__init__.py,6\e\6e2a437853f56392367a0fb812234f339cb553b4
 T
 $BiddingKG/dl/complaint/test/test1.py,2\0\20a445a789f907f8d2f1946e5ee6afd692a84716
-P
- BiddingKG/maxcompute/cycleRec.py,b\d\bdbd92638e7f5983e655c67b07bb464d62021b36
-G
-BiddingKG/dl/test/12.py,5\c\5c99d16b0fcfaac86fa00d720a060d38778939c6
 B
-BiddingKG/setup.py,5\9\5940e92844c5eec502a3109dcee2bbc5880b37a4
+BiddingKG/setup.py,5\9\5940e92844c5eec502a3109dcee2bbc5880b37a4
+Q
+!BiddingKG/dl/interface/Entitys.py,6\3\6394a73f962d314de6209d5bb823941b56eda9d7
+L
+BiddingKG/maxcompute/test.py,d\e\de565067c7b40720cc108eec25c6762d518e57df
+Q
+!BiddingKG/dl/form/generateData.py,7\e\7e590aa47c1871cc7d75ac844d5769bff50a6e70
+U
+%BiddingKG/maxcompute/attachmentRec.py,b\e\be8f50b8961bc8ae61e105517763f21c707ea3ec
+W
+'BiddingKG/dl/interface/getAttributes.py,9\7\97778d596d5a6e43488b0f3c116ada7183116989

+ 8 - 8
BiddingKG/dl/interface/extract.py

@@ -172,14 +172,14 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     data_res["unit_money"] = unit_money_list
     data_res["ratio"] = ratio_list
 
-    # for _article in list_articles:
-    #     log(_article.content)
-    #
-    # for list_entity in list_entitys:
-    #     for _entity in list_entity:
-    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
-    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
-    #                str(_entity.begin_index),str(_entity.end_index)))
+    for _article in list_articles:
+        log(_article.content)
+
+    for list_entity in list_entitys:
+        for _entity in list_entity:
+            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
+                  (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
+                   str(_entity.begin_index),str(_entity.end_index)))
 
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
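
The change above re-enables the per-article and per-entity debug logging. For reference, a minimal sketch of what each re-enabled log line carries, using a stand-in Entity class rather than BiddingKG's dl.interface.Entitys (only the field names are taken from the logging call in the diff):

# Sketch only: a stand-in for the entity objects the logging loop iterates.
from dataclasses import dataclass

@dataclass
class Entity:
    entity_type: str
    entity_text: str
    label: int
    values: list
    sentence_index: int
    begin_index: int
    end_index: int

e = Entity("org", "example org", 1, [0.98], 0, 3, 12)  # illustrative values
print("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s" % (
    e.entity_type, e.entity_text, e.label, e.values,
    e.sentence_index, e.begin_index, e.end_index))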
 

+ 7 - 3
BiddingKG/dl/test/test4.py

@@ -28,8 +28,9 @@ def test(name,content):
             "timeout":60
             }
     myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
-    _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    # _url = "http://192.168.2.102:15030/article_extract"
+
+    # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
+    _url = "http://192.168.2.102:15030/article_extract"
     _resp = requests.post(_url, json=user, headers=myheaders, verify=True)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
     resp_json = _resp.content.decode("utf-8")
@@ -70,7 +71,10 @@ if __name__=="__main__":
     # 广州比地数据科技有限公司翻译服务工程招标
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
+
+    # print(predict("12", content,"打印机"))
+    a = time.time()
     print(predict("12", content,"打印机"))
     # test(12,content)
-    print("takes",time.time()-_time1)
+    print("takes",time.time()-a)
     pass
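
The test now targets the local service instead of the PAI-EAS endpoint and times a single call. A minimal sketch of the same timed-request pattern against the local endpoint; the doc_id/content field names in the payload are assumptions, since only "timeout" and the headers appear in the diff:

# Sketch of the timed request pattern in this test; payload field names assumed.
import time
import requests

url = "http://192.168.2.102:15030/article_extract"
headers = {"Content-Type": "application/json",
           "Authorization": "NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
payload = {"doc_id": "12", "content": "...", "timeout": 60}  # field names assumed

start = time.time()
resp = requests.post(url, json=payload, headers=headers, verify=True)
print("takes", time.time() - start)
print(resp.content.decode("utf-8"))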

+ 1 - 0
BiddingKG/maxcompute/documentDumplicate.py

@@ -650,6 +650,7 @@ class decare_document(BaseUDTF):
                             self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
 
 def getBestDocid(list_pair):
+    # [docid1,extract_count1,docid2,extract_count2]
     # list_pair.sort(key=lambda x:x[3],reverse=True)
     # _max_count = max(list_pair[0][3],list_pair[0][1])
     # set_candidate = set()
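
The new comment documents the pair layout [docid1, extract_count1, docid2, extract_count2]. A minimal sketch of a selection rule consistent with that layout, assuming the docid whose side carries the larger extract_count should win and that ties go to the smaller docid; this is an illustration, not the repository's implementation:

# Sketch only: pick the docid with the highest extract_count across all pairs,
# breaking ties toward the smaller (older) docid. Pair layout follows the
# comment added above: [docid1, extract_count1, docid2, extract_count2].
def get_best_docid_sketch(list_pair):
    counts = {}
    for docid1, count1, docid2, count2 in list_pair:
        counts[docid1] = max(counts.get(docid1, 0), count1)
        counts[docid2] = max(counts.get(docid2, 0), count2)
    # highest extract_count wins; ties go to the smallest docid (assumed rule)
    return max(counts, key=lambda d: (counts[d], -d))

print(get_best_docid_sketch([[101, 5, 202, 7], [202, 7, 303, 7]]))  # 202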

+ 1 - 1
BiddingKG/maxcompute/documentMerge.py

@@ -545,7 +545,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
             print(new_dict_channel_id)
             channel_dict = {}
             for k,v in new_dict_channel_id.items():
-                v.sort(key=lambda x:x["page_time_stamp"])
+                v.sort(key=lambda x:x["docid"])
                 v.sort(key=lambda x:x["extract_count"],reverse=True)
                 channel_dict[v[0]["docid"]] = []
                 for _docs in v[1:]:
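
The change swaps the tie-breaker in a two-pass stable sort: Python's sort is stable, so sorting by the secondary key first and then by extract_count descending leaves v[0] as the document with the highest extract_count, with the smallest docid (rather than the earliest page_time_stamp) breaking ties. A minimal sketch of the effect:

# Sketch of the two-pass stable sort used above: after the change, ties on
# extract_count are broken by the smallest docid, and docs[0] is canonical.
docs = [
    {"docid": 300, "extract_count": 4},
    {"docid": 100, "extract_count": 4},
    {"docid": 200, "extract_count": 2},
]
docs.sort(key=lambda x: x["docid"])                        # secondary key first
docs.sort(key=lambda x: x["extract_count"], reverse=True)  # stable primary sort
print([d["docid"] for d in docs])  # [100, 300, 200]; docs[0] is canonical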