
Cloud interface code

luojiehua 3 years ago
Parent
Commit
5ac66e6908

+ 37 - 1
.idea/sonarlint/issuestore/index.pb

@@ -6,4 +6,40 @@ P
 Y
 )BiddingKG/dl/relation_extraction/model.py,1\8\18960bd2f3d2ffcab4e29ee67a5dbda20d72990a
 G
-BiddingKG/dl/test/11.py,c\5\c53e135d56ebfaa441dcb190f2a6436d863d6dde
+BiddingKG/dl/test/11.py,c\5\c53e135d56ebfaa441dcb190f2a6436d863d6dde
+a
+1BiddingKG/deepdive/test/testArticle_processed2.py,a\f\afe8cc0ca1038654e923f628b8bfb01ce464db17
+Y
+)BiddingKG/maxcompute/contactDumplicate.py,0\9\09c1550b10f6ce58546ad5d4b900071c2a8292cc
+S
+#BiddingKG/dl/time/re_servicetime.py,4\4\4454e65be42efdd433a1de3147c6f3cb69cf116b
+U
+%BiddingKG/maxcompute/attachmentRec.py,b\e\be8f50b8961bc8ae61e105517763f21c707ea3ec
+O
+BiddingKG/dl/common/nerUtils.py,8\2\82c3c87116c1da9281790ac9c71f57821e9207cf
+Q
+!BiddingKG/dl/LEGAL_ENTERPRISE.txt,6\8\685bd49ae2f5f0de419c93a217a0e57564d705ab
+I
+BiddingKG/maxcompute/1.py,5\8\58fa6fe30194ad773c47ea70f4e48401242a1a88
+P
+ BiddingKG/dl/bidway/re_bidway.py,4\b\4bbee1b8e2177ffd4dbcc10e26686b81b38db517
+Q
+!BiddingKG/dl/interface/Entitys.py,6\3\6394a73f962d314de6209d5bb823941b56eda9d7
+H
+BiddingKG/dl/__init__.py,a\c\ac12bb80834a26e34df8eaf4c762410dfcfc0a27
+U
+%BiddingKG/dl/metrics/extractMetric.py,f\e\fed725bbe7e61499dcc542a2cd6279850a62cb79
+L
+BiddingKG/dl/common/Utils.py,f\4\f4c35e30342829a2fc89108259e28edc0a425cce
+W
+'BiddingKG/maxcompute/article_extract.py,1\d\1d533d48614eebe6b6e03d0bf64b381cdf4beca0
+]
+-BiddingKG/dl/test/测试所有提取信息.py,3\b\3baf1cd5607ac9f5b6263b25a3a5a1570221f784
+I
+BiddingKG/dl/test/t2/1.py,1\a\1a12a3e742b3f8f9ff5623b9eb9b471824b3db44
+P
+ BiddingKG/dl/test/t2/__init__.py,6\e\6e2a437853f56392367a0fb812234f339cb553b4
+S
+#BiddingKG/dl/interface/predictor.py,8\e\8efac754add7c721c357d60e457cffc0d592e7d1
+U
+%BiddingKG/dl/entityLink/entityLink.py,6\2\629ab9c7dc55af50771db3959ddf1e23b4ee5208

+ 1 - 1
BiddingKG/dl/complaint/test/test1.py

@@ -749,7 +749,7 @@ def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
             # tokens_all = getTokens(sentences,useselffool=useselffool)
             if key_nerToken not in cost_time:
                 cost_time[key_nerToken] = 0
-            cost_time[key_nerToken] += time.time()-start_time
+            cost_time[key_nerToken] += round(time.time()-start_time,2)
 
 
             for sentence_index in range(len(sentences)):
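
The rounding above (repeated in projectCodeAndName_tf.py below) is applied to each elapsed delta before it is added to the accumulator, which can drift slightly from rounding the accumulated total. A minimal sketch with made-up timings (key name illustrative):

    # Sketch only: per-increment rounding, as in the diff, vs. rounding the final sum.
    increments = [0.004, 0.004, 0.004]      # hypothetical per-sentence timings in seconds

    cost_time = {"nerToken": 0}
    for d in increments:
        cost_time["nerToken"] += round(d, 2)            # each delta rounds down to 0.0

    print(cost_time["nerToken"], round(sum(increments), 2))   # 0.0 vs 0.01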

+ 3 - 3
BiddingKG/dl/interface/extract.py

@@ -72,9 +72,9 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
-    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
-    cost_time["rule"] = round(time.time()-start_time,2)
+    # start_time = time.time()
+    # predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
+    # cost_time["rule"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)

+ 1 - 1
BiddingKG/dl/projectCode/projectCodeAndName_tf.py

@@ -621,7 +621,7 @@ def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
             # tokens_all = getTokens(sentences,useselffool=useselffool)
             if key_nerToken not in cost_time:
                 cost_time[key_nerToken] = 0
-            cost_time[key_nerToken] += time.time()-start_time
+            cost_time[key_nerToken] += round(time.time()-start_time,2)
 
 
             for sentence_index in range(len(sentences)):

+ 31 - 17
BiddingKG/dl/test/12.py

@@ -1,19 +1,33 @@
-
-from multiprocessing import Queue,Process,freeze_support
 import time
 
-def f():
-    while(True):
-        print(1)
-        time.sleep(1)
-
-if __name__=="__main__":
-    freeze_support()
-    while(True):
-        p = Process(target=f)
-        p.start()
-        p.join()
-
-        # time.sleep(4)
-        # p.terminate()
-        # p.start()
+import math
+def getAvgD(aint_dis):
+    if len(aint_dis)==0:
+        return 0
+    avg_dis = 1
+    int_avgD = int(sum(aint_dis)/len(aint_dis))
+    new_aint_dis = [a for a in aint_dis]
+    print(sum(aint_dis)/len(aint_dis))
+    min_pow = 10000000
+    min_dis = min(aint_dis)
+
+    for _dis in range(min(aint_dis),max(aint_dis)+1):
+
+        pow_x = 0
+        for _d in new_aint_dis:
+            pow_x += math.sqrt(abs((_d-_dis)))
+        print(_dis,pow_x)
+        if pow_x<min_pow:
+            min_pow = pow_x
+            min_dis = _dis
+
+    return min_dis
+
+a = [{"a":"1","b":2},{"a":"2","b":2},{"a":"1","b":23}]
+from itertools import groupby
+a.sort(key=lambda x:x["a"])
+print(list(groupby(a,key=lambda x:x["a"])))
+
+
+# print(time.)  # incomplete expression
+
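
The groupby call at the end of 12.py prints (key, grouper) pairs whose groupers are lazy iterators, so the grouped dicts themselves are not visible in the output. A minimal sketch, using the same sample data, that materializes each group:

    from itertools import groupby

    a = [{"a": "1", "b": 2}, {"a": "2", "b": 2}, {"a": "1", "b": 23}]
    a.sort(key=lambda x: x["a"])        # groupby only merges consecutive items, so sort first

    grouped = [(k, list(g)) for k, g in groupby(a, key=lambda x: x["a"])]
    print(grouped)
    # [('1', [{'a': '1', 'b': 2}, {'a': '1', 'b': 23}]), ('2', [{'a': '2', 'b': 2}])]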

+ 1 - 1
BiddingKG/dl/test/test4.py

@@ -66,7 +66,7 @@ if __name__=="__main__":
     # 广州比地数据科技有限公司翻译服务工程招标
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
-    print(predict("12", text))
+    print(predict("12", content))
     # test("12",content)
     print("takes",time.time()-_time1)
     pass

+ 130 - 8
BiddingKG/maxcompute/cycleRec.py

@@ -8,6 +8,7 @@ import time
 import json
 import logging
 logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import math
 
 @annotate('string->string')
 class f_splitProduct(BaseUDTF):
@@ -102,6 +103,29 @@ def clusterTimestamp(aint_timestamp,distance = 28*24*60*60):
 
     return aint_center
 
+def getAvgD(aint_dis):
+    if len(aint_dis)==0:
+        return 0
+    avg_dis = 1
+    int_avgD = int(sum(aint_dis)/len(aint_dis))
+    new_aint_dis = [a for a in aint_dis]
+    print(sum(aint_dis)/len(aint_dis))
+    min_pow = 10000000
+    min_dis = min(aint_dis)
+
+    for _dis in range(min(aint_dis),max(aint_dis)+1):
+
+        pow_x = 0
+        for _d in new_aint_dis:
+            pow_x += math.sqrt(abs((_d-_dis)))
+        print(_dis,pow_x)
+        if pow_x<min_pow:
+            min_pow = pow_x
+            min_dis = _dis
+
+    return min_dis
+
+
 def getDistanceOfCluster(aint_center):
     aint_center.sort(key=lambda x:x)
     aint_dis = []
@@ -122,13 +146,21 @@ def getDistanceOfCluster(aint_center):
         if _d>int_maxD:
             int_maxD = _d
     if len(aint_dis)>0:
-        int_avgD = int(sum(aint_dis)/len(aint_dis))
+        # int_avgD = int(sum(aint_dis)/len(aint_dis))
+        int_avgD = getAvgD(aint_dis)
         int_minD = min(aint_dis)
         int_maxD = max(aint_dis)
-        for _d in aint_dis:
-            aint_gre = [int(a>=_d) for a in aint_dis]
-            if sum(aint_gre)/len(aint_gre)>0.5 and (int_maxD-_d)/int_avgD<0.5:
-                cluster_d = _d
+        if abs(int_maxD-int_avgD)>abs(int_minD-int_avgD):
+            int_minD = min([int_avgD,int_minD])
+            int_maxD = max([int_avgD,int_minD])
+        else:
+            int_minD = min([int_avgD,int_maxD])
+            int_maxD = max([int_avgD,int_maxD])
+        int_avgD = (int_minD+int_maxD)//2
+        # for _d in aint_dis:
+        #     aint_gre = [int(a>=_d) for a in aint_dis]
+        #     if sum(aint_gre)/len(aint_gre)>0.5 and (int_maxD-_d)/int_avgD<0.5:
+        #         cluster_d = _d
 
     return aint_dis,int_avgD,int_minD,int_maxD,cluster_d
 
@@ -153,12 +185,17 @@ def getPeriod(aint_timestamp):
             base_prob = 0.9
         _prob = round(base_prob-(flt_powD/len(aint_dis)/int_avgD**2),4)
         # if flt_powD/len(aint_dis)<30:
-        if _prob>0.3:
+        if _prob>0.5 and int_maxD-int_minD<=70:
             return last_time,_prob,int(int_avgD),int(int_minD),int(int_maxD),len(aint_dis)
     return None,_prob,None,None,None,None
 
+def timeAdd(_time,days):
+    a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
+
+    _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
+    return _time1
 
-@annotate('string->string,double,bigint,bigint,bigint,bigint')
+@annotate('string->string,string,string,double,bigint,bigint,bigint,bigint')
 class f_getProductCycle(BaseUDTF):
 
     def process(self,json_timestamp):
@@ -170,7 +207,9 @@ class f_getProductCycle(BaseUDTF):
         # aint_center = aint_timestamp
         last_time,_prob,int_avgD,int_minD,int_maxD,_periods = getPeriod(aint_timestamp)
         if int_avgD is not None:
-            self.forward(last_time,_prob,int_avgD,int_minD,int_maxD,_periods)
+            may_begin = timeAdd(last_time,int_minD)
+            may_end = timeAdd(last_time,int_maxD)
+            self.forward(may_begin,may_end,last_time,_prob,int_avgD,int_minD,int_maxD,_periods)
 
 @annotate('string->string')
 class f_getTendererCompany(BaseUDTF):
@@ -215,3 +254,86 @@ class f_concatstr(BaseUDAF):
         if "" in _s1:
             _s1.remove("")
         return ",".join(list(_s1))
+
+@annotate('string,bigint->string')
+class f_getLastProjectUuid(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _uuid,page_timestamp):
+        buffer[0].append({"uuid":_uuid,"timestamp":page_timestamp})
+        buffer[0] = buffer[0][:10000]
+
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        if len(buffer[0])>0:
+            buffer[0].sort(key=lambda x:x["timestamp"],reverse=True)
+            return buffer[0][0]["uuid"]
+        return None
+
+@annotate('string->string')
+class f_groupJsonStr(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _str):
+        buffer[0].append(_str)
+        buffer[0] = buffer[0][:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(list(set(buffer[0])),ensure_ascii=False)
+
+@annotate('bigint,string,string->string,string,string,string,string,double,string')
+class f_extractDemand(BaseUDTF):
+
+    def getProduct(self,tenderee,_product,project_name):
+        if len(_product)>0:
+            _product.sort(key=lambda x:len(x),reverse=True)
+            return _product[0]
+        else:
+            product = str(project_name).replace(tenderee,"")
+            product = re.sub(".*公司|项目|采购","",product)
+            return product
+
+    def formatTime(self,_date):
+        if _date is not None:
+            _d = _date.split("-")
+            if len(_d)==3:
+                return "%s-%s-%s"%(_d[0].rjust(4,"2"),_d[1].rjust(2,"0"),_d[2].rjust(2,"0"))
+
+
+    def process(self,docid,tenderee,json_demand_info):
+        if json_demand_info is None:
+            return
+        demand_info = json.loads(json_demand_info)
+        for _line in demand_info["data"]:
+            try:
+                _product = _line.get("product",[])
+                order_end = _line.get("order_end")
+                order_end = self.formatTime(order_end)
+                project_name = _line.get("project_name")
+                demand = _line.get("demand")
+                budget = _line.get("budget")
+                if budget is not None and len(budget)>0:
+                    budget = float(budget)
+                order_begin = _line.get("order_begin")
+                order_begin = self.formatTime(order_begin)
+                if order_begin is None or order_end is None:
+                    continue
+                product = self.getProduct(tenderee,_product,project_name)
+                json_docids = json.dumps([str(docid)])
+                self.forward(product,order_begin,order_end,demand,project_name,budget,json_docids)
+            except Exception as e:
+                logging.info("============error:%s"%(str(e)))
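
A self-contained sketch of the two helpers added above: getAvgD picks the integer gap in [min, max] that minimizes the sum of square-rooted absolute deviations (less sensitive to a single outlier than the plain mean it replaces), and timeAdd shifts a YYYY-MM-DD date by a number of days. The functions are adapted from the diff with the debug prints removed; the sample gaps and date are made up:

    import math
    import time

    def getAvgD(aint_dis):
        # Representative gap: the integer in [min, max] minimizing sum of sqrt(|d - x|).
        if len(aint_dis) == 0:
            return 0
        min_pow, min_dis = 10000000, min(aint_dis)
        for _dis in range(min(aint_dis), max(aint_dis) + 1):
            pow_x = sum(math.sqrt(abs(_d - _dis)) for _d in aint_dis)
            if pow_x < min_pow:
                min_pow, min_dis = pow_x, _dis
        return min_dis

    def timeAdd(_time, days):
        # Shift a 'YYYY-MM-DD' date string by the given number of days.
        a = time.mktime(time.strptime(_time, '%Y-%m-%d')) + 86400 * days
        return time.strftime("%Y-%m-%d", time.localtime(a))

    gaps = [28, 30, 29, 31, 90]          # hypothetical day gaps between publications
    d = getAvgD(gaps)                    # -> 30; the plain mean would be ~42
    print(d, timeAdd("2022-01-15", d))   # e.g. 30 2022-02-14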

+ 230 - 0
BiddingKG/maxcompute/documentMerge.py

@@ -350,6 +350,214 @@ class f_remege_limit_num_contain(BaseUDAF):
         log(str(final_group))
         return json.dumps(final_group)
 
+def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
+    _time = time.strftime(format,time.localtime())
+    return _time
+
+@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,bigint->string')
+class f_remege_limit_num_contain_bychannel(BaseUDAF):
+    '''
+    Merge conditions: project code, winning bidder, len(project code) > 7, winning bidder <> "", fewer than two distinct non-empty tenderees after merging, and identical non-empty amounts within the same announcement type after merging
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,docchannel,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence):
+        _dict = {"docid":docid,"docchannel":docchannel,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                 "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                 "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence}
+        _count = 0
+        for _,v in _dict.items():
+            if v is not None and str(v)!="":
+                _count += 1
+        _dict["extract_count"] = _count
+        buffer[0].append(_dict)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def getNotLikeSet(self,_dict,column_name):
+        column_value = _dict.get(column_name,None)
+        _set = set()
+        if column_value is not None:
+            for _i in range(1,len(column_value)):
+                _set.add(column_value[_i-1:_i+1])
+        _dict["notLike_set"] = _set
+
+    def getSimilarity(self,_set1,_set2):
+        _sum = max([1,min([len(_set1),len(_set2)])])
+        return len(_set1&_set2)/_sum
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+
+        SIM_PROB = 0.6
+        for _d in the_group:
+            self.getNotLikeSet(_d,"notLike_column")
+
+        # check whether any of the key columns has multiple distinct values
+        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        re_merge = False
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                re_merge = True
+                break
+        # check whether records are similar but not identical
+        re_merge_sim = False
+        for _i1 in range(0,len(the_group)):
+            for _j1 in range(_i1+1,len(the_group)):
+                _set1 = the_group[_i1]["notLike_set"]
+                _set2 = the_group[_j1]["notLike_set"]
+                _sim = self.getSimilarity(_set1,_set2)
+                if _sim>SIM_PROB and _sim<1:
+                    re_merge_sim = True
+                    break
+        contain_keys = ["contain_column1","contain_column2"]
+
+        logging.info(the_group)
+        logging.info(str(re_merge)+str(re_merge_sim))
+        # regroup the records
+        dict_docid_doc = {}
+        for _doc in the_group:
+            dict_docid_doc[_doc["docid"]] = _doc
+        if re_merge or re_merge_sim:
+            the_group.sort(key=lambda x:x["confidence"],reverse=True)
+            the_group.sort(key=lambda x:x["page_time_stamp"])
+
+            for _doc in the_group:
+                merge_flag = False
+                for _index in range(len(list_group)):
+                    _g = list_group[_index]
+                    hit_count = 0
+                    dict_temp = dict()
+                    # anomaly: multiple distinct values
+                    if re_merge:
+                        for _c_key in contain_keys:
+                            dict_temp[_c_key] = _g[_c_key]
+                            if _g[_c_key] is not None and _doc[_c_key] is not None:
+                                if len(_g[_c_key])>len(_doc[_c_key]):
+                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
+                                        dict_temp[_c_key] = _g[_c_key]
+                                        hit_count += 1
+                                else:
+                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
+                                        dict_temp[_c_key] = _doc[_c_key]
+                                        _g[_c_key] = _doc[_c_key]
+                                        hit_count += 1
+                    else:
+                        hit_count = 1
+                    # if hit_count==len(contain_keys):
+                    if hit_count>0:
+                        _flag_sim = False
+                        # anomaly: similar but not identical
+                        if re_merge_sim:
+                            for _docid in _g["docid"]:
+                                tmp_d = dict_docid_doc[_docid]
+                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
+                                if _sim>SIM_PROB and _sim<1:
+                                    _flag_sim = True
+                        if not _flag_sim:
+                            for _c_key in dict_temp.keys():
+                                _g[_c_key] = dict_temp[_c_key]
+                            _g["docid"].append(_doc["docid"])
+                            merge_flag = True
+                            break
+                if not merge_flag:
+                    _dict = dict()
+                    _dict["docid"] = [_doc["docid"]]
+                    for _c_key in contain_keys:
+                        _dict[_c_key] = _doc[_c_key]
+                    list_group.append(_dict)
+
+            final_group = []
+            # check that each key column resolves to a single value
+            for _group in list_group:
+                _split = []
+                for _docid in _group["docid"]:
+                    _split.append(dict_docid_doc[_docid])
+
+                # sort by confidence so as many records as possible stay in the group
+                _split.sort(key=lambda x:x["confidence"],reverse=True)
+                # confidence
+                list_key_index = []
+                for _k in keys:
+                    list_key_index.append(getDiffIndex(_split,_k))
+
+                _index = min(list_key_index)
+
+
+                final_group.append([_c["docid"] for _c in _split[:_index]])
+                for _c in _split[_index:]:
+                    final_group.append([_c["docid"]])
+
+
+                # if two or more are found, split them all into individual groups; otherwise keep them as one group
+                # _flag = True
+                # for _key in keys:
+                #     if len(getSet(_split,_key))>1:
+                #         _flag = False
+                #         break
+                # if not _flag:
+                #     for _docid in _group["docid"]:
+                #         final_group.append([_docid])
+                # else:
+                #     final_group.append(list(set(_group["docid"])))
+        else:
+            final_group = [list(set([item["docid"] for item in the_group]))]
+        log(str(final_group))
+
+
+        # pick one announcement per channel
+        final_group_channel = []
+        for _group in final_group:
+            dict_channel_id = {}
+            otherChannel = 10000
+            for _docid in _group:
+                _channel = dict_docid_doc[_docid].get("docchannel")
+                if _channel in [115,116,117]:
+                    otherChannel += 1
+                    _channel = otherChannel
+                if _channel not in dict_channel_id:
+                    dict_channel_id[_channel] = []
+                dict_channel_id[_channel].append([_docid,dict_docid_doc[_docid].get("page_time_stamp"),dict_docid_doc[_docid].get("extract_count")])
+            channel_dict = {}
+            for k,v in dict_channel_id.items():
+                v.sort(key=lambda x:x[1])
+                v.sort(key=lambda x:x[2],reverse=True)
+                channel_dict[v[0][0]] = []
+                for _docs in v[1:]:
+                    channel_dict[v[0][0]].append(_docs[0])
+            _d = {"data":channel_dict,"process_time":getCurrent_date()}
+            final_group_channel.append(_d)
+
+
+        return json.dumps(final_group_channel)
+
+@annotate('string -> string')
+class f_get_remerge_group_channel(BaseUDTF):
+    '''
+    Split multiple groups into separate records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                self.forward(json.dumps(_group))
+
 @annotate('string -> string')
 class f_get_remerge_group(BaseUDTF):
     '''
@@ -840,7 +1048,29 @@ class f_getMergeProb(BaseUDTF):
 
 
 
+@annotate('string -> bigint,bigint')
+class f_check_remerge_channel(BaseUDTF):
+    '''
+    Split multiple groups into separate records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                _keys = _group.get("data").keys()
+                if len(_keys)>0:
+                    main_docid = int(list(_keys)[0])
+                    for k,v in _group.get("data",{}).items():
+                        self.forward(main_docid,int(k))
+                        for _v in v:
+                            self.forward(main_docid,int(_v))
 
 @annotate('string -> bigint,bigint')
 class f_check_remerge(BaseUDTF):
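
For reference, a minimal sketch of the JSON shape that f_remege_limit_num_contain_bychannel.terminate() emits (one dict per merged group, mapping the announcement kept for each channel to the docids it absorbs) and of the pairs f_check_remerge_channel forwards while walking it; the docids and timestamp are invented:

    import json

    # Hypothetical terminate() output for one merged group spanning two channels.
    json_remerge = json.dumps([
        {"data": {"101": [102, 103], "205": []},
         "process_time": "2022-05-01 10:00:00"}
    ])

    pairs = []
    for _group in json.loads(json_remerge):
        _keys = _group.get("data").keys()
        if len(_keys) > 0:
            main_docid = int(list(_keys)[0])              # first kept docid anchors the group
            for k, v in _group.get("data", {}).items():
                pairs.append((main_docid, int(k)))        # anchor -> each kept docid
                for _v in v:
                    pairs.append((main_docid, int(_v)))   # anchor -> each absorbed docid

    print(pairs)   # [(101, 101), (101, 102), (101, 103), (101, 205)]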

+ 2 - 2
BiddingKG/maxcompute/evaluates.py

@@ -76,8 +76,8 @@ def multiLoadEnv():
         # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
        # changed to import via the zip archive
         log("=======")
-        include_package_path("BiddingKG.baseline.zip")
-        # include_package_path("BiddingKG.backup.zip")
+        # include_package_path("BiddingKG.baseline.zip")
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():