
Cloud interface code

luojiehua 3 years ago
Parent
Commit
5ac66e6908

+ 37 - 1
.idea/sonarlint/issuestore/index.pb

@@ -6,4 +6,40 @@ P
 Y
 )BiddingKG/dl/relation_extraction/model.py,1\8\18960bd2f3d2ffcab4e29ee67a5dbda20d72990a
 G
-BiddingKG/dl/test/11.py,c\5\c53e135d56ebfaa441dcb190f2a6436d863d6dde
+BiddingKG/dl/test/11.py,c\5\c53e135d56ebfaa441dcb190f2a6436d863d6dde
+a
+1BiddingKG/deepdive/test/testArticle_processed2.py,a\f\afe8cc0ca1038654e923f628b8bfb01ce464db17
+Y
+)BiddingKG/maxcompute/contactDumplicate.py,0\9\09c1550b10f6ce58546ad5d4b900071c2a8292cc
+S
+#BiddingKG/dl/time/re_servicetime.py,4\4\4454e65be42efdd433a1de3147c6f3cb69cf116b
+U
+%BiddingKG/maxcompute/attachmentRec.py,b\e\be8f50b8961bc8ae61e105517763f21c707ea3ec
+O
+BiddingKG/dl/common/nerUtils.py,8\2\82c3c87116c1da9281790ac9c71f57821e9207cf
+Q
+!BiddingKG/dl/LEGAL_ENTERPRISE.txt,6\8\685bd49ae2f5f0de419c93a217a0e57564d705ab
+I
+BiddingKG/maxcompute/1.py,5\8\58fa6fe30194ad773c47ea70f4e48401242a1a88
+P
+ BiddingKG/dl/bidway/re_bidway.py,4\b\4bbee1b8e2177ffd4dbcc10e26686b81b38db517
+Q
+!BiddingKG/dl/interface/Entitys.py,6\3\6394a73f962d314de6209d5bb823941b56eda9d7
+H
+BiddingKG/dl/__init__.py,a\c\ac12bb80834a26e34df8eaf4c762410dfcfc0a27
+U
+%BiddingKG/dl/metrics/extractMetric.py,f\e\fed725bbe7e61499dcc542a2cd6279850a62cb79
+L
+BiddingKG/dl/common/Utils.py,f\4\f4c35e30342829a2fc89108259e28edc0a425cce
+W
+'BiddingKG/maxcompute/article_extract.py,1\d\1d533d48614eebe6b6e03d0bf64b381cdf4beca0
+]
+-BiddingKG/dl/test/测试所有提取信息.py,3\b\3baf1cd5607ac9f5b6263b25a3a5a1570221f784
+I
+BiddingKG/dl/test/t2/1.py,1\a\1a12a3e742b3f8f9ff5623b9eb9b471824b3db44
+P
+ BiddingKG/dl/test/t2/__init__.py,6\e\6e2a437853f56392367a0fb812234f339cb553b4
+S
+#BiddingKG/dl/interface/predictor.py,8\e\8efac754add7c721c357d60e457cffc0d592e7d1
+U
+%BiddingKG/dl/entityLink/entityLink.py,6\2\629ab9c7dc55af50771db3959ddf1e23b4ee5208

+ 1 - 1
BiddingKG/dl/complaint/test/test1.py

@@ -749,7 +749,7 @@ def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
             # tokens_all = getTokens(sentences,useselffool=useselffool)
             if key_nerToken not in cost_time:
                 cost_time[key_nerToken] = 0
-            cost_time[key_nerToken] += time.time()-start_time
+            cost_time[key_nerToken] += round(time.time()-start_time,2)
 
 
             for sentence_index in range(len(sentences)):
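
The rounding above (repeated in projectCodeAndName_tf.py below) is applied to each elapsed delta before it is added to the accumulator, which can drift slightly from rounding the accumulated total. A minimal sketch with made-up timings (key name illustrative):

    # Sketch only: per-increment rounding, as in the diff, vs. rounding the final sum.
    increments = [0.004, 0.004, 0.004]      # hypothetical per-sentence timings in seconds

    cost_time = {"nerToken": 0}
    for d in increments:
        cost_time["nerToken"] += round(d, 2)            # each delta rounds down to 0.0

    print(cost_time["nerToken"], round(sum(increments), 2))   # 0.0 vs 0.01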

+ 3 - 3
BiddingKG/dl/interface/extract.py

@@ -72,9 +72,9 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time()
-    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
-    cost_time["rule"] = round(time.time()-start_time,2)
+    # start_time = time.time()
+    # predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
+    # cost_time["rule"] = round(time.time()-start_time,2)
 
     start_time = time.time()
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)

+ 1 - 1
BiddingKG/dl/projectCode/projectCodeAndName_tf.py

@@ -621,7 +621,7 @@ def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
             # tokens_all = getTokens(sentences,useselffool=useselffool)
             if key_nerToken not in cost_time:
                 cost_time[key_nerToken] = 0
-            cost_time[key_nerToken] += time.time()-start_time
+            cost_time[key_nerToken] += round(time.time()-start_time,2)
 
 
             for sentence_index in range(len(sentences)):

+ 31 - 17
BiddingKG/dl/test/12.py

@@ -1,19 +1,33 @@
-
-from multiprocessing import Queue,Process,freeze_support
 import time
 
-def f():
-    while(True):
-        print(1)
-        time.sleep(1)
-
-if __name__=="__main__":
-    freeze_support()
-    while(True):
-        p = Process(target=f)
-        p.start()
-        p.join()
-
-        # time.sleep(4)
-        # p.terminate()
-        # p.start()
+import math
+def getAvgD(aint_dis):
+    if len(aint_dis)==0:
+        return 0
+    avg_dis = 1
+    int_avgD = int(sum(aint_dis)/len(aint_dis))
+    new_aint_dis = [a for a in aint_dis]
+    print(sum(aint_dis)/len(aint_dis))
+    min_pow = 10000000
+    min_dis = min(aint_dis)
+
+    for _dis in range(min(aint_dis),max(aint_dis)+1):
+
+        pow_x = 0
+        for _d in new_aint_dis:
+            pow_x += math.sqrt(abs((_d-_dis)))
+        print(_dis,pow_x)
+        if pow_x<min_pow:
+            min_pow = pow_x
+            min_dis = _dis
+
+    return min_dis
+
+a = [{"a":"1","b":2},{"a":"2","b":2},{"a":"1","b":23}]
+from itertools import groupby
+a.sort(key=lambda x:x["a"])
+print(list(groupby(a,key=lambda x:x["a"])))
+
+
+# print(time.)  # incomplete expression
+
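
The groupby call at the end of 12.py prints (key, grouper) pairs whose groupers are lazy iterators, so the grouped dicts themselves are not visible in the output. A minimal sketch, using the same sample data, that materializes each group:

    from itertools import groupby

    a = [{"a": "1", "b": 2}, {"a": "2", "b": 2}, {"a": "1", "b": 23}]
    a.sort(key=lambda x: x["a"])        # groupby only merges consecutive items, so sort first

    grouped = [(k, list(g)) for k, g in groupby(a, key=lambda x: x["a"])]
    print(grouped)
    # [('1', [{'a': '1', 'b': 2}, {'a': '1', 'b': 23}]), ('2', [{'a': '2', 'b': 2}])]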

+ 1 - 1
BiddingKG/dl/test/test4.py

@@ -66,7 +66,7 @@ if __name__=="__main__":
     # 广州比地数据科技有限公司翻译服务工程招标
     # '''
     # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
-    print(predict("12", text))
+    print(predict("12", content))
     # test("12",content)
     print("takes",time.time()-_time1)
     pass

+ 130 - 8
BiddingKG/maxcompute/cycleRec.py

@@ -8,6 +8,7 @@ import time
 import json
 import logging
 logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import math
 
 @annotate('string->string')
 class f_splitProduct(BaseUDTF):
@@ -102,6 +103,29 @@ def clusterTimestamp(aint_timestamp,distance = 28*24*60*60):
 
     return aint_center
 
+def getAvgD(aint_dis):
+    if len(aint_dis)==0:
+        return 0
+    avg_dis = 1
+    int_avgD = int(sum(aint_dis)/len(aint_dis))
+    new_aint_dis = [a for a in aint_dis]
+    print(sum(aint_dis)/len(aint_dis))
+    min_pow = 10000000
+    min_dis = min(aint_dis)
+
+    for _dis in range(min(aint_dis),max(aint_dis)+1):
+
+        pow_x = 0
+        for _d in new_aint_dis:
+            pow_x += math.sqrt(abs((_d-_dis)))
+        print(_dis,pow_x)
+        if pow_x<min_pow:
+            min_pow = pow_x
+            min_dis = _dis
+
+    return min_dis
+
+
 def getDistanceOfCluster(aint_center):
     aint_center.sort(key=lambda x:x)
     aint_dis = []
@@ -122,13 +146,21 @@ def getDistanceOfCluster(aint_center):
         if _d>int_maxD:
             int_maxD = _d
     if len(aint_dis)>0:
-        int_avgD = int(sum(aint_dis)/len(aint_dis))
+        # int_avgD = int(sum(aint_dis)/len(aint_dis))
+        int_avgD = getAvgD(aint_dis)
         int_minD = min(aint_dis)
         int_maxD = max(aint_dis)
-        for _d in aint_dis:
-            aint_gre = [int(a>=_d) for a in aint_dis]
-            if sum(aint_gre)/len(aint_gre)>0.5 and (int_maxD-_d)/int_avgD<0.5:
-                cluster_d = _d
+        if abs(int_maxD-int_avgD)>abs(int_minD-int_avgD):
+            int_minD = min([int_avgD,int_minD])
+            int_maxD = max([int_avgD,int_minD])
+        else:
+            int_minD = min([int_avgD,int_maxD])
+            int_maxD = max([int_avgD,int_maxD])
+        int_avgD = (int_minD+int_maxD)//2
+        # for _d in aint_dis:
+        #     aint_gre = [int(a>=_d) for a in aint_dis]
+        #     if sum(aint_gre)/len(aint_gre)>0.5 and (int_maxD-_d)/int_avgD<0.5:
+        #         cluster_d = _d
 
     return aint_dis,int_avgD,int_minD,int_maxD,cluster_d
 
@@ -153,12 +185,17 @@ def getPeriod(aint_timestamp):
             base_prob = 0.9
         _prob = round(base_prob-(flt_powD/len(aint_dis)/int_avgD**2),4)
         # if flt_powD/len(aint_dis)<30:
-        if _prob>0.3:
+        if _prob>0.5 and int_maxD-int_minD<=70:
             return last_time,_prob,int(int_avgD),int(int_minD),int(int_maxD),len(aint_dis)
     return None,_prob,None,None,None,None
 
+def timeAdd(_time,days):
+    a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
+
+    _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
+    return _time1
 
-@annotate('string->string,double,bigint,bigint,bigint,bigint')
+@annotate('string->string,string,string,double,bigint,bigint,bigint,bigint')
 class f_getProductCycle(BaseUDTF):
 
     def process(self,json_timestamp):
@@ -170,7 +207,9 @@ class f_getProductCycle(BaseUDTF):
         # aint_center = aint_timestamp
         last_time,_prob,int_avgD,int_minD,int_maxD,_periods = getPeriod(aint_timestamp)
         if int_avgD is not None:
-            self.forward(last_time,_prob,int_avgD,int_minD,int_maxD,_periods)
+            may_begin = timeAdd(last_time,int_minD)
+            may_end = timeAdd(last_time,int_maxD)
+            self.forward(may_begin,may_end,last_time,_prob,int_avgD,int_minD,int_maxD,_periods)
 
 @annotate('string->string')
 class f_getTendererCompany(BaseUDTF):
@@ -215,3 +254,86 @@ class f_concatstr(BaseUDAF):
         if "" in _s1:
             _s1.remove("")
         return ",".join(list(_s1))
+
+@annotate('string,bigint->string')
+class f_getLastProjectUuid(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _uuid,page_timestamp):
+        buffer[0].append({"uuid":_uuid,"timestamp":page_timestamp})
+        buffer[0] = buffer[0][:10000]
+
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        if len(buffer[0])>0:
+            buffer[0].sort(key=lambda x:x["timestamp"],reverse=True)
+            return buffer[0][0]["uuid"]
+        return None
+
+@annotate('string->string')
+class f_groupJsonStr(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _str):
+        buffer[0].append(_str)
+        buffer[0] = buffer[0][:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(list(set(buffer[0])),ensure_ascii=False)
+
+@annotate('bigint,string,string->string,string,string,string,string,double,string')
+class f_extractDemand(BaseUDTF):
+
+    def getProduct(self,tenderee,_product,project_name):
+        if len(_product)>0:
+            _product.sort(key=lambda x:len(x),reverse=True)
+            return _product[0]
+        else:
+            product = str(project_name).replace(tenderee,"")
+            product = re.sub(".*公司|项目|采购","",product)
+            return product
+
+    def formatTime(self,_date):
+        if _date is not None:
+            _d = _date.split("-")
+            if len(_d)==3:
+                return "%s-%s-%s"%(_d[0].rjust(4,"2"),_d[1].rjust(2,"0"),_d[2].rjust(2,"0"))
+
+
+    def process(self,docid,tenderee,json_demand_info):
+        if json_demand_info is None:
+            return
+        demand_info = json.loads(json_demand_info)
+        for _line in demand_info["data"]:
+            try:
+                _product = _line.get("product",[])
+                order_end = _line.get("order_end")
+                order_end = self.formatTime(order_end)
+                project_name = _line.get("project_name")
+                demand = _line.get("demand")
+                budget = _line.get("budget")
+                if budget is not None and len(budget)>0:
+                    budget = float(budget)
+                order_begin = _line.get("order_begin")
+                order_begin = self.formatTime(order_begin)
+                if order_begin is None or order_end is None:
+                    continue
+                product = self.getProduct(tenderee,_product,project_name)
+                json_docids = json.dumps([str(docid)])
+                self.forward(product,order_begin,order_end,demand,project_name,budget,json_docids)
+            except Exception as e:
+                logging.info("============error:%s"%(str(e)))
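
A self-contained sketch of the two helpers added above: getAvgD picks the integer gap in [min, max] that minimizes the sum of square-rooted absolute deviations (less sensitive to a single outlier than the plain mean it replaces), and timeAdd shifts a YYYY-MM-DD date by a number of days. The functions are adapted from the diff with the debug prints removed; the sample gaps and date are made up:

    import math
    import time

    def getAvgD(aint_dis):
        # Representative gap: the integer in [min, max] minimizing sum of sqrt(|d - x|).
        if len(aint_dis) == 0:
            return 0
        min_pow, min_dis = 10000000, min(aint_dis)
        for _dis in range(min(aint_dis), max(aint_dis) + 1):
            pow_x = sum(math.sqrt(abs(_d - _dis)) for _d in aint_dis)
            if pow_x < min_pow:
                min_pow, min_dis = pow_x, _dis
        return min_dis

    def timeAdd(_time, days):
        # Shift a 'YYYY-MM-DD' date string by the given number of days.
        a = time.mktime(time.strptime(_time, '%Y-%m-%d')) + 86400 * days
        return time.strftime("%Y-%m-%d", time.localtime(a))

    gaps = [28, 30, 29, 31, 90]          # hypothetical day gaps between publications
    d = getAvgD(gaps)                    # -> 30; the plain mean would be ~42
    print(d, timeAdd("2022-01-15", d))   # e.g. 30 2022-02-14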

+ 230 - 0
BiddingKG/maxcompute/documentMerge.py

@@ -350,6 +350,214 @@ class f_remege_limit_num_contain(BaseUDAF):
         log(str(final_group))
         return json.dumps(final_group)
 
+def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
+    _time = time.strftime(format,time.localtime())
+    return _time
+
+@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,bigint->string')
+class f_remege_limit_num_contain_bychannel(BaseUDAF):
+    '''
+    Merge conditions: project code, winning bidder, len(project code) > 7, winning bidder <> "", fewer than two distinct non-empty tenderees after merging, and identical non-empty amounts within the same announcement type after merging
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,docchannel,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence):
+        _dict = {"docid":docid,"docchannel":docchannel,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                 "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                 "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence}
+        _count = 0
+        for _,v in _dict.items():
+            if v is not None and str(v)!="":
+                _count += 1
+        _dict["extract_count"] = _count
+        buffer[0].append(_dict)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def getNotLikeSet(self,_dict,column_name):
+        column_value = _dict.get(column_name,None)
+        _set = set()
+        if column_value is not None:
+            for _i in range(1,len(column_value)):
+                _set.add(column_value[_i-1:_i+1])
+        _dict["notLike_set"] = _set
+
+    def getSimilarity(self,_set1,_set2):
+        _sum = max([1,min([len(_set1),len(_set2)])])
+        return len(_set1&_set2)/_sum
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+
+        SIM_PROB = 0.6
+        for _d in the_group:
+            self.getNotLikeSet(_d,"notLike_column")
+
+        # check whether any of the key columns has multiple distinct values
+        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        re_merge = False
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                re_merge = True
+                break
+        # check whether records are similar but not identical
+        re_merge_sim = False
+        for _i1 in range(0,len(the_group)):
+            for _j1 in range(_i1+1,len(the_group)):
+                _set1 = the_group[_i1]["notLike_set"]
+                _set2 = the_group[_j1]["notLike_set"]
+                _sim = self.getSimilarity(_set1,_set2)
+                if _sim>SIM_PROB and _sim<1:
+                    re_merge_sim = True
+                    break
+        contain_keys = ["contain_column1","contain_column2"]
+
+        logging.info(the_group)
+        logging.info(str(re_merge)+str(re_merge_sim))
+        # regroup the records
+        dict_docid_doc = {}
+        for _doc in the_group:
+            dict_docid_doc[_doc["docid"]] = _doc
+        if re_merge or re_merge_sim:
+            the_group.sort(key=lambda x:x["confidence"],reverse=True)
+            the_group.sort(key=lambda x:x["page_time_stamp"])
+
+            for _doc in the_group:
+                merge_flag = False
+                for _index in range(len(list_group)):
+                    _g = list_group[_index]
+                    hit_count = 0
+                    dict_temp = dict()
+                    # anomaly: multiple distinct values
+                    if re_merge:
+                        for _c_key in contain_keys:
+                            dict_temp[_c_key] = _g[_c_key]
+                            if _g[_c_key] is not None and _doc[_c_key] is not None:
+                                if len(_g[_c_key])>len(_doc[_c_key]):
+                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
+                                        dict_temp[_c_key] = _g[_c_key]
+                                        hit_count += 1
+                                else:
+                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
+                                        dict_temp[_c_key] = _doc[_c_key]
+                                        _g[_c_key] = _doc[_c_key]
+                                        hit_count += 1
+                    else:
+                        hit_count = 1
+                    # if hit_count==len(contain_keys):
+                    if hit_count>0:
+                        _flag_sim = False
+                        # anomaly: similar but not identical
+                        if re_merge_sim:
+                            for _docid in _g["docid"]:
+                                tmp_d = dict_docid_doc[_docid]
+                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
+                                if _sim>SIM_PROB and _sim<1:
+                                    _flag_sim = True
+                        if not _flag_sim:
+                            for _c_key in dict_temp.keys():
+                                _g[_c_key] = dict_temp[_c_key]
+                            _g["docid"].append(_doc["docid"])
+                            merge_flag = True
+                            break
+                if not merge_flag:
+                    _dict = dict()
+                    _dict["docid"] = [_doc["docid"]]
+                    for _c_key in contain_keys:
+                        _dict[_c_key] = _doc[_c_key]
+                    list_group.append(_dict)
+
+            final_group = []
+            # check that each key column resolves to a single value
+            for _group in list_group:
+                _split = []
+                for _docid in _group["docid"]:
+                    _split.append(dict_docid_doc[_docid])
+
+                # sort by confidence so as many records as possible stay in the group
+                _split.sort(key=lambda x:x["confidence"],reverse=True)
+                # confidence
+                list_key_index = []
+                for _k in keys:
+                    list_key_index.append(getDiffIndex(_split,_k))
+
+                _index = min(list_key_index)
+
+
+                final_group.append([_c["docid"] for _c in _split[:_index]])
+                for _c in _split[_index:]:
+                    final_group.append([_c["docid"]])
+
+
+                # if two or more are found, split them all into individual groups; otherwise keep them as one group
+                # _flag = True
+                # for _key in keys:
+                #     if len(getSet(_split,_key))>1:
+                #         _flag = False
+                #         break
+                # if not _flag:
+                #     for _docid in _group["docid"]:
+                #         final_group.append([_docid])
+                # else:
+                #     final_group.append(list(set(_group["docid"])))
+        else:
+            final_group = [list(set([item["docid"] for item in the_group]))]
+        log(str(final_group))
+
+
+        # pick one announcement per channel
+        final_group_channel = []
+        for _group in final_group:
+            dict_channel_id = {}
+            otherChannel = 10000
+            for _docid in _group:
+                _channel = dict_docid_doc[_docid].get("docchannel")
+                if _channel in [115,116,117]:
+                    otherChannel += 1
+                    _channel = otherChannel
+                if _channel not in dict_channel_id:
+                    dict_channel_id[_channel] = []
+                dict_channel_id[_channel].append([_docid,dict_docid_doc[_docid].get("page_time_stamp"),dict_docid_doc[_docid].get("extract_count")])
+            channel_dict = {}
+            for k,v in dict_channel_id.items():
+                v.sort(key=lambda x:x[1])
+                v.sort(key=lambda x:x[2],reverse=True)
+                channel_dict[v[0][0]] = []
+                for _docs in v[1:]:
+                    channel_dict[v[0][0]].append(_docs[0])
+            _d = {"data":channel_dict,"process_time":getCurrent_date()}
+            final_group_channel.append(_d)
+
+
+        return json.dumps(final_group_channel)
+
+@annotate('string -> string')
+class f_get_remerge_group_channel(BaseUDTF):
+    '''
+    Split multiple groups into separate records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                self.forward(json.dumps(_group))
+
 @annotate('string -> string')
 class f_get_remerge_group(BaseUDTF):
     '''
@@ -840,7 +1048,29 @@ class f_getMergeProb(BaseUDTF):
 
 
 
+@annotate('string -> bigint,bigint')
+class f_check_remerge_channel(BaseUDTF):
+    '''
+    Split multiple groups into separate records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                _keys = _group.get("data").keys()
+                if len(_keys)>0:
+                    main_docid = int(list(_keys)[0])
+                    for k,v in _group.get("data",{}).items():
+                        self.forward(main_docid,int(k))
+                        for _v in v:
+                            self.forward(main_docid,int(_v))
 
 @annotate('string -> bigint,bigint')
 class f_check_remerge(BaseUDTF):
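
For reference, a minimal sketch of the JSON shape that f_remege_limit_num_contain_bychannel.terminate() emits (one dict per merged group, mapping the announcement kept for each channel to the docids it absorbs) and of the pairs f_check_remerge_channel forwards while walking it; the docids and timestamp are invented:

    import json

    # Hypothetical terminate() output for one merged group spanning two channels.
    json_remerge = json.dumps([
        {"data": {"101": [102, 103], "205": []},
         "process_time": "2022-05-01 10:00:00"}
    ])

    pairs = []
    for _group in json.loads(json_remerge):
        _keys = _group.get("data").keys()
        if len(_keys) > 0:
            main_docid = int(list(_keys)[0])              # first kept docid anchors the group
            for k, v in _group.get("data", {}).items():
                pairs.append((main_docid, int(k)))        # anchor -> each kept docid
                for _v in v:
                    pairs.append((main_docid, int(_v)))   # anchor -> each absorbed docid

    print(pairs)   # [(101, 101), (101, 102), (101, 103), (101, 205)]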

+ 2 - 2
BiddingKG/maxcompute/evaluates.py

@@ -76,8 +76,8 @@ def multiLoadEnv():
         # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
        # changed to import via the zip archive
         log("=======")
-        include_package_path("BiddingKG.baseline.zip")
-        # include_package_path("BiddingKG.backup.zip")
+        # include_package_path("BiddingKG.baseline.zip")
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():