|
@@ -1,7 +1,78 @@
|
|
|
#coding:UTF8
|
|
|
+
|
|
|
+
|
|
|
from odps.udf import annotate
|
|
|
-from odps.udf import BaseUDAF
|
|
|
-from odps.udf import BaseUDTF
|
|
|
+from odps.distcache import get_cache_archive
|
|
|
+from odps.distcache import get_cache_file
|
|
|
+from odps.udf import BaseUDTF,BaseUDAF
|
|
|
+
|
|
|
+import threading
|
|
|
+import logging
|
|
|
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+import time
|
|
|
+import json
|
|
|
+
|
|
|
+
|
|
|
def log(msg):
    """Emit *msg* at INFO level through the root logger."""
    logging.log(logging.INFO, msg)
|
|
|
+
|
|
|
+
|
|
|
# Make a cached archive resource (e.g. the pandas dependency zip) importable.
def include_package_path(res_name):
    import os, sys
    archive_files = get_cache_archive(res_name)
    # Directory of every extracted member, shortest path first, skipping
    # metadata entries.
    # NOTE(review): the filter string '.dist_info' (underscore) differs from
    # pip's usual '.dist-info' spelling — confirm it matches the archive layout.
    dir_names = sorted(
        [os.path.dirname(os.path.normpath(f.name))
         for f in archive_files
         if '.dist_info' not in f.name],
        key=len)

    # The importable root is everything up to and including ".zip/files".
    _path = dir_names[0].split(".zip/files")[0] + ".zip/files"
    log("add path:%s" % (_path))
    sys.path.append(_path)

    return os.path.dirname(dir_names[0])
|
|
|
+
|
|
|
# A "RuntimeError: xxx has been blocked by sandbox" can occur here: libraries
# containing C extensions are blocked by the sandbox; set
# "set odps.isolation.session.enable = true" to work around it.
def include_file(file_name):
    import os, sys
    cached = get_cache_file(file_name)
    # Put the directory holding the cached file on the import path.
    sys.path.append(os.path.dirname(os.path.abspath(cached.name)))
|
|
|
+
|
|
|
def include_so(file_name):
    """Copy a cached shared object (.so) into the working directory.

    The bytes of the cached resource are rewritten locally under the
    resource's own name so the dynamic loader can find the library.
    """
    import os, sys
    so_file = get_cache_file(file_name)

    # Context managers replace the original open/flush/close sequence so both
    # handles are closed even if a read or write raises.
    with open(so_file.name, 'rb') as src:
        content = src.read()
    with open(file_name, "wb") as dst:
        dst.write(content)
|
|
|
+
|
|
|
# Initialise a bundled business-data package.  Upload size limits, python
# version differences and inconsistent archive extraction make a manual
# import necessary: the resource zip (possibly split into several parts) is
# reassembled, unzipped, and its directory prepended to sys.path.
def init_env(list_files, package_name):
    import os, sys

    if len(list_files) == 1:
        # Single resource: unzip it straight into the package directory.
        cached = get_cache_file(list_files[0])
        cmd_line = os.path.abspath(cached.name)
        os.system("unzip -o %s -d %s" % (cmd_line, package_name))
    elif len(list_files) > 1:
        # Multi-part resource: concatenate the parts into temp.zip first.
        parts = []
        for _file in list_files:
            cached = get_cache_file(_file)
            parts.append(os.path.abspath(cached.name))
        os.system("cat " + " ".join(parts) + " > temp.zip")
        os.system("unzip -o temp.zip -d %s" % (package_name))
    sys.path.insert(0, os.path.abspath(package_name))
|
|
|
+
|
|
|
+import platform
|
|
|
|
|
|
|
|
|
def getSet(list_dict,key):
|
|
@@ -188,6 +259,7 @@ class f_remege_limit_num_contain(BaseUDAF):
|
|
|
contain_keys = ["contain_column1","contain_column2"]
|
|
|
|
|
|
logging.info(the_group)
|
|
|
+ logging.info(str(re_merge)+str(re_merge_sim))
|
|
|
if re_merge or re_merge_sim:
|
|
|
the_group.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
the_group.sort(key=lambda x:x["page_time_stamp"])
|
|
@@ -275,7 +347,7 @@ class f_remege_limit_num_contain(BaseUDAF):
|
|
|
# final_group.append(list(set(_group["docid"])))
|
|
|
else:
|
|
|
final_group = [list(set([item["docid"] for item in the_group]))]
|
|
|
-
|
|
|
+ log(str(final_group))
|
|
|
return json.dumps(final_group)
|
|
|
|
|
|
@annotate('string -> string')
|
|
@@ -434,8 +506,17 @@ def check_columns(tenderee_less,tenderee_greater,
|
|
|
code_sim = getSimilarityOfString(project_code_less,project_code_greater)
|
|
|
if code_sim>0.6 and code_sim<1:
|
|
|
return False
|
|
|
+
|
|
|
+ #同批次不同编号
|
|
|
+ if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
|
|
|
+ _split_code_less = project_code_less.split("-")
|
|
|
+ _split_code_greater = project_code_greater.split("-")
|
|
|
+ if len(_split_code_less)>1 and len(_split_code_greater)>1:
|
|
|
+ if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
|
|
|
+ return False
|
|
|
+
|
|
|
_set_win_tenderer = set()
|
|
|
- if win_tenderer_less is not None and tenderee_less!="":
|
|
|
+ if win_tenderer_less is not None and win_tenderer_less!="":
|
|
|
_set_win_tenderer.add(win_tenderer_less)
|
|
|
if win_tenderer_greater is not None and win_tenderer_greater!="":
|
|
|
_set_win_tenderer.add(win_tenderer_greater)
|
|
@@ -443,18 +524,20 @@ def check_columns(tenderee_less,tenderee_greater,
|
|
|
return False
|
|
|
_set_win_bid_price = set()
|
|
|
if win_bid_price_less is not None and win_bid_price_less!="":
|
|
|
- _set_win_bid_price.add(win_bid_price_less)
|
|
|
+ _set_win_bid_price.add(float(win_bid_price_less))
|
|
|
if win_bid_price_greater is not None and win_bid_price_greater!="":
|
|
|
- _set_win_bid_price.add(win_bid_price_greater)
|
|
|
+ _set_win_bid_price.add(float(win_bid_price_greater))
|
|
|
if len(_set_win_bid_price)>1:
|
|
|
return False
|
|
|
_set_bidding_budget = set()
|
|
|
if bidding_budget_less is not None and bidding_budget_less!="":
|
|
|
- _set_bidding_budget.add(bidding_budget_less)
|
|
|
+ _set_bidding_budget.add(float(bidding_budget_less))
|
|
|
if bidding_budget_greater is not None and bidding_budget_greater!="":
|
|
|
- _set_bidding_budget.add(bidding_budget_greater)
|
|
|
+ _set_bidding_budget.add(float(bidding_budget_greater))
|
|
|
if len(_set_bidding_budget)>1:
|
|
|
return False
|
|
|
+
|
|
|
+
|
|
|
return True
|
|
|
|
|
|
def getSimLevel(str1,str2):
|
|
@@ -482,7 +565,26 @@ import math
|
|
|
def featurnCount(_count, max_count=100):
    """Weight for an occurrence count: clamp to [0, 1], damped by 1/sqrt(count-1).

    ``max_count`` is kept for interface compatibility; it is not used.
    """
    clipped = max(0, min(1, _count))
    damping = 1 / math.sqrt(max(1, _count - 1))
    return clipped * damping
|
|
|
|
|
|
-@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
|
|
|
def getLength(_str):
    """Length of *_str*, treating None as the empty string."""
    return 0 if _str is None else len(_str)
|
|
|
+
|
|
|
+
|
|
|
+@annotate("string->bigint")
|
|
|
+class f_get_min_counts(object):
|
|
|
+
|
|
|
+
|
|
|
+ def evaluate(self,json_context):
|
|
|
+ _context = json.loads(json_context)
|
|
|
+
|
|
|
+ min_counts = 100
|
|
|
+
|
|
|
+ for item in _context:
|
|
|
+ if item["counts"]<min_counts:
|
|
|
+ min_counts = item["counts"]
|
|
|
+ return min_counts
|
|
|
+
|
|
|
+
|
|
|
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
|
|
|
class f_merge_featureMatrix(BaseUDTF):
|
|
|
|
|
|
def __init__(self):
|
|
@@ -494,15 +596,20 @@ class f_merge_featureMatrix(BaseUDTF):
|
|
|
agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
|
bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
|
|
|
- # if not check_columns(tenderee_less,tenderee_greater,
|
|
|
- # agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
- # win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
|
- # bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater)
|
|
|
- # return
|
|
|
+ if not check_columns(tenderee_less,tenderee_greater,
|
|
|
+ agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
+ win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
|
+ bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
|
|
|
+ return
|
|
|
|
|
|
_context = json.loads(json_context)
|
|
|
+
|
|
|
+ min_counts = 100
|
|
|
+
|
|
|
dict_context = {}
|
|
|
for item in _context:
|
|
|
+ if item["counts"]<min_counts:
|
|
|
+ min_counts = item["counts"]
|
|
|
dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
|
|
|
context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
|
|
|
list_matrix = []
|
|
@@ -526,7 +633,208 @@ class f_merge_featureMatrix(BaseUDTF):
|
|
|
list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
|
|
|
list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
|
|
|
list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
|
|
|
- self.forward(json.dumps(list_matrix))
|
|
|
+
|
|
|
+ # set_tenderer = set()
|
|
|
+ # if tenderee_less is not None and tenderee_less!="":
|
|
|
+ # set_tenderer.add(tenderee_less)
|
|
|
+ # if tenderee_greater is not None and tenderee_greater!="":
|
|
|
+ # set_tenderer.add(tenderee_greater)
|
|
|
+ #
|
|
|
+ # set_win_tenderer = set()
|
|
|
+ # if win_tenderer_less is not None and win_tenderer_less!="":
|
|
|
+ # set_win_tenderer.add(win_tenderer_less)
|
|
|
+ # if win_tenderer_greater is not None and win_tenderer_greater!="":
|
|
|
+ # set_win_tenderer.add(win_tenderer_greater)
|
|
|
+ #
|
|
|
+ # set_bidding_budget = set()
|
|
|
+ # if bidding_budget_less is not None and bidding_budget_less!="":
|
|
|
+ # set_bidding_budget.add(bidding_budget_less)
|
|
|
+ # if bidding_budget_greater is not None and bidding_budget_greater!="":
|
|
|
+ # set_bidding_budget.add(bidding_budget_greater)
|
|
|
+ #
|
|
|
+ # set_win_bid_price = set()
|
|
|
+ # if win_bid_price_less is not None and win_bid_price_less!="":
|
|
|
+ # set_win_bid_price.add(win_bid_price_less)
|
|
|
+ # if win_bid_price_greater is not None and win_bid_price_greater!="":
|
|
|
+ # set_win_bid_price.add(win_bid_price_greater)
|
|
|
+
|
|
|
+ json_matrix = json.dumps(list_matrix)
|
|
|
+
|
|
|
+ same_project_code = False
|
|
|
+ if project_code_less==project_code_greater and getLength(project_code_less)>0:
|
|
|
+ same_project_code = True
|
|
|
+
|
|
|
+ same_project_name = False
|
|
|
+ if project_name_less==project_name_greater and getLength(project_name_less)>0:
|
|
|
+ same_project_name = True
|
|
|
+
|
|
|
+ same_doctitle_refine = False
|
|
|
+ if doctitle_refine_less==doctitle_refine_greater and getLength(doctitle_refine_less)>0:
|
|
|
+ same_doctitle_refine = True
|
|
|
+
|
|
|
+ same_tenderee = False
|
|
|
+ if tenderee_less==tenderee_greater and getLength(tenderee_less)>0:
|
|
|
+ same_tenderee = True
|
|
|
+
|
|
|
+ same_agency = False
|
|
|
+ if agency_less==agency_greater and getLength(agency_less)>0:
|
|
|
+ same_agency = True
|
|
|
+
|
|
|
+ same_bidding_budget = False
|
|
|
+ if bidding_budget_less==bidding_budget_greater and getLength(bidding_budget_less)>0:
|
|
|
+ same_bidding_budget = True
|
|
|
+
|
|
|
+ same_win_tenderer = False
|
|
|
+ if win_tenderer_less==win_tenderer_greater and getLength(win_tenderer_less)>0:
|
|
|
+ same_win_tenderer = True
|
|
|
+
|
|
|
+ same_win_bid_price = False
|
|
|
+ if win_bid_price_less==win_bid_price_greater and getLength(win_bid_price_less)>0:
|
|
|
+ same_win_bid_price = True
|
|
|
+
|
|
|
+ contain_doctitle = False
|
|
|
+ if getLength(doctitle_refine_less)>0 and getLength(doctitle_refine_greater)>0 and (doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
|
|
|
+ contain_doctitle = True
|
|
|
+
|
|
|
+ contain_project_name = False
|
|
|
+ if getLength(project_name_less)>0 and getLength(project_name_greater)>0 and (project_name_less in project_name_greater or project_name_greater in project_name_less):
|
|
|
+ contain_project_name = True
|
|
|
+
|
|
|
+
|
|
|
+ total_money_less = 0 if getLength(bidding_budget_less)==0 else float(bidding_budget_less)+0 if getLength(win_bid_price_less)==0 else float(win_bid_price_less)
|
|
|
+ total_money_greater = 0 if getLength(bidding_budget_greater)==0 else float(bidding_budget_greater) +0 if getLength(win_bid_price_greater)==0 else float(win_bid_price_greater)
|
|
|
+
|
|
|
+
|
|
|
+ if min_counts<10:
|
|
|
+ _prob = 0.9
|
|
|
+ if same_project_code and same_win_tenderer and same_tenderee:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_project_name and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_doctitle_refine and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_name and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_doctitle_refine and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_doctitle_refine and same_bidding_budget and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_doctitle_refine and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_project_code and same_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_project_code and same_doctitle_refine:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_bidding_budget and same_project_code:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_bidding_budget and same_doctitle_refine:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_bidding_budget and same_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_doctitle_refine and same_project_code and same_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+
|
|
|
+ if min_counts<=5:
|
|
|
+ _prob = 0.8
|
|
|
+ if same_project_code and same_tenderee:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_name and same_project_code:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_doctitle_refine:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if total_money_less==total_money_greater and total_money_less>100000:
|
|
|
+ if same_win_tenderer and (same_win_bid_price or same_bidding_budget):
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_bidding_budget:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_win_bid_price:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_bidding_budget and same_win_bid_price and (contain_project_name or contain_doctitle):
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+
|
|
|
+
|
|
|
+ if min_counts<=3:
|
|
|
+ _prob = 0.7
|
|
|
+ if same_project_name or same_project_code or same_doctitle_refine or contain_doctitle or contain_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+
|
|
|
+ self.forward(json_matrix,0)
|
|
|
+
|
|
|
+
|
|
|
class MergePredictor():
    """Tiny hand-rolled dense network (46 inputs -> 2 outputs) scoring merges.

    The weight matrix and bias were exported from a trained Keras model
    ("model/merge.h5") and inlined so the UDF needs no model file at runtime.
    """

    def __init__(self):
        self.input_size = 46
        self.output_size = 2
        self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
        self.bias = np.array([16.79706382751465, -13.713337898254395])

    def activation(self, vec, _type):
        """Apply the activation named by *_type* ("relu", "tanh", "softmax").

        Raises ValueError for an unknown name instead of silently returning
        None as the original did.
        """
        if _type == "relu":
            _vec = np.array(vec)
            return _vec * (_vec > 0)
        if _type == "tanh":
            return np.tanh(vec)
        if _type == "softmax":
            _vec = np.array(vec)
            _exp = np.exp(_vec)
            return _exp / np.sum(_exp)
        raise ValueError("unknown activation type:%s" % _type)

    def predict(self, input):
        """Score *input* (flat 46-feature vector or batch); returns softmax rows of shape (-1, 2)."""
        _out = self.activation(
            self.activation(
                np.matmul(np.array(input).reshape(-1, self.input_size), self.matrix) + self.bias,
                "tanh"),
            "softmax")
        return _out
|
|
|
+
|
|
|
@annotate('string,double -> double')
class f_getMergeProb(BaseUDTF):
    """UDTF: final merge probability.

    If the rule-based probability is already confident (>0.5) it is forwarded
    as-is; otherwise the neural MergePredictor scores the JSON feature matrix.
    Only probabilities above 0.5 are emitted.
    """

    def __init__(self):
        import json
        include_package_path("numpy-1.18.zip")
        import numpy as np
        # Publish the lazily-loaded modules for the rest of the file.
        global json, np
        self.mp = MergePredictor()

    def process(self, json_matrix, pre_prob):
        if pre_prob > 0.5:
            _prob = pre_prob
        else:
            _matrix = json.loads(json_matrix)
            _prob = self.mp.predict(_matrix)[0][1]
        if _prob > 0.5:
            self.forward(float(_prob))
|
|
|
|
|
|
|
|
|
|