|
@@ -0,0 +1,2936 @@
|
|
|
+#coding:UTF8
|
|
|
+
|
|
|
+
|
|
|
+from odps.udf import annotate
|
|
|
+from odps.distcache import get_cache_archive
|
|
|
+from odps.distcache import get_cache_file
|
|
|
+from odps.udf import BaseUDTF,BaseUDAF
|
|
|
+
|
|
|
+import threading
|
|
|
+import logging
|
|
|
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+import time
|
|
|
+import json
|
|
|
+from uuid import uuid4
|
|
|
+import traceback
|
|
|
+import re
|
|
|
+
|
|
|
+project_uuid = "uuid"
|
|
|
+project_docids = "docids"
|
|
|
+project_zhao_biao_page_time = "zhao_biao_page_time"
|
|
|
+project_zhong_biao_page_time = "zhong_biao_page_time"
|
|
|
+project_page_time = "page_time"
|
|
|
+project_doctextcon = "doctextcon"
|
|
|
+project_area = "area"
|
|
|
+project_province = "province"
|
|
|
+project_city = "city"
|
|
|
+project_district = "district"
|
|
|
+project_info_type = "info_type"
|
|
|
+project_industry = "industry"
|
|
|
+project_qcodes = "qcodes"
|
|
|
+project_project_name = "project_name"
|
|
|
+project_project_code = "project_code"
|
|
|
+project_project_codes = "project_codes"
|
|
|
+project_project_addr = "project_addr"
|
|
|
+project_tenderee = "tenderee"
|
|
|
+project_tenderee_addr = "tenderee_addr"
|
|
|
+project_tenderee_phone = "tenderee_phone"
|
|
|
+project_tenderee_contact = "tenderee_contact"
|
|
|
+project_agency = "agency"
|
|
|
+project_agency_phone = "agency_phone"
|
|
|
+project_agency_contact = "agency_contact"
|
|
|
+project_sub_project_name = "sub_project_name"
|
|
|
+project_sub_project_code = "sub_project_code"
|
|
|
+project_bidding_budget = "bidding_budget"
|
|
|
+project_win_tenderer = "win_tenderer"
|
|
|
+project_win_bid_price = "win_bid_price"
|
|
|
+project_win_tenderer_manager = "win_tenderer_manager"
|
|
|
+project_win_tenderer_phone = "win_tenderer_phone"
|
|
|
+project_second_tenderer = "second_tenderer"
|
|
|
+project_second_bid_price = "second_bid_price"
|
|
|
+project_second_tenderer_manager = "second_tenderer_manager"
|
|
|
+project_second_tenderer_phone = "second_tenderer_phone"
|
|
|
+project_third_tenderer = "third_tenderer"
|
|
|
+project_third_bid_price = "third_bid_price"
|
|
|
+project_third_tenderer_manager = "third_tenderer_manager"
|
|
|
+project_third_tenderer_phone = "third_tenderer_phone"
|
|
|
+project_procurement_system = "procurement_system"
|
|
|
+project_bidway = "bidway"
|
|
|
+project_dup_data = "dup_data"
|
|
|
+project_docid_number = "docid_number"
|
|
|
+project_project_dynamics = "project_dynamic"
|
|
|
+project_product = "product"
|
|
|
+
|
|
|
+project_moneysource = "moneysource"
|
|
|
+project_service_time = "service_time"
|
|
|
+project_time_bidclose = "time_bidclose"
|
|
|
+project_time_bidopen = "time_bidopen"
|
|
|
+project_time_bidstart = "time_bidstart"
|
|
|
+project_time_commencement = "time_commencement"
|
|
|
+project_time_completion = "time_completion"
|
|
|
+project_time_earnest_money_start = "time_earnest_money_start"
|
|
|
+project_time_earnest_money_end = "time_earnest_money_end"
|
|
|
+project_time_get_file_end = "time_get_file_end"
|
|
|
+project_time_get_file_start = "time_get_file_start"
|
|
|
+project_time_publicity_end = "time_publicity_end"
|
|
|
+project_time_publicity_start = "time_publicity_start"
|
|
|
+project_time_registration_end = "time_registration_end"
|
|
|
+project_time_registration_start = "time_registration_start"
|
|
|
+project_time_release = "time_release"
|
|
|
+
|
|
|
+project_dup_docid = "dup_docid"
|
|
|
+project_info_source = "info_source"
|
|
|
+
|
|
|
+project_delete_uuid = "delete_uuid"
|
|
|
+
|
|
|
+project_nlp_enterprise = "nlp_enterprise"
|
|
|
+project_nlp_enterprise_attachment = "nlp_enterprise_attachment"
|
|
|
+project_update_time = "update_time"
|
|
|
+project_tmp_attrs = "tmp_attrs"
|
|
|
+
|
|
|
+document_partitionkey = "partitionkey"
|
|
|
+document_docid = "docid"
|
|
|
+document_dochtmlcon = "dochtmlcon"
|
|
|
+document_doctextcon = "doctextcon"
|
|
|
+document_doctitle = "doctitle"
|
|
|
+document_attachmenttextcon = "attachmenttextcon"
|
|
|
+document_attachment_path = "page_attachments"
|
|
|
+document_attachment_path_filemd5 = "fileMd5"
|
|
|
+document_attachment_path_fileTitle = "fileTitle"
|
|
|
+document_attachment_path_fileLink = "fileLink"
|
|
|
+document_crtime = "crtime"
|
|
|
+document_status = "status"
|
|
|
+document_page_time = "page_time"
|
|
|
+document_attachment_extract_status = "attachment_extract_status"
|
|
|
+document_web_source_no = "web_source_no"
|
|
|
+document_fingerprint = "fingerprint"
|
|
|
+document_opertime = "opertime"
|
|
|
+document_docchannel = "docchannel"
|
|
|
+document_original_docchannel = "original_docchannel"
|
|
|
+document_life_docchannel = "life_docchannel"
|
|
|
+document_area = "area"
|
|
|
+document_province = "province"
|
|
|
+document_city = "city"
|
|
|
+document_district = "district"
|
|
|
+document_extract_json = "extract_json"
|
|
|
+document_bidway = "bidway"
|
|
|
+document_industry = "industry"
|
|
|
+document_info_type = "info_type"
|
|
|
+document_qcodes = "qcodes"
|
|
|
+document_project_name = "project_name"
|
|
|
+document_project_code = "project_code"
|
|
|
+document_project_codes = "project_codes"
|
|
|
+document_tenderee = "tenderee"
|
|
|
+document_tenderee_addr = "tenderee_addr"
|
|
|
+document_tenderee_phone = "tenderee_phone"
|
|
|
+document_tenderee_contact = "tenderee_contact"
|
|
|
+document_agency = "agency"
|
|
|
+document_agency_phone = "agency_phone"
|
|
|
+document_agency_contact = "agency_contact"
|
|
|
+document_product = "product"
|
|
|
+
|
|
|
+document_moneysource = "moneysource"
|
|
|
+document_service_time = "service_time"
|
|
|
+document_time_bidclose = "time_bidclose"
|
|
|
+document_time_bidopen = "time_bidopen"
|
|
|
+document_time_bidstart = "time_bidstart"
|
|
|
+document_time_commencement = "time_commencement"
|
|
|
+document_time_completion = "time_completion"
|
|
|
+document_time_earnest_money_start = "time_earnest_money_start"
|
|
|
+document_time_earnest_money_end = "time_earnest_money_end"
|
|
|
+document_time_get_file_end = "time_get_file_end"
|
|
|
+document_time_get_file_start = "time_get_file_start"
|
|
|
+document_time_publicity_end = "time_publicity_end"
|
|
|
+document_time_publicity_start = "time_publicity_start"
|
|
|
+document_time_registration_end = "time_registration_end"
|
|
|
+document_time_registration_start = "time_registration_start"
|
|
|
+document_time_release = "time_release"
|
|
|
+document_info_source = "info_source"
|
|
|
+document_nlp_enterprise = "nlp_enterprise"
|
|
|
+document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
|
|
|
+
|
|
|
+document_tmp_partitionkey = "partitionkey"
|
|
|
+document_tmp_docid = "docid"
|
|
|
+document_tmp_dochtmlcon = "dochtmlcon"
|
|
|
+document_tmp_doctextcon = "doctextcon"
|
|
|
+document_tmp_doctitle = "doctitle"
|
|
|
+document_tmp_attachmenttextcon = "attachmenttextcon"
|
|
|
+document_tmp_attachment_path = "page_attachments"
|
|
|
+document_tmp_attachment_path_filemd5 = "fileMd5"
|
|
|
+document_tmp_attachment_path_fileTitle = "fileTitle"
|
|
|
+document_tmp_attachment_path_fileLink = "fileLink"
|
|
|
+document_tmp_uuid = "uuid"
|
|
|
+document_tmp_crtime = "crtime"
|
|
|
+document_tmp_status = "status"
|
|
|
+document_tmp_tenderee = "tenderee"
|
|
|
+document_tmp_agency = "agency"
|
|
|
+document_tmp_project_code = "project_code"
|
|
|
+document_tmp_product = "product"
|
|
|
+document_tmp_project_name = "project_name"
|
|
|
+document_tmp_doctitle_refine = "doctitle_refine"
|
|
|
+document_tmp_extract_count = "extract_count"
|
|
|
+document_tmp_sub_docs_json = "sub_docs_json"
|
|
|
+document_tmp_save = "save"
|
|
|
+document_tmp_dup_docid = "dup_docid"
|
|
|
+document_tmp_merge_uuid = "merge_uuid"
|
|
|
+document_tmp_projects = "projects"
|
|
|
+document_tmp_page_time = "page_time"
|
|
|
+document_tmp_attachment_extract_status = "attachment_extract_status"
|
|
|
+document_tmp_web_source_no = "web_source_no"
|
|
|
+document_tmp_fingerprint = "fingerprint"
|
|
|
+document_tmp_opertime = "opertime"
|
|
|
+document_tmp_docchannel = "docchannel"
|
|
|
+document_tmp_original_docchannel = "original_docchannel"
|
|
|
+
|
|
|
+document_tmp_extract_json = "extract_json"
|
|
|
+document_tmp_industry_json = "industry_json"
|
|
|
+document_tmp_other_json = "other_json"
|
|
|
+
|
|
|
+document_tmp_time_bidclose = "time_bidclose"
|
|
|
+document_tmp_time_bidopen = "time_bidopen"
|
|
|
+document_tmp_time_completion = "time_completion"
|
|
|
+document_tmp_time_earnest_money_end = "time_earnest_money_end"
|
|
|
+document_tmp_time_earnest_money_start = "time_earnest_money_start"
|
|
|
+document_tmp_time_get_file_end = "time_get_file_end"
|
|
|
+document_tmp_time_get_file_start = "time_get_file_start"
|
|
|
+document_tmp_time_publicity_end = "time_publicity_end"
|
|
|
+document_tmp_time_publicity_start = "time_publicity_start"
|
|
|
+document_tmp_time_registration_end = "time_registration_end"
|
|
|
+document_tmp_time_registration_start = "time_registration_start"
|
|
|
+document_tmp_time_release = "time_release"
|
|
|
+
|
|
|
+def log(msg):
|
|
|
+ logging.info(msg)
|
|
|
+
|
|
|
+
|
|
|
+# Load an archived Python dependency package (e.g. pandas) from the MaxCompute resource cache
|
|
|
+def include_package_path(res_name):
|
|
|
+ import os, sys
|
|
|
+ archive_files = get_cache_archive(res_name)
|
|
|
+ dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
|
|
|
+ if '.dist_info' not in f.name], key=lambda v: len(v))
|
|
|
+
|
|
|
+ _path = dir_names[0].split(".zip/files")[0]+".zip/files"
|
|
|
+ log("add path:%s"%(_path))
|
|
|
+ sys.path.append(_path)
|
|
|
+
|
|
|
+ return os.path.dirname(dir_names[0])
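+# Illustrative sketch (not part of the original module): include_package_path is
+# meant to be called from a UDF's __init__ inside MaxCompute, with the archive
+# resource already attached to the function. The resource name below is the one
+# used later by f_getMergeProb and is otherwise an assumption.
+def _demo_include_package_path():
+    include_package_path("numpy-1.18.zip")
+    import numpy
+    return numpy.__version__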
|
|
|
+
|
|
|
+# A RuntimeError like "xxx has been blocked by sandbox" may be raised here;
|
|
|
+# this happens because libraries containing C extensions are blocked by the sandbox; run "set odps.isolation.session.enable = true" to allow them
|
|
|
+def include_file(file_name):
|
|
|
+ import os, sys
|
|
|
+ so_file = get_cache_file(file_name)
|
|
|
+ sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
|
|
|
+
|
|
|
+def include_so(file_name):
|
|
|
+ import os, sys
|
|
|
+ so_file = get_cache_file(file_name)
|
|
|
+
|
|
|
+ with open(so_file.name, 'rb') as fp:
|
|
|
+ content=fp.read()
|
|
|
+ so = open(file_name, "wb")
|
|
|
+ so.write(content)
|
|
|
+ so.flush()
|
|
|
+ so.close()
|
|
|
+
|
|
|
+# Initialize the business data package. Because of upload size limits, Python version differences and inconsistent archive extraction, the package has to be imported manually.
|
|
|
+def init_env(list_files,package_name):
|
|
|
+ import os,sys
|
|
|
+
|
|
|
+ if len(list_files)==1:
|
|
|
+ so_file = get_cache_file(list_files[0])
|
|
|
+ cmd_line = os.path.abspath(so_file.name)
|
|
|
+ os.system("unzip -o %s -d %s"%(cmd_line,package_name))
|
|
|
+ elif len(list_files)>1:
|
|
|
+ cmd_line = "cat"
|
|
|
+ for _file in list_files:
|
|
|
+ so_file = get_cache_file(_file)
|
|
|
+ cmd_line += " "+os.path.abspath(so_file.name)
|
|
|
+ cmd_line += " > temp.zip"
|
|
|
+ os.system(cmd_line)
|
|
|
+ os.system("unzip -o temp.zip -d %s"%(package_name))
|
|
|
+ # os.system("rm -rf %s/*.dist-info"%(package_name))
|
|
|
+ # return os.listdir(os.path.abspath("local_package"))
|
|
|
+ # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
|
|
|
+ # os.system("source ~/.bashrc")
|
|
|
+ sys.path.insert(0,os.path.abspath(package_name))
|
|
|
+
|
|
|
+ # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
|
|
|
+
|
|
|
+import platform
|
|
|
+
|
|
|
+
|
|
|
+def getSet(list_dict,key):
|
|
|
+ _set = set()
|
|
|
+ for item in list_dict:
|
|
|
+ if key in item:
|
|
|
+ if item[key]!='' and item[key] is not None:
|
|
|
+ if re.search("^[\d\.]+$",item[key]) is not None:
|
|
|
+ _set.add(str(float(item[key])))
|
|
|
+ else:
|
|
|
+ _set.add(str(item[key]))
|
|
|
+ return _set
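+# Small illustration (hypothetical rows): getSet normalizes purely numeric
+# strings through float(), so "100" and "100.0" collapse into one entry.
+def _demo_getSet():
+    rows = [{"bidding_budget": "100"}, {"bidding_budget": "100.0"}, {"bidding_budget": "200"}]
+    return getSet(rows, "bidding_budget")  # -> {"100.0", "200.0"}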
|
|
|
+
|
|
|
+def popNoneFromDict(_dict):
|
|
|
+ list_pop = []
|
|
|
+ for k,v in _dict.items():
|
|
|
+ if v is None or v=="":
|
|
|
+ list_pop.append(k)
|
|
|
+ for k in list_pop:
|
|
|
+ _dict.pop(k)
|
|
|
+ return _dict
|
|
|
+
|
|
|
+def split_with_time(list_dict,sort_key,timedelta=86400*120,more_than_one=True):
|
|
|
+ group_num = 1
|
|
|
+ if more_than_one:
|
|
|
+ group_num = 2
|
|
|
+ if len(list_dict)>0:
|
|
|
+ if (isinstance(list_dict[0],dict) and sort_key in list_dict[0]) or (isinstance(list_dict[0],list) and isinstance(sort_key,int) and sort_key<len(list_dict[0])):
|
|
|
+ list_dict.sort(key=lambda x:x[sort_key])
|
|
|
+ list_group = []
|
|
|
+ _begin = 0
|
|
|
+ for i in range(len(list_dict)-1):
|
|
|
+ if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
|
|
|
+ continue
|
|
|
+ else:
|
|
|
+ _group = []
|
|
|
+ for j in range(_begin,i+1):
|
|
|
+ _group.append(list_dict[j])
|
|
|
+ if len(_group)>1:
|
|
|
+ list_group.append(_group)
|
|
|
+ _begin = i + 1
|
|
|
+ if len(list_dict)>=group_num:
|
|
|
+ _group = []
|
|
|
+ for j in range(_begin,len(list_dict)):
|
|
|
+ _group.append(list_dict[j])
|
|
|
+ if len(_group)>0:
|
|
|
+ list_group.append(_group)
|
|
|
+ return list_group
|
|
|
+ return [list_dict]
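+# Minimal sketch (hypothetical timestamps): neighbouring records whose
+# page_time_stamp values differ by at most `timedelta` seconds share a group.
+# Interior single-record groups are discarded; the trailing group is kept
+# whenever the input holds at least `group_num` records.
+def _demo_split_with_time():
+    docs = [{"docid": 1, "page_time_stamp": 0},
+            {"docid": 2, "page_time_stamp": 86400 * 3},
+            {"docid": 3, "page_time_stamp": 86400 * 365}]
+    # default window is 120 days -> [[doc 1, doc 2], [doc 3]]
+    return split_with_time(docs, "page_time_stamp")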
|
|
|
+
|
|
|
+@annotate('bigint,bigint,string,string,string,string,string,string,bigint->string')
|
|
|
+class f_merge_rule_limit_num_contain_greater(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge rule: same project code and same winning bidder; len(project code) > 7; winning bidder is not empty; after merging there are fewer than 2 distinct non-empty tenderees; within the merged group, non-empty amounts of the same announcement type are identical
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json,re
|
|
|
+ global json,logging,re
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [list()]
|
|
|
+
|
|
|
+ def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column,greater_column,MAX_NUM):
|
|
|
+ buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
|
|
|
+ "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
|
|
|
+ "contain_column":contain_column,"greater_column":greater_column,"MAX_NUM":MAX_NUM})
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0])
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ MAX_NUM = 5
|
|
|
+ if len(buffer[0])>0:
|
|
|
+ MAX_NUM = buffer[0][0]["MAX_NUM"]
|
|
|
+ list_split = split_with_time(buffer[0],"page_time_stamp")
|
|
|
+ list_group = []
|
|
|
+ for _split in list_split:
|
|
|
+ flag = True
|
|
|
+ keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
|
|
|
+ dict_set = {}
|
|
|
+ for _key in keys:
|
|
|
+ dict_set[_key] = set()
|
|
|
+ if len(_split)>MAX_NUM:
|
|
|
+ flag = False
|
|
|
+ else:
|
|
|
+ for _key in keys:
|
|
|
+ logging.info(_key+str(getSet(_split,_key)))
|
|
|
+ if len(getSet(_split,_key))>1:
|
|
|
+ flag = False
|
|
|
+ break
|
|
|
+
|
|
|
+ MAX_CONTAIN_COLUMN = None
|
|
|
+ # check that the contain_column values of all announcements in the group form a containment chain (each is a substring of the longest one)
|
|
|
+ if flag:
|
|
|
+ for _d in _split:
|
|
|
+ contain_column = _d["contain_column"]
|
|
|
+ if contain_column is not None and contain_column !="":
|
|
|
+ if MAX_CONTAIN_COLUMN is None:
|
|
|
+ MAX_CONTAIN_COLUMN = contain_column
|
|
|
+ else:
|
|
|
+ if len(MAX_CONTAIN_COLUMN)<len(contain_column):
|
|
|
+ if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
|
|
|
+ flag = False
|
|
|
+ break
|
|
|
+ MAX_CONTAIN_COLUMN = contain_column
|
|
|
+ else:
|
|
|
+ if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
|
|
|
+ flag = False
|
|
|
+ break
|
|
|
+ if len(getSet(_split,"greater_column"))==1:
|
|
|
+ flag = False
|
|
|
+ break
|
|
|
+ if flag:
|
|
|
+ _set_docid = set()
|
|
|
+ for item in _split:
|
|
|
+ _set_docid.add(item["docid"])
|
|
|
+ if len(_set_docid)>1:
|
|
|
+ list_group.append(list(_set_docid))
|
|
|
+ return json.dumps(list_group)
|
|
|
+
|
|
|
+def getDiffIndex(list_dict,key):
|
|
|
+ _set = set()
|
|
|
+ for _i in range(len(list_dict)):
|
|
|
+ item = list_dict[_i]
|
|
|
+ if key in item:
|
|
|
+ if item[key]!='' and item[key] is not None:
|
|
|
+ if re.search("^\d[\d\.]*$",item[key]) is not None:
|
|
|
+ _set.add(str(float(item[key])))
|
|
|
+ else:
|
|
|
+ _set.add(str(item[key]))
|
|
|
+ if len(_set)>1:
|
|
|
+ return _i
|
|
|
+ return len(list_dict)
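+# Sketch (hypothetical rows): getDiffIndex returns the index of the first record
+# whose value for `key` differs from the values seen before it, so the prefix
+# list_dict[:index] holds exactly one distinct value.
+def _demo_getDiffIndex():
+    rows = [{"tenderee": "A"}, {"tenderee": "A"}, {"tenderee": "B"}]
+    return getDiffIndex(rows, "tenderee")  # -> 2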
|
|
|
+
|
|
|
+@annotate('bigint,bigint,string,string,string,string,string,string,string,bigint->string')
|
|
|
+class f_remege_limit_num_contain(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge rule: same project code and same winning bidder; len(project code) > 7; winning bidder is not empty; after merging there are fewer than 2 distinct non-empty tenderees; within the merged group, non-empty amounts of the same announcement type are identical
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json,re
|
|
|
+ global json,logging,re
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [list()]
|
|
|
+
|
|
|
+ def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence):
|
|
|
+ buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
|
|
|
+ "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
|
|
|
+ "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence})
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0])
|
|
|
+
|
|
|
+ def getNotLikeSet(self,_dict,column_name):
|
|
|
+ column_value = _dict.get(column_name,None)
|
|
|
+ _set = set()
|
|
|
+ if column_value is not None:
|
|
|
+ for _i in range(1,len(column_value)):
|
|
|
+ _set.add(column_value[_i-1:_i+1])
|
|
|
+ _dict["notLike_set"] = _set
|
|
|
+
|
|
|
+ def getSimilarity(self,_set1,_set2):
|
|
|
+ _sum = max([1,min([len(_set1),len(_set2)])])
|
|
|
+ return len(_set1&_set2)/_sum
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ list_group = []
|
|
|
+ the_group = buffer[0]
|
|
|
+
|
|
|
+ SIM_PROB = 0.6
|
|
|
+ for _d in the_group:
|
|
|
+ self.getNotLikeSet(_d,"notLike_column")
|
|
|
+
|
|
|
+ # check whether any key column carries more than one distinct value
|
|
|
+ keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
|
|
|
+ re_merge = False
|
|
|
+ for _key in keys:
|
|
|
+ if len(getSet(the_group,_key))>1:
|
|
|
+ re_merge = True
|
|
|
+ break
|
|
|
+ # check whether any pair of records is similar but not identical
|
|
|
+ re_merge_sim = False
|
|
|
+ for _i1 in range(0,len(the_group)):
|
|
|
+ for _j1 in range(_i1+1,len(the_group)):
|
|
|
+ _set1 = the_group[_i1]["notLike_set"]
|
|
|
+ _set2 = the_group[_j1]["notLike_set"]
|
|
|
+ _sim = self.getSimilarity(_set1,_set2)
|
|
|
+ if _sim>SIM_PROB and _sim<1:
|
|
|
+ re_merge_sim = True
|
|
|
+ break
|
|
|
+ contain_keys = ["contain_column1","contain_column2"]
|
|
|
+
|
|
|
+ logging.info(the_group)
|
|
|
+ logging.info(str(re_merge)+str(re_merge_sim))
|
|
|
+ if re_merge or re_merge_sim:
|
|
|
+ the_group.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
+ the_group.sort(key=lambda x:x["page_time_stamp"])
|
|
|
+ # rebuild the groups
|
|
|
+ dict_docid_doc = {}
|
|
|
+ for _doc in the_group:
|
|
|
+ dict_docid_doc[_doc["docid"]] = _doc
|
|
|
+ for _doc in the_group:
|
|
|
+ merge_flag = False
|
|
|
+ for _index in range(len(list_group)):
|
|
|
+ _g = list_group[_index]
|
|
|
+ hit_count = 0
|
|
|
+ dict_temp = dict()
|
|
|
+ # anomaly: a key column has multiple distinct values
|
|
|
+ if re_merge:
|
|
|
+ for _c_key in contain_keys:
|
|
|
+ dict_temp[_c_key] = _g[_c_key]
|
|
|
+ if _g[_c_key] is not None and _doc[_c_key] is not None:
|
|
|
+ if len(_g[_c_key])>len(_doc[_c_key]):
|
|
|
+ if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
|
|
|
+ dict_temp[_c_key] = _g[_c_key]
|
|
|
+ hit_count += 1
|
|
|
+ else:
|
|
|
+ if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
|
|
|
+ dict_temp[_c_key] = _doc[_c_key]
|
|
|
+ _g[_c_key] = _doc[_c_key]
|
|
|
+ hit_count += 1
|
|
|
+ else:
|
|
|
+ hit_count = 1
|
|
|
+ # if hit_count==len(contain_keys):
|
|
|
+ if hit_count>0:
|
|
|
+ _flag_sim = False
|
|
|
+ # anomaly: similar but not identical
|
|
|
+ if re_merge_sim:
|
|
|
+ for _docid in _g["docid"]:
|
|
|
+ tmp_d = dict_docid_doc[_docid]
|
|
|
+ _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
|
|
|
+ if _sim>SIM_PROB and _sim<1:
|
|
|
+ _flag_sim = True
|
|
|
+ if not _flag_sim:
|
|
|
+ for _c_key in dict_temp.keys():
|
|
|
+ _g[_c_key] = dict_temp[_c_key]
|
|
|
+ _g["docid"].append(_doc["docid"])
|
|
|
+ merge_flag = True
|
|
|
+ break
|
|
|
+ if not merge_flag:
|
|
|
+ _dict = dict()
|
|
|
+ _dict["docid"] = [_doc["docid"]]
|
|
|
+ for _c_key in contain_keys:
|
|
|
+ _dict[_c_key] = _doc[_c_key]
|
|
|
+ list_group.append(_dict)
|
|
|
+
|
|
|
+ final_group = []
|
|
|
+ # check that each group keeps a single distinct value per key column
|
|
|
+ for _group in list_group:
|
|
|
+ _split = []
|
|
|
+ for _docid in _group["docid"]:
|
|
|
+ _split.append(dict_docid_doc[_docid])
|
|
|
+
|
|
|
+ # sort by confidence so that as much of the group as possible is kept
|
|
|
+ _split.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
+ # for each key column, find the first index where a second distinct value appears
|
|
|
+ list_key_index = []
|
|
|
+ for _k in keys:
|
|
|
+ list_key_index.append(getDiffIndex(_split,_k))
|
|
|
+
|
|
|
+ _index = min(list_key_index)
|
|
|
+
|
|
|
+
|
|
|
+ final_group.append([_c["docid"] for _c in _split[:_index]])
|
|
|
+ for _c in _split[_index:]:
|
|
|
+ final_group.append([_c["docid"]])
|
|
|
+
|
|
|
+
|
|
|
+ # if more than one distinct value was found, every document becomes its own group; otherwise they stay as one group
|
|
|
+ # _flag = True
|
|
|
+ # for _key in keys:
|
|
|
+ # if len(getSet(_split,_key))>1:
|
|
|
+ # _flag = False
|
|
|
+ # break
|
|
|
+ # if not _flag:
|
|
|
+ # for _docid in _group["docid"]:
|
|
|
+ # final_group.append([_docid])
|
|
|
+ # else:
|
|
|
+ # final_group.append(list(set(_group["docid"])))
|
|
|
+ else:
|
|
|
+ final_group = [list(set([item["docid"] for item in the_group]))]
|
|
|
+ log(str(final_group))
|
|
|
+ return json.dumps(final_group)
|
|
|
+
|
|
|
+def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
|
|
|
+ _time = time.strftime(format,time.localtime())
|
|
|
+ return _time
|
|
|
+
|
|
|
+@annotate('bigint->string')
|
|
|
+class f_get_single_merged_bychannel(BaseUDTF):
|
|
|
+
|
|
|
+ def process(self,docid):
|
|
|
+ _d = {"data":{str(docid):[]},"process_time":getCurrent_date()}
|
|
|
+ self.forward(json.dumps(_d))
|
|
|
+
|
|
|
+@annotate('string->string')
|
|
|
+class f_get_single_merged_docids(object):
|
|
|
+
|
|
|
+ def evaluate(self,_json):
|
|
|
+ if _json!="" and _json is not None:
|
|
|
+ _d = json.loads(_json)
|
|
|
+ _keys = _d.get("data",{}).keys()
|
|
|
+ return ",".join(list(_keys))
|
|
|
+ return ""
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,bigint,bigint,string->string')
|
|
|
+class f_remege_limit_num_contain_bychannel(BaseUDAF):
|
|
|
+ '''f_remege_limit_num_contain_bychannel
|
|
|
+ Merge rule: same project code and same winning bidder; len(project code) > 7; winning bidder is not empty; after merging there are fewer than 2 distinct non-empty tenderees; within the merged group, non-empty amounts of the same announcement type are identical
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json,re
|
|
|
+ global json,logging,re
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [list()]
|
|
|
+
|
|
|
+ def iterate(self, buffer,docid,docchannel,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence,extract_count,json_dicttime):
|
|
|
+ _dict = {"docid":docid,"docchannel":docchannel,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
|
|
|
+ "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
|
|
|
+ "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence,
|
|
|
+ "extract_count":extract_count,"json_dicttime":json_dicttime}
|
|
|
+ buffer[0].append(_dict)
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0])
|
|
|
+
|
|
|
+ def getNotLikeSet(self,_dict,column_name):
|
|
|
+ column_value = _dict.get(column_name,None)
|
|
|
+ _set = set()
|
|
|
+ if column_value is not None:
|
|
|
+ for _i in range(1,len(column_value)):
|
|
|
+ _set.add(column_value[_i-1:_i+1])
|
|
|
+ _dict["notLike_set"] = _set
|
|
|
+
|
|
|
+ def getSimilarity(self,_set1,_set2):
|
|
|
+ _sum = max([1,min([len(_set1),len(_set2)])])
|
|
|
+ return len(_set1&_set2)/_sum
|
|
|
+
|
|
|
+ def difftimecount(self,_dict1,_dict2):
|
|
|
+ _count = 0
|
|
|
+ for k,v in _dict1.items():
|
|
|
+ if v is not None and v!="":
|
|
|
+ v1 = _dict2.get(k)
|
|
|
+ if v1 is not None and v1!="":
|
|
|
+ if v!=v1:
|
|
|
+ _count += 1
|
|
|
+ return _count
|
|
|
+
|
|
|
+ def splitByTimezone(self,list_dict,_key):
|
|
|
+ cluster_docid = []
|
|
|
+ dict_docid_key = {}
|
|
|
+ dict_docid = {}
|
|
|
+ for _dict in list_dict:
|
|
|
+ if _dict.get(_key,"") is None or _dict.get(_key,"")=="":
|
|
|
+ dict_docid_key[_dict.get("docid")] = {}
|
|
|
+ else:
|
|
|
+ dict_docid_key[_dict.get("docid")] = json.loads(_dict.get(_key))
|
|
|
+ dict_docid[_dict.get("docid")] = _dict
|
|
|
+ for _dict in list_dict:
|
|
|
+ _find = False
|
|
|
+ for _cl in cluster_docid:
|
|
|
+ _legal = True
|
|
|
+ for _c in _cl:
|
|
|
+ if self.difftimecount(dict_docid_key.get(_c),dict_docid_key.get(_dict.get("docid")))>0:
|
|
|
+ _legal = False
|
|
|
+ break
|
|
|
+ if _legal:
|
|
|
+ _cl.append(_dict.get("docid"))
|
|
|
+ _find = True
+ break
|
|
|
+ if not _find:
|
|
|
+ cluster_docid.append([_dict.get("docid")])
|
|
|
+ _result = []
|
|
|
+ for _cl in cluster_docid:
|
|
|
+ _r = []
|
|
|
+ for _c in _cl:
|
|
|
+ _r.append(dict_docid.get(_c))
|
|
|
+ _result.append(_r)
|
|
|
+ return _result
|
|
|
+
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ list_group = []
|
|
|
+ the_group = buffer[0]
|
|
|
+
|
|
|
+ SIM_PROB = 0.6
|
|
|
+ for _d in the_group:
|
|
|
+ self.getNotLikeSet(_d,"notLike_column")
|
|
|
+
|
|
|
+ # check whether any key column carries more than one distinct value
|
|
|
+ keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
|
|
|
+ re_merge = False
|
|
|
+ for _key in keys:
|
|
|
+ if len(getSet(the_group,_key))>1:
|
|
|
+ log("has_more_than_one:%s"%str(getSet(the_group,_key)))
|
|
|
+ re_merge = True
|
|
|
+ break
|
|
|
+ # check whether any pair of records is similar but not identical
|
|
|
+ re_merge_sim = False
|
|
|
+ for _i1 in range(0,len(the_group)):
|
|
|
+ for _j1 in range(_i1+1,len(the_group)):
|
|
|
+ _set1 = the_group[_i1]["notLike_set"]
|
|
|
+ _set2 = the_group[_j1]["notLike_set"]
|
|
|
+ _sim = self.getSimilarity(_set1,_set2)
|
|
|
+ if _sim>SIM_PROB and _sim<1:
|
|
|
+ re_merge_sim = True
|
|
|
+ break
|
|
|
+ contain_keys = ["contain_column1","contain_column2"]
|
|
|
+
|
|
|
+ logging.info(the_group)
|
|
|
+ logging.info(str(re_merge)+str(re_merge_sim))
|
|
|
+ # rebuild the groups
|
|
|
+ dict_docid_doc = {}
|
|
|
+ for _doc in the_group:
|
|
|
+ dict_docid_doc[_doc["docid"]] = _doc
|
|
|
+ if re_merge or re_merge_sim:
|
|
|
+ the_group.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
+ the_group.sort(key=lambda x:x["page_time_stamp"])
|
|
|
+
|
|
|
+ for _doc in the_group:
|
|
|
+ merge_flag = False
|
|
|
+ for _index in range(len(list_group)):
|
|
|
+ _g = list_group[_index]
|
|
|
+ hit_count = 0
|
|
|
+ dict_temp = dict()
|
|
|
+ # anomaly: a key column has multiple distinct values
|
|
|
+ if re_merge:
|
|
|
+ for _c_key in contain_keys:
|
|
|
+ dict_temp[_c_key] = _g[_c_key]
|
|
|
+ if _g[_c_key] is not None and _doc[_c_key] is not None:
|
|
|
+ if len(_g[_c_key])>len(_doc[_c_key]):
|
|
|
+ if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
|
|
|
+ dict_temp[_c_key] = _g[_c_key]
|
|
|
+ hit_count += 1
|
|
|
+ else:
|
|
|
+ if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
|
|
|
+ dict_temp[_c_key] = _doc[_c_key]
|
|
|
+ _g[_c_key] = _doc[_c_key]
|
|
|
+ hit_count += 1
|
|
|
+ else:
|
|
|
+ hit_count = 1
|
|
|
+ # if hit_count==len(contain_keys):
|
|
|
+ if hit_count>0:
|
|
|
+ _flag_sim = False
|
|
|
+ # anomaly: similar but not identical
|
|
|
+ if re_merge_sim:
|
|
|
+ for _docid in _g["docid"]:
|
|
|
+ tmp_d = dict_docid_doc[_docid]
|
|
|
+ _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
|
|
|
+ if _sim>SIM_PROB and _sim<1:
|
|
|
+ _flag_sim = True
|
|
|
+ if not _flag_sim:
|
|
|
+ for _c_key in dict_temp.keys():
|
|
|
+ _g[_c_key] = dict_temp[_c_key]
|
|
|
+ _g["docid"].append(_doc["docid"])
|
|
|
+ merge_flag = True
|
|
|
+ break
|
|
|
+ if not merge_flag:
|
|
|
+ _dict = dict()
|
|
|
+ _dict["docid"] = [_doc["docid"]]
|
|
|
+ for _c_key in contain_keys:
|
|
|
+ _dict[_c_key] = _doc[_c_key]
|
|
|
+ list_group.append(_dict)
|
|
|
+
|
|
|
+ final_group = []
|
|
|
+ # check that each group keeps a single distinct value per key column
|
|
|
+ for _group in list_group:
|
|
|
+ _split = []
|
|
|
+ for _docid in _group["docid"]:
|
|
|
+ _split.append(dict_docid_doc[_docid])
|
|
|
+
|
|
|
+ # sort by confidence so that as much of the group as possible is kept
|
|
|
+ _split.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
+ # for each key column, find the first index where a second distinct value appears
|
|
|
+ list_key_index = []
|
|
|
+ for _k in keys:
|
|
|
+ list_key_index.append(getDiffIndex(_split,_k))
|
|
|
+
|
|
|
+ _index = min(list_key_index)
|
|
|
+
|
|
|
+
|
|
|
+ final_group.append([_c["docid"] for _c in _split[:_index]])
|
|
|
+ for _c in _split[_index:]:
|
|
|
+ final_group.append([_c["docid"]])
|
|
|
+
|
|
|
+
|
|
|
+ # if more than one distinct value was found, every document becomes its own group; otherwise they stay as one group
|
|
|
+ # _flag = True
|
|
|
+ # for _key in keys:
|
|
|
+ # if len(getSet(_split,_key))>1:
|
|
|
+ # _flag = False
|
|
|
+ # break
|
|
|
+ # if not _flag:
|
|
|
+ # for _docid in _group["docid"]:
|
|
|
+ # final_group.append([_docid])
|
|
|
+ # else:
|
|
|
+ # final_group.append(list(set(_group["docid"])))
|
|
|
+ else:
|
|
|
+ final_group = [list(set([item["docid"] for item in the_group]))]
|
|
|
+ log("%s--%s"%("final_group",str(final_group)))
|
|
|
+
|
|
|
+ # pick one announcement per docchannel
|
|
|
+ final_group_channel = []
|
|
|
+ for _group in final_group:
|
|
|
+ dict_channel_id = {}
|
|
|
+ otherChannel = 10000
|
|
|
+ for _docid in _group:
|
|
|
+ _channel = dict_docid_doc[_docid].get("docchannel")
|
|
|
+ if _channel in [114,115,116,117]:
|
|
|
+ otherChannel += 1
|
|
|
+ _channel = otherChannel
|
|
|
+ if _channel not in dict_channel_id:
|
|
|
+ dict_channel_id[_channel] = []
|
|
|
+ dict_channel_id[_channel].append({"docid":_docid,"page_time_stamp":dict_docid_doc[_docid].get("page_time_stamp"),
|
|
|
+ "extract_count":dict_docid_doc[_docid].get("extract_count"),
|
|
|
+ "json_dicttime":dict_docid_doc[_docid].get("json_dicttime")})
|
|
|
+
|
|
|
+ # split further by date
|
|
|
+ new_dict_channel_id = {}
|
|
|
+ log("%s:%s"%("dict_channel_id",str(dict_channel_id)))
|
|
|
+ for k,v in dict_channel_id.items():
|
|
|
+ list_time_docids = split_with_time(v,"page_time_stamp",86400*6,more_than_one=False)
|
|
|
+ log(list_time_docids)
|
|
|
+ for _l in list_time_docids:
|
|
|
+ list_t = self.splitByTimezone(_l,"json_dicttime")
|
|
|
+ for _t in list_t:
|
|
|
+ otherChannel += 1
|
|
|
+ new_dict_channel_id[otherChannel] = _t
|
|
|
+ log("%s:%s"%("new_dict_channel_id",str(new_dict_channel_id)))
|
|
|
+ channel_dict = {}
|
|
|
+ for k,v in new_dict_channel_id.items():
|
|
|
+ v.sort(key=lambda x:x["docid"])
|
|
|
+ v.sort(key=lambda x:x["extract_count"],reverse=True)
|
|
|
+ channel_dict[v[0]["docid"]] = []
|
|
|
+ for _docs in v[1:]:
|
|
|
+ channel_dict[v[0]["docid"]].append(_docs["docid"])
|
|
|
+ _d = {"data":channel_dict,"process_time":getCurrent_date()}
|
|
|
+ final_group_channel.append(_d)
|
|
|
+
|
|
|
+ return json.dumps(final_group_channel)
|
|
|
+
|
|
|
+@annotate('string -> string')
|
|
|
+class f_get_remerge_group_channel(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Unpack multiple groups into one output record per group
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,json_remerge):
|
|
|
+ if json_remerge is not None:
|
|
|
+ list_group = json.loads(json_remerge)
|
|
|
+ for _group in list_group:
|
|
|
+ self.forward(json.dumps(_group))
|
|
|
+
|
|
|
+@annotate('string -> string')
|
|
|
+class f_get_remerge_group(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Unpack multiple groups into one output record per group
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,json_remerge):
|
|
|
+ if json_remerge is not None:
|
|
|
+ list_group = json.loads(json_remerge)
|
|
|
+ for _group in list_group:
|
|
|
+ l_g = list(set(_group))
|
|
|
+ l_g.sort(key=lambda x:x)
|
|
|
+ list_docid = [str(_docid) for _docid in l_g]
|
|
|
+ self.forward(",".join(list_docid))
|
|
|
+
|
|
|
+@annotate('bigint,bigint,string->string')
|
|
|
+class f_merge_probability(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ global json
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [[]]
|
|
|
+
|
|
|
+ def iterate(self, buffer,docid,page_time_stamp,_type):
|
|
|
+ buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0])
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ list_dict = buffer[0]
|
|
|
+ list_dict = list_dict[:10000]
|
|
|
+ list_group = split_with_time(list_dict,sort_key="page_time_stamp",timedelta=86400*120)
|
|
|
+
|
|
|
+ return json.dumps(list_group)
|
|
|
+
|
|
|
+@annotate('string -> bigint,bigint,bigint,bigint,string')
|
|
|
+class f_split_merge_probability(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global logging,json
|
|
|
+ logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,list_group_str):
|
|
|
+ logging.info("0")
|
|
|
+ logging.info(list_group_str)
|
|
|
+ if list_group_str is not None:
|
|
|
+ logging.info("1")
|
|
|
+ try:
|
|
|
+ list_group = json.loads(list_group_str)
|
|
|
+ logging.info("2")
|
|
|
+ for _group in list_group:
|
|
|
+ if len(_group)>0:
|
|
|
+ _type = _group[0].get("type","")
|
|
|
+ logging.info("3%d"%len(list_group))
|
|
|
+ # _group.sort(key=lambda x:x["page_time_stamp"])
|
|
|
+ _len = min(100,len(_group))
|
|
|
+ for _index_i in range(_len):
|
|
|
+ _count = 0
|
|
|
+ for _index_j in range(_index_i+1,_len):
|
|
|
+ if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
|
|
|
+ break
|
|
|
+ _count += 1
|
|
|
+ _docid1 = _group[_index_i]["docid"]
|
|
|
+ _docid2 = _group[_index_j]["docid"]
|
|
|
+ if _docid1<_docid2:
|
|
|
+ self.forward(_docid1,_docid2,1,_len,_type)
|
|
|
+ else:
|
|
|
+ self.forward(_docid2,_docid1,1,_len,_type)
|
|
|
+ except Exception as e:
|
|
|
+ logging.error(str(e))
|
|
|
+
|
|
|
+
|
|
|
+@annotate('bigint,bigint,string->string')
|
|
|
+class f_merge_groupPairs(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ global json
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [[]]
|
|
|
+
|
|
|
+ def iterate(self, buffer,is_exists,counts,_type):
|
|
|
+ buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0])
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ list_dict = buffer[0]
|
|
|
+ list_dict = list_dict[:10000]
|
|
|
+
|
|
|
+ return json.dumps(list_dict)
|
|
|
+
|
|
|
+@annotate("string -> bigint,bigint,bigint")
|
|
|
+class f_merge_getLabel(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global logging,json
|
|
|
+
|
|
|
+ def process(self,str_docids):
|
|
|
+ if str_docids is not None:
|
|
|
+ list_docids = [int(i) for i in str_docids.split(",")]
|
|
|
+ list_docids.sort(key=lambda x:x)
|
|
|
+ _len = min(100,len(list_docids))
|
|
|
+ for index_i in range(_len):
|
|
|
+ docid_less = list_docids[index_i]
|
|
|
+
|
|
|
+ for index_j in range(index_i+1,_len):
|
|
|
+ docid_greater = list_docids[index_j]
|
|
|
+ self.forward(docid_less,docid_greater,1)
|
|
|
+
|
|
|
+def getSimilarityOfString(str1,str2,nums=2):
|
|
|
+ _set1 = set()
|
|
|
+ _set2 = set()
|
|
|
+ if getLength(str1)<=nums or getLength(str2)<=nums:
|
|
|
+ if str1!=str2:
|
|
|
+ return 0.8
|
|
|
+ else:
|
|
|
+ return 1
|
|
|
+ if str1 is not None:
|
|
|
+ for i in range(nums,len(str1)):
|
|
|
+ _set1.add(str1[i-nums:i+1])
|
|
|
+ if str2 is not None:
|
|
|
+ for i in range(nums,len(str2)):
|
|
|
+ _set2.add(str2[i-nums:i+1])
|
|
|
+ _len = max(1,min(len(_set1),len(_set2)))
|
|
|
+ return len(_set1&_set2)/_len
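+# Sketch (hypothetical strings): the similarity is the number of shared character
+# trigrams divided by the smaller trigram-set size, so a code that is a prefix of
+# a longer code scores 1.0; if either string has at most `nums` characters the
+# result is 0.8 unless the two strings are identical.
+def _demo_getSimilarityOfString():
+    return getSimilarityOfString("ABC-2023-001", "ABC-2023-0015")  # -> 1.0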
|
|
|
+
|
|
|
+def check_columns(tenderee_less,tenderee_greater,
|
|
|
+ agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
+ win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
|
+ bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
|
|
|
+ flag = True
|
|
|
+ _set_tenderee = set()
|
|
|
+ if tenderee_less is not None and tenderee_less!="":
|
|
|
+ _set_tenderee.add(tenderee_less)
|
|
|
+ if tenderee_greater is not None and tenderee_greater!="":
|
|
|
+ _set_tenderee.add(tenderee_greater)
|
|
|
+ if len(_set_tenderee)>1:
|
|
|
+ return False
|
|
|
+ code_sim = getSimilarityOfString(project_code_less,project_code_greater)
|
|
|
+ if code_sim>0.6 and code_sim<1:
|
|
|
+ return False
|
|
|
+
|
|
|
+ # same batch prefix but different project codes
|
|
|
+ if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
|
|
|
+ _split_code_less = project_code_less.split("-")
|
|
|
+ _split_code_greater = project_code_greater.split("-")
|
|
|
+ if len(_split_code_less)>1 and len(_split_code_greater)>1:
|
|
|
+ if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
|
|
|
+ return False
|
|
|
+
|
|
|
+ _set_win_tenderer = set()
|
|
|
+ if win_tenderer_less is not None and win_tenderer_less!="":
|
|
|
+ _set_win_tenderer.add(win_tenderer_less)
|
|
|
+ if win_tenderer_greater is not None and win_tenderer_greater!="":
|
|
|
+ _set_win_tenderer.add(win_tenderer_greater)
|
|
|
+ if len(_set_win_tenderer)>1:
|
|
|
+ return False
|
|
|
+ _set_win_bid_price = set()
|
|
|
+ if win_bid_price_less is not None and win_bid_price_less!="":
|
|
|
+ _set_win_bid_price.add(float(win_bid_price_less))
|
|
|
+ if win_bid_price_greater is not None and win_bid_price_greater!="":
|
|
|
+ _set_win_bid_price.add(float(win_bid_price_greater))
|
|
|
+ if len(_set_win_bid_price)>1:
|
|
|
+ return False
|
|
|
+ _set_bidding_budget = set()
|
|
|
+ if bidding_budget_less is not None and bidding_budget_less!="":
|
|
|
+ _set_bidding_budget.add(float(bidding_budget_less))
|
|
|
+ if bidding_budget_greater is not None and bidding_budget_greater!="":
|
|
|
+ _set_bidding_budget.add(float(bidding_budget_greater))
|
|
|
+ if len(_set_bidding_budget)>1:
|
|
|
+ return False
|
|
|
+
|
|
|
+
|
|
|
+ return True
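+# Sketch (hypothetical field values): check_columns rejects a candidate pair as
+# soon as a hard conflict appears -- two different non-empty tenderees, winners
+# or amounts, or two project codes that are similar but not identical.
+def _demo_check_columns():
+    # identical tenderee and winner, all other fields empty -> allowed (True)
+    same = check_columns("采购人A", "采购人A", "", "", "", "", "", "",
+                         "供应商B", "供应商B", "", "", "", "", "", "")
+    # similar but different project codes -> rejected (False)
+    diff = check_columns("", "", "", "", "JG2023-001", "JG2023-002", "", "",
+                         "", "", "", "", "", "", "", "")
+    return same, diff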
|
|
|
+
|
|
|
+def getSimLevel(str1,str2):
|
|
|
+ str1_null = False
|
|
|
+ str2_null = False
|
|
|
+ _v = 0
|
|
|
+ if str1 is None or str1=="":
|
|
|
+ str1_null = True
|
|
|
+ if str2 is None or str2=="":
|
|
|
+ str2_null = True
|
|
|
+ if str1_null and str2_null:
|
|
|
+ _v = 2
|
|
|
+ elif str1_null and not str2_null:
|
|
|
+ _v = 4
|
|
|
+ elif not str1_null and str2_null:
|
|
|
+ _v = 6
|
|
|
+ elif not str1_null and not str2_null:
|
|
|
+ if str1==str2:
|
|
|
+ _v = 10
|
|
|
+ else:
|
|
|
+ _v = 0
|
|
|
+ return _v
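+# Sketch: getSimLevel encodes presence/equality of two values on a small ordinal
+# scale -- 2 (both empty), 4 (only the first empty), 6 (only the second empty),
+# 10 (equal and non-empty), 0 (non-empty but different); the feature matrix
+# below divides it by 10.
+def _demo_getSimLevel():
+    return [getSimLevel("", ""), getSimLevel("", "A"), getSimLevel("A", ""),
+            getSimLevel("A", "A"), getSimLevel("A", "B")]  # -> [2, 4, 6, 10, 0]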
|
|
|
+
|
|
|
+import math
|
|
|
+def featurnCount(_count,max_count=100):
|
|
|
+ return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
|
|
|
+
|
|
|
+def getLength(_str):
|
|
|
+ return len(_str if _str is not None else "")
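+# Sketch: featurnCount squashes a co-occurrence count into [0, 1]; it is 0 for a
+# zero count and then decays roughly as 1/sqrt(count-1), so rarer combinations
+# contribute larger feature values (max_count is currently unused).
+def _demo_featurnCount():
+    return [featurnCount(c) for c in (0, 1, 2, 5, 10)]  # -> [0.0, 1.0, 1.0, 0.5, 0.333...]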
|
|
|
+
|
|
|
+
|
|
|
+@annotate("string->bigint")
|
|
|
+class f_get_min_counts(object):
|
|
|
+
|
|
|
+
|
|
|
+ def evaluate(self,json_context):
|
|
|
+ _context = json.loads(json_context)
|
|
|
+
|
|
|
+ min_counts = 100
|
|
|
+
|
|
|
+ for item in _context:
|
|
|
+ if item["counts"]<min_counts:
|
|
|
+ min_counts = item["counts"]
|
|
|
+ return min_counts
|
|
|
+
|
|
|
+
|
|
|
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
|
|
|
+class f_merge_featureMatrix(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global logging,json
|
|
|
+
|
|
|
+ def process(self,json_context,tenderee_less,tenderee_greater,
|
|
|
+ agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
+ win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
|
+ bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
|
|
|
+ if not check_columns(tenderee_less,tenderee_greater,
|
|
|
+ agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
|
|
|
+ win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
|
|
|
+ bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
|
|
|
+ return
|
|
|
+
|
|
|
+ _context = json.loads(json_context)
|
|
|
+
|
|
|
+ min_counts = 100
|
|
|
+
|
|
|
+ dict_context = {}
|
|
|
+ for item in _context:
|
|
|
+ if item["counts"]<min_counts:
|
|
|
+ min_counts = item["counts"]
|
|
|
+ dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
|
|
|
+ context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
|
|
|
+ list_matrix = []
|
|
|
+ for index_i in range(len(context_key)):
|
|
|
+ for index_j in range(index_i+1,len(context_key)):
|
|
|
+ _key = "%s&%s"%(context_key[index_i],context_key[index_j])
|
|
|
+ _v = featurnCount(dict_context.get(_key,[0,0])[1])
|
|
|
+ list_matrix.append(_v)
|
|
|
+ context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
|
|
|
+ for index_i in range(len(context3_key)):
|
|
|
+ for index_j in range(index_i+1,len(context3_key)):
|
|
|
+ for index_k in range(index_j+1,len(context3_key)):
|
|
|
+ _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
|
|
|
+ _v = featurnCount(dict_context.get(_key,[0,0])[1])
|
|
|
+ list_matrix.append(_v)
|
|
|
+ list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
|
|
|
+ list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
|
|
|
+ list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
|
|
|
+ list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
|
|
|
+ list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
|
|
|
+ list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
|
|
|
+ list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
|
|
|
+ list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
|
|
|
+
|
|
|
+ # set_tenderer = set()
|
|
|
+ # if tenderee_less is not None and tenderee_less!="":
|
|
|
+ # set_tenderer.add(tenderee_less)
|
|
|
+ # if tenderee_greater is not None and tenderee_greater!="":
|
|
|
+ # set_tenderer.add(tenderee_greater)
|
|
|
+ #
|
|
|
+ # set_win_tenderer = set()
|
|
|
+ # if win_tenderer_less is not None and win_tenderer_less!="":
|
|
|
+ # set_win_tenderer.add(win_tenderer_less)
|
|
|
+ # if win_tenderer_greater is not None and win_tenderer_greater!="":
|
|
|
+ # set_win_tenderer.add(win_tenderer_greater)
|
|
|
+ #
|
|
|
+ # set_bidding_budget = set()
|
|
|
+ # if bidding_budget_less is not None and bidding_budget_less!="":
|
|
|
+ # set_bidding_budget.add(bidding_budget_less)
|
|
|
+ # if bidding_budget_greater is not None and bidding_budget_greater!="":
|
|
|
+ # set_bidding_budget.add(bidding_budget_greater)
|
|
|
+ #
|
|
|
+ # set_win_bid_price = set()
|
|
|
+ # if win_bid_price_less is not None and win_bid_price_less!="":
|
|
|
+ # set_win_bid_price.add(win_bid_price_less)
|
|
|
+ # if win_bid_price_greater is not None and win_bid_price_greater!="":
|
|
|
+ # set_win_bid_price.add(win_bid_price_greater)
|
|
|
+
|
|
|
+ json_matrix = json.dumps(list_matrix)
|
|
|
+
|
|
|
+ same_project_code = False
|
|
|
+ if project_code_less==project_code_greater and getLength(project_code_less)>0:
|
|
|
+ same_project_code = True
|
|
|
+
|
|
|
+ same_project_name = False
|
|
|
+ if project_name_less==project_name_greater and getLength(project_name_less)>0:
|
|
|
+ same_project_name = True
|
|
|
+
|
|
|
+ same_doctitle_refine = False
|
|
|
+ if doctitle_refine_less==doctitle_refine_greater and getLength(doctitle_refine_less)>0:
|
|
|
+ same_doctitle_refine = True
|
|
|
+
|
|
|
+ same_tenderee = False
|
|
|
+ if tenderee_less==tenderee_greater and getLength(tenderee_less)>0:
|
|
|
+ same_tenderee = True
|
|
|
+
|
|
|
+ same_agency = False
|
|
|
+ if agency_less==agency_greater and getLength(agency_less)>0:
|
|
|
+ same_agency = True
|
|
|
+
|
|
|
+ same_bidding_budget = False
|
|
|
+ if bidding_budget_less==bidding_budget_greater and getLength(bidding_budget_less)>0:
|
|
|
+ same_bidding_budget = True
|
|
|
+
|
|
|
+ same_win_tenderer = False
|
|
|
+ if win_tenderer_less==win_tenderer_greater and getLength(win_tenderer_less)>0:
|
|
|
+ same_win_tenderer = True
|
|
|
+
|
|
|
+ same_win_bid_price = False
|
|
|
+ if win_bid_price_less==win_bid_price_greater and getLength(win_bid_price_less)>0:
|
|
|
+ same_win_bid_price = True
|
|
|
+
|
|
|
+ contain_doctitle = False
|
|
|
+ if getLength(doctitle_refine_less)>0 and getLength(doctitle_refine_greater)>0 and (doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
|
|
|
+ contain_doctitle = True
|
|
|
+
|
|
|
+ contain_project_name = False
|
|
|
+ if getLength(project_name_less)>0 and getLength(project_name_greater)>0 and (project_name_less in project_name_greater or project_name_greater in project_name_less):
|
|
|
+ contain_project_name = True
|
|
|
+
|
|
|
+
|
|
|
+ total_money_less = (0 if getLength(bidding_budget_less)==0 else float(bidding_budget_less)) + (0 if getLength(win_bid_price_less)==0 else float(win_bid_price_less))
|
|
|
+ total_money_greater = (0 if getLength(bidding_budget_greater)==0 else float(bidding_budget_greater)) + (0 if getLength(win_bid_price_greater)==0 else float(win_bid_price_greater))
|
|
|
+
|
|
|
+
|
|
|
+ if min_counts<10:
|
|
|
+ _prob = 0.9
|
|
|
+ if same_project_code and same_win_tenderer and same_tenderee:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_project_name and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_doctitle_refine and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_name and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_doctitle_refine and same_win_bid_price and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_doctitle_refine and same_bidding_budget and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_doctitle_refine and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_project_code and same_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_project_code and same_doctitle_refine:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_bidding_budget and same_project_code:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_bidding_budget and same_doctitle_refine:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_tenderee and same_bidding_budget and same_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_doctitle_refine and same_project_code and same_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+
|
|
|
+ if min_counts<=5:
|
|
|
+ _prob = 0.8
|
|
|
+ if same_project_code and same_tenderee:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_win_tenderer:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_name and same_project_code:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_doctitle_refine:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if total_money_less==total_money_greater and total_money_less>100000:
|
|
|
+ if same_win_tenderer and (same_win_bid_price or same_bidding_budget):
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_bidding_budget:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_project_code and same_win_bid_price:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+ if same_bidding_budget and same_win_bid_price and (contain_project_name or contain_doctitle):
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+
|
|
|
+
|
|
|
+ if min_counts<=3:
|
|
|
+ _prob = 0.7
|
|
|
+ if same_project_name or same_project_code or same_doctitle_refine or contain_doctitle or contain_project_name:
|
|
|
+ self.forward(json_matrix,_prob)
|
|
|
+ return
|
|
|
+
|
|
|
+ self.forward(json_matrix,0)
|
|
|
+
|
|
|
+
|
|
|
+class MergePredictor():
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ self.input_size = 46
|
|
|
+ self.output_size = 2
|
|
|
+ self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
|
|
|
+ self.bias = np.array([16.79706382751465, -13.713337898254395])
|
|
|
+ # self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
|
|
|
+
|
|
|
+ def activation(self,vec,_type):
|
|
|
+ if _type=="relu":
|
|
|
+ _vec = np.array(vec)
|
|
|
+ return _vec*(_vec>0)
|
|
|
+ if _type=="tanh":
|
|
|
+ return np.tanh(vec)
|
|
|
+ if _type=="softmax":
|
|
|
+ _vec = np.array(vec)
|
|
|
+ _exp = np.exp(_vec)
|
|
|
+ return _exp/np.sum(_exp)
|
|
|
+
|
|
|
+ def predict(self,input):
|
|
|
+ _out = self.activation(self.activation(np.matmul(np.array(input).reshape(-1,self.input_size),self.matrix)+self.bias,"tanh"),"softmax")
|
|
|
+ # print(self.model.predict(np.array(input).reshape(-1,46)))
|
|
|
+ return _out
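+# Sketch: MergePredictor is a hand-rolled dense layer (46 inputs -> 2 outputs)
+# with tanh followed by softmax; the hard-coded weights appear to come from the
+# Keras model referenced in the commented-out load_model call (an assumption).
+# The demo assumes numpy can be loaded the same way f_getMergeProb loads it.
+def _demo_merge_predictor():
+    global np
+    import numpy as np  # inside MaxCompute this comes from include_package_path("numpy-1.18.zip")
+    mp = MergePredictor()
+    features = [0.0] * mp.input_size
+    return mp.predict(features)[0][1]  # probability that the pair should be merged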
|
|
|
+
|
|
|
+@annotate('string,double -> double')
|
|
|
+class f_getMergeProb(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ include_package_path("numpy-1.18.zip")
|
|
|
+ import numpy as np
|
|
|
+ global json,np
|
|
|
+ self.mp = MergePredictor()
|
|
|
+
|
|
|
+
|
|
|
+ def process(self,json_matrix,pre_prob):
|
|
|
+ if not pre_prob>0.5:
|
|
|
+ _matrix = json.loads(json_matrix)
|
|
|
+ _prob = self.mp.predict(_matrix)[0][1]
|
|
|
+ else:
|
|
|
+ _prob = pre_prob
|
|
|
+ if _prob>0.5:
|
|
|
+ self.forward(float(_prob))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+@annotate('string -> bigint,bigint')
|
|
|
+class f_check_remerge_channel(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Unpack multiple groups into one output record per group
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,json_remerge):
|
|
|
+ if json_remerge is not None:
|
|
|
+ list_group = json.loads(json_remerge)
|
|
|
+ for _group in list_group:
|
|
|
+ _keys = _group.get("data").keys()
|
|
|
+ if len(_keys)>0:
|
|
|
+ main_docid = int(list(_keys)[0])
|
|
|
+ for k,v in _group.get("data",{}).items():
|
|
|
+ self.forward(main_docid,int(k))
|
|
|
+ for _v in v:
|
|
|
+ self.forward(main_docid,int(_v))
|
|
|
+
|
|
|
+@annotate('string -> bigint,bigint')
|
|
|
+class f_check_remerge(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Unpack multiple groups into one output record per group
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,json_remerge):
|
|
|
+ if json_remerge is not None:
|
|
|
+ list_group = json.loads(json_remerge)
|
|
|
+ for _group in list_group:
|
|
|
+ for _docid in _group:
|
|
|
+ self.forward(_group[-1],_docid)
|
|
|
+
|
|
|
+def getConfidence(rule_id):
|
|
|
+ if rule_id >=1 and rule_id <=20:
|
|
|
+ return 30
|
|
|
+ elif rule_id>=31 and rule_id<=50:
|
|
|
+ return 20
|
|
|
+ else:
|
|
|
+ return 10
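+# Sketch: getConfidence maps a merge rule id to a coarse confidence weight
+# (rule ids 1-20 -> 30, 31-50 -> 20, anything else -> 10).
+def _demo_getConfidence():
+    return [getConfidence(5), getConfidence(40), getConfidence(60)]  # -> [30, 20, 10]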
|
|
|
+
|
|
|
+@annotate('string,bigint -> bigint,bigint,bigint')
|
|
|
+class f_arrange_group_single(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Unpack multiple groups into one output record per group
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,json_set_docid,rule_id):
|
|
|
+ if json_set_docid is not None:
|
|
|
+ list_group = json.loads(json_set_docid)
|
|
|
+ for _group in list_group:
|
|
|
+ for index_i in range(len(_group)):
|
|
|
+ for index_j in range(len(_group)):
|
|
|
+ # if index_i!=index_j and _group[index_i]!=_group[index_j]:
|
|
|
+ if index_i!=index_j:
|
|
|
+ self.forward(_group[index_i],_group[index_j],getConfidence(rule_id))
|
|
|
+
|
|
|
+@annotate('bigint,bigint->string')
|
|
|
+class f_get_merge_docids(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ global json
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [set()]
|
|
|
+
|
|
|
+ def iterate(self, buffer,docid1,docid2):
|
|
|
+ buffer[0].add(docid1)
|
|
|
+ buffer[0].add(docid2)
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0] |= pbuffer[0]
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ set_docid = buffer[0]
|
|
|
+ list_docid = list(set_docid)
|
|
|
+ list_docid.sort(key=lambda x:x)
|
|
|
+ list_docid_str = []
|
|
|
+ for _docid in list_docid:
|
|
|
+ list_docid_str.append(str(_docid))
|
|
|
+ return ",".join(list_docid_str)
|
|
|
+
|
|
|
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
|
|
|
+class f_encode_time(object):
|
|
|
+
|
|
|
+
|
|
|
+ def evaluate(self,time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release):
|
|
|
+ _dict = {"time_bidclose":time_bidclose,"time_bidopen":time_bidopen,"time_bidstart":time_bidstart,
|
|
|
+ "time_commencement":time_commencement,"time_completion":time_completion,"time_earnest_money_end":time_earnest_money_end,
|
|
|
+ "time_earnest_money_start":time_earnest_money_start,"time_get_file_end":time_get_file_end,"time_get_file_start":time_get_file_start,
|
|
|
+ "time_publicity_end":time_publicity_end,"time_publicity_start":time_publicity_start,"time_registration_end":time_registration_end,
|
|
|
+ "time_registration_start":time_registration_start,"time_release":time_release}
|
|
|
+ _encode = json.dumps(_dict)
|
|
|
+
|
|
|
+ return _encode
|
|
|
+
|
|
|
+@annotate('string,string -> string,string')
|
|
|
+class f_decode_ruwei(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self, page_time,sub_docs_json):
|
|
|
+ if sub_docs_json is not None:
|
|
|
+ for sub_docs in json.loads(sub_docs_json):
|
|
|
+ if sub_docs.get("win_tenderer","")!="":
|
|
|
+ self.forward(page_time,sub_docs.get("win_tenderer",""))
|
|
|
+ if sub_docs.get("second_tenderer","")!="":
|
|
|
+ self.forward(page_time,sub_docs.get("second_tenderer",""))
|
|
|
+ if sub_docs.get("third_tenderer","")!="":
|
|
|
+ self.forward(page_time,sub_docs.get("third_tenderer",""))
|
|
|
+
|
|
|
+@annotate('string,string -> bigint,string')
|
|
|
+class f_get_docid_uuid(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self, uuid,docids):
|
|
|
+ log("%s-%s"%(str(uuid),str(docids)))
|
|
|
+ if docids is not None and docids!="":
|
|
|
+ l_docid = docids.split(",")
|
|
|
+ for _docid in l_docid:
|
|
|
+ try:
|
|
|
+ self.forward(int(_docid),uuid)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+
|
|
|
+@annotate('string,string->string')
|
|
|
+class f_concat_str(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ global json
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [[]]
|
|
|
+
|
|
|
+ def iterate(self, buffer,_str,concat_str):
|
|
|
+ buffer[0].append([_str,concat_str])
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0])
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ list_str_concat = buffer[0]
|
|
|
+ list_str = [a[0] for a in list_str_concat]
|
|
|
+ concat_str = ","
|
|
|
+ if len(list_str_concat)>0:
|
|
|
+ concat_str = list_str_concat[0][1]
|
|
|
+ return concat_str.join(list_str)
|
|
|
+
|
|
|
+def generate_common_properties(list_docs):
|
|
|
+ '''
|
|
|
+ # Generate the common properties shared across the documents
|
|
|
+ :param list_docs:
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+ # Choose field values by occurrence count (majority vote across documents)
|
|
|
+ choose_dict = {}
|
|
|
+ project_dict = {}
|
|
|
+ for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
|
|
|
+ for _doc in list_docs:
|
|
|
+ _value = _doc.get(_key,"")
|
|
|
+ if _value!="":
|
|
|
+ if _key not in choose_dict:
|
|
|
+ choose_dict[_key] = {}
|
|
|
+ if _value not in choose_dict[_key]:
|
|
|
+ choose_dict[_key][_value] = 0
|
|
|
+ choose_dict[_key][_value] += 1
|
|
|
+
|
|
|
+
|
|
|
+ _find = False
|
|
|
+ for _key in [document_district,document_city,document_province,document_area]:
|
|
|
+ area_dict = {}
|
|
|
+ for _doc in list_docs:
|
|
|
+ loc = _doc.get(_key,"未知")
|
|
|
+ if loc not in ('全国','未知',"0"):
|
|
|
+ if loc not in area_dict:
|
|
|
+ area_dict[loc] = 0
|
|
|
+ area_dict[loc] += 1
|
|
|
+ list_loc = []
|
|
|
+ for k,v in area_dict.items():
|
|
|
+ list_loc.append([k,v])
|
|
|
+ list_loc.sort(key=lambda x:x[1],reverse=True)
|
|
|
+ if len(list_loc)>0:
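+ # NOTE: the location fields below are copied from the last document iterated (_doc), not necessarily from the document matching the most frequent location in list_loc.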
|
|
|
+ project_dict[document_district] = _doc.get(document_district)
|
|
|
+ project_dict[document_city] = _doc.get(document_city)
|
|
|
+ project_dict[document_province] = _doc.get(document_province)
|
|
|
+ project_dict[document_area] = _doc.get(document_area)
|
|
|
+ _find = True
|
|
|
+ break
|
|
|
+ if not _find:
|
|
|
+ if len(list_docs)>0:
|
|
|
+ project_dict[document_district] = list_docs[0].get(document_district)
|
|
|
+ project_dict[document_city] = list_docs[0].get(document_city)
|
|
|
+ project_dict[document_province] = list_docs[0].get(document_province)
|
|
|
+ project_dict[document_area] = list_docs[0].get(document_area)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ for _key,_value in choose_dict.items():
|
|
|
+ _l = []
|
|
|
+ for k,v in _value.items():
|
|
|
+ _l.append([k,v])
|
|
|
+ _l.sort(key=lambda x:x[1],reverse=True)
|
|
|
+ if len(_l)>0:
|
|
|
+ _v = _l[0][0]
|
|
|
+ if _v in ('全国','未知'):
|
|
|
+ if len(_l)>1:
|
|
|
+ _v = _l[1][0]
|
|
|
+ project_dict[_key] = _v
|
|
|
+
|
|
|
+
|
|
|
+ list_dynamics = []
|
|
|
+ docid_number = 0
|
|
|
+ visuable_docids = []
|
|
|
+ zhao_biao_page_time = ""
|
|
|
+ zhong_biao_page_time = ""
|
|
|
+ list_codes = []
|
|
|
+
|
|
|
+ list_product = []
|
|
|
+ p_page_time = ""
|
|
|
+ remove_docids = set()
|
|
|
+ set_nlp_enterprise = set()
|
|
|
+ set_nlp_enterprise_attachment = set()
|
|
|
+ for _doc in list_docs:
|
|
|
+ table_name = _doc.get("table_name")
|
|
|
+ status = _doc.get(document_status,0)
|
|
|
+ _save = _doc.get(document_tmp_save,1)
|
|
|
+ doctitle = _doc.get(document_doctitle,"")
|
|
|
+ docchannel = _doc.get(document_docchannel)
|
|
|
+ page_time = _doc.get(document_page_time,"")
|
|
|
+ _docid = _doc.get(document_docid)
|
|
|
+ _bidway = _doc.get(document_bidway,"")
|
|
|
+ _docchannel = _doc.get(document_life_docchannel,0)
|
|
|
+ project_codes = _doc.get(document_project_codes)
|
|
|
+ product = _doc.get(document_product)
|
|
|
+ sub_docs = _doc.get("sub_docs",[])
|
|
|
+
|
|
|
+ is_multipack = True if len(sub_docs)>1 else False
|
|
|
+ extract_count = _doc.get(document_tmp_extract_count,0)
|
|
|
+
|
|
|
+ try:
|
|
|
+ set_nlp_enterprise |= set(json.loads(_doc.get(document_nlp_enterprise,"[]")))
|
|
|
+ set_nlp_enterprise_attachment |= set(json.loads(_doc.get(document_nlp_enterprise_attachment,"[]")))
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
+
|
|
|
+ if product is not None:
|
|
|
+ list_product.extend(product.split(","))
|
|
|
+
|
|
|
+ if project_codes is not None:
|
|
|
+ _c = project_codes.split(",")
|
|
|
+ list_codes.extend(_c)
|
|
|
+
|
|
|
+ if p_page_time=="":
|
|
|
+ p_page_time = page_time
|
|
|
+
|
|
|
+ if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
|
|
|
+ zhao_biao_page_time = page_time
|
|
|
+ if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
|
|
|
+ zhong_biao_page_time = page_time
|
|
|
+ is_visuable = 0
|
|
|
+ if table_name=="document":
|
|
|
+ if status>=201 and status<=300:
|
|
|
+ docid_number +=1
|
|
|
+ visuable_docids.append(str(_docid))
|
|
|
+ is_visuable = 1
|
|
|
+ else:
|
|
|
+ remove_docids.add(str(_docid))
|
|
|
+ else:
|
|
|
+ if _save==1:
|
|
|
+ docid_number +=1
|
|
|
+ visuable_docids.append(str(_docid))
|
|
|
+ is_visuable = 1
|
|
|
+ else:
|
|
|
+ remove_docids.add(str(_docid))
|
|
|
+ list_dynamics.append({document_docid:_docid,
|
|
|
+ document_doctitle:doctitle,
|
|
|
+ document_docchannel:_docchannel,
|
|
|
+ document_bidway:_bidway,
|
|
|
+ document_page_time:page_time,
|
|
|
+ document_status:201 if is_visuable==1 else 401,
|
|
|
+ "is_multipack":is_multipack,
|
|
|
+ document_tmp_extract_count:extract_count
|
|
|
+ }
|
|
|
+ )
|
|
|
+
|
|
|
+ project_dict[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
|
|
|
+ project_dict[project_docid_number] = docid_number
|
|
|
+ project_dict[project_docids] = ",".join(list(set(visuable_docids)-remove_docids))
|
|
|
+ if zhao_biao_page_time !="":
|
|
|
+ project_dict[project_zhao_biao_page_time] = zhao_biao_page_time
|
|
|
+ if zhong_biao_page_time !="":
|
|
|
+ project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
|
|
|
+ project_dict[project_project_codes] = ",".join(list(set(list_codes)))
|
|
|
+ project_dict[project_page_time] = p_page_time
|
|
|
+ project_dict[project_product] = ",".join(list(set(list_product)))
|
|
|
+ project_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
|
|
|
+ project_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
|
|
|
+
|
|
|
+ return project_dict
|
|
|
+
|
|
|
+
|
|
|
+def generate_packages_properties(list_docs):
|
|
|
+ '''
|
|
|
+ Generate per-package (sub-project) properties
|
|
|
+ :param list_docs:
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+
|
|
|
+ list_properties = []
|
|
|
+ set_key = set()
|
|
|
+ for _doc in list_docs:
|
|
|
+ _dict = {}
|
|
|
+ sub_docs = _doc.get("sub_docs")
|
|
|
+ if sub_docs is not None:
|
|
|
+ for _d in sub_docs:
|
|
|
+ sub_project_code = _d.get(project_sub_project_code,"")
|
|
|
+ sub_project_name = _d.get(project_sub_project_name,"")
|
|
|
+ win_tenderer = _d.get(project_win_tenderer,"")
|
|
|
+ win_bid_price = _d.get(project_win_bid_price,"")
|
|
|
+ _key = "%s-%s-%s-%s"%(sub_project_code,sub_project_name,win_tenderer,win_bid_price)
|
|
|
+ if _key in set_key:
|
|
|
+ continue
|
|
|
+ set_key.add(_key)
|
|
|
+ list_properties.append(_d)
|
|
|
+ return list_properties
|
|
|
+
|
|
|
+def generate_projects(list_docs):
|
|
|
+ '''
|
|
|
+ # Generate projects from the announcement documents
|
|
|
+ :param list_docs:
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+ # Determine the number of bid sections (packages)
|
|
|
+
|
|
|
+ list_projects = []
|
|
|
+
|
|
|
+ project_dict = generate_common_properties(list_docs)
|
|
|
+
|
|
|
+ list_package_properties = generate_packages_properties(list_docs)
|
|
|
+ # Generate package-level records
|
|
|
+ for _pp in list_package_properties:
|
|
|
+ _pp.update(project_dict)
|
|
|
+ list_projects.append(_pp)
|
|
|
+
|
|
|
+ return list_projects
|
|
|
+
|
|
|
+@annotate("string->bigint")
|
|
|
+class totimestamp(object):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import time
|
|
|
+ global time
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ import re
|
|
|
+ global json,logging,re
|
|
|
+ self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def evaluate(self, str_time):
|
|
|
+ try:
|
|
|
+ logging.info(str_time)
|
|
|
+ if str_time is not None and re.search(self.time_pattern,str_time) is not None:
|
|
|
+ timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
|
|
|
+ timeStamp = int(time.mktime(timeArray))
|
|
|
+ return timeStamp
|
|
|
+ else:
|
|
|
+ return 0
|
|
|
+ except Exception as e:
|
|
|
+ return 0
|
|
|
+
|
|
|
+@annotate('bigint,string,string,bigint,string,bigint,string,string,string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
|
|
|
+class f_generate_projects_from_document(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+ self.ToTimeStamp = totimestamp()
|
|
|
+
|
|
|
+
|
|
|
+ def process(self, docid,
|
|
|
+ extract_json,
|
|
|
+ doctitle,
|
|
|
+ save,
|
|
|
+ bidway,
|
|
|
+ status,
|
|
|
+ page_time,
|
|
|
+ info_source,
|
|
|
+ fingerprint,
|
|
|
+ docchannel,
|
|
|
+ life_docchannel,
|
|
|
+ area,
|
|
|
+ province,
|
|
|
+ city,
|
|
|
+ district,
|
|
|
+ sub_docs_json,
|
|
|
+ industry,
|
|
|
+ info_type,
|
|
|
+ qcodes,
|
|
|
+ project_name,
|
|
|
+ project_code,
|
|
|
+ tenderee,
|
|
|
+ tenderee_addr,
|
|
|
+ tenderee_phone,
|
|
|
+ tenderee_contact,
|
|
|
+ agency,
|
|
|
+ agency_phone,
|
|
|
+ agency_contact,
|
|
|
+ procurement_system,
|
|
|
+ project_codes,
|
|
|
+ product,
|
|
|
+ moneysource,
|
|
|
+ time_bidclose,
|
|
|
+ time_bidopen,
|
|
|
+ time_bidstart,
|
|
|
+ time_commencement,
|
|
|
+ time_completion,
|
|
|
+ time_earnest_money_start,
|
|
|
+ time_earnest_money_end,
|
|
|
+ time_get_file_end,
|
|
|
+ time_get_file_start,
|
|
|
+ time_publicity_end,
|
|
|
+ time_publicity_start,
|
|
|
+ time_registration_end,
|
|
|
+ time_registration_start,
|
|
|
+ time_release,
|
|
|
+ extract_count,
|
|
|
+ uuids):
|
|
|
+ attrs_dict = {}
|
|
|
+ _extract = {}
|
|
|
+ try:
|
|
|
+ attrs_dict["sub_docs"] = json.loads(sub_docs_json)
|
|
|
+ _extract = json.loads(extract_json)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+ attrs_dict[document_nlp_enterprise] = json.dumps(_extract.get(document_nlp_enterprise,[]),ensure_ascii=False)
|
|
|
+ attrs_dict[document_nlp_enterprise_attachment] = json.dumps(_extract.get(document_nlp_enterprise_attachment,[]),ensure_ascii=False)
|
|
|
+
|
|
|
+ attrs_dict[document_docid] = docid
|
|
|
+ attrs_dict[document_doctitle] = doctitle
|
|
|
+ attrs_dict[document_tmp_save] = save
|
|
|
+ attrs_dict[document_bidway] = bidway
|
|
|
+ attrs_dict[document_status] = status
|
|
|
+ attrs_dict[document_page_time] = page_time
|
|
|
+ attrs_dict[document_info_source] = info_source
|
|
|
+ attrs_dict[document_fingerprint] = fingerprint
|
|
|
+ attrs_dict[document_docchannel] = docchannel
|
|
|
+ if life_docchannel is not None:
|
|
|
+ attrs_dict[document_life_docchannel] = life_docchannel
|
|
|
+ else:
|
|
|
+ attrs_dict[document_life_docchannel] = docchannel
|
|
|
+ attrs_dict[document_area] = area
|
|
|
+ attrs_dict[document_province] = province
|
|
|
+ attrs_dict[document_city] = city
|
|
|
+ attrs_dict[document_district] = district
|
|
|
+ attrs_dict[document_tmp_sub_docs_json] = sub_docs_json
|
|
|
+ attrs_dict[document_industry] = industry
|
|
|
+ attrs_dict[document_info_type] = info_type
|
|
|
+ attrs_dict[document_qcodes] = qcodes
|
|
|
+ attrs_dict[document_project_name] = project_name
|
|
|
+ attrs_dict[document_project_code] = project_code
|
|
|
+ attrs_dict[document_tenderee] = tenderee
|
|
|
+ attrs_dict[document_tenderee_addr] = tenderee_addr
|
|
|
+ attrs_dict[document_tenderee_phone] = tenderee_phone
|
|
|
+ attrs_dict[document_tenderee_contact] = tenderee_contact
|
|
|
+ attrs_dict[document_agency] = agency
|
|
|
+ attrs_dict[document_agency_phone] = agency_phone
|
|
|
+ attrs_dict[document_agency_contact] = agency_contact
|
|
|
+ attrs_dict[project_procurement_system] = procurement_system
|
|
|
+ attrs_dict[document_project_codes] = project_codes
|
|
|
+ attrs_dict[document_product] = product
|
|
|
+ attrs_dict[document_moneysource] = moneysource
|
|
|
+ attrs_dict[document_time_bidclose] = time_bidclose
|
|
|
+ attrs_dict[document_time_bidopen] = time_bidopen
|
|
|
+ attrs_dict[document_time_bidstart] = time_bidstart
|
|
|
+ attrs_dict[document_time_commencement] = time_commencement
|
|
|
+ attrs_dict[document_time_completion] = time_completion
|
|
|
+ attrs_dict[document_time_earnest_money_start] = time_earnest_money_start
|
|
|
+ attrs_dict[document_time_earnest_money_end] = time_earnest_money_end
|
|
|
+ attrs_dict[document_time_get_file_end] = time_get_file_end
|
|
|
+ attrs_dict[document_time_get_file_start] = time_get_file_start
|
|
|
+ attrs_dict[document_time_publicity_end] = time_publicity_end
|
|
|
+ attrs_dict[document_time_publicity_start] = time_publicity_start
|
|
|
+ attrs_dict[document_time_registration_end] = time_registration_end
|
|
|
+ attrs_dict[document_time_registration_start] = time_registration_start
|
|
|
+ attrs_dict[document_time_release] = time_release
|
|
|
+ attrs_dict[document_tmp_extract_count] = _extract.get(document_tmp_extract_count,0)
|
|
|
+ attrs_dict["table_name"] = "document"
|
|
|
+
|
|
|
+ list_projects = generate_projects([attrs_dict])
|
|
|
+ if len(list_projects)>0:
|
|
|
+ list_projects[0][project_delete_uuid] = uuids if uuids is not None else ""
|
|
|
+
|
|
|
+ log(str(list_projects))
|
|
|
+ for _project in list_projects:
|
|
|
+ _uuid = uuid4().hex
|
|
|
+ docids = _project.get(project_docids,"")
|
|
|
+ page_time = _project.get(project_page_time,"")
|
|
|
+ project_name = _project.get(project_project_name,"")
|
|
|
+ project_codes = _project.get(project_project_codes,"")
|
|
|
+ tenderee = _project.get(project_tenderee,"")
|
|
|
+ agency = _project.get(project_agency,"")
|
|
|
+ bidding_budget = float(_project.get(project_bidding_budget,-1))
|
|
|
+ win_tenderer = _project.get(project_win_tenderer,"")
|
|
|
+ win_bid_price = float(_project.get(project_win_bid_price,-1))
|
|
|
+ product = _project.get(project_product,"")
|
|
|
+ attrs_json = json.dumps(_project,ensure_ascii=False)
|
|
|
+ list_codes = project_codes.split(",")
|
|
|
+ page_time_stamp = self.ToTimeStamp.evaluate(page_time)
|
|
|
+ if len(list_codes)==0:
|
|
|
+ list_codes.append("")
|
|
|
+ list_product = product.split(",")
|
|
|
+ if len(list_product)==0:
|
|
|
+ list_product.append("")
|
|
|
+ for _i in range(min(max(len(list_codes),len(list_product)),20)):
|
|
|
+ _project_code = list_codes[_i%len(list_codes)]
|
|
|
+ _product = list_product[_i%len(list_product)]
|
|
|
+ self.forward(_uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
|
|
|
+
|
|
|
+@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
|
|
|
+class f_generate_projects_from_project(BaseUDTF):
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+ self.ToTimeStamp = totimestamp()
|
|
|
+
|
|
|
+
|
|
|
+ def process(self, uuid,
|
|
|
+ docids,
|
|
|
+ zhao_biao_page_time,
|
|
|
+ zhong_biao_page_time,
|
|
|
+ page_time,
|
|
|
+ area,
|
|
|
+ province,
|
|
|
+ city,
|
|
|
+ district,
|
|
|
+ info_type,
|
|
|
+ industry,
|
|
|
+ qcodes,
|
|
|
+ project_name,
|
|
|
+ project_code,
|
|
|
+ project_codes,
|
|
|
+ project_addr,
|
|
|
+ tenderee,
|
|
|
+ tenderee_addr,
|
|
|
+ tenderee_phone,
|
|
|
+ tenderee_contact,
|
|
|
+ agency,
|
|
|
+ agency_phone,
|
|
|
+ agency_contact,
|
|
|
+ sub_project_name,
|
|
|
+ sub_project_code,
|
|
|
+ bidding_budget,
|
|
|
+ win_tenderer,
|
|
|
+ win_bid_price,
|
|
|
+ win_tenderer_manager,
|
|
|
+ win_tenderer_phone,
|
|
|
+ second_tenderer,
|
|
|
+ second_bid_price,
|
|
|
+ second_tenderer_manager,
|
|
|
+ second_tenderer_phone,
|
|
|
+ third_tenderer,
|
|
|
+ third_bid_price,
|
|
|
+ third_tenderer_manager,
|
|
|
+ third_tenderer_phone,
|
|
|
+ procurement_system,
|
|
|
+ bidway,
|
|
|
+ dup_data,
|
|
|
+ docid_number,
|
|
|
+ project_dynamic,
|
|
|
+ product,
|
|
|
+ moneysource,
|
|
|
+ service_time,
|
|
|
+ time_bidclose,
|
|
|
+ time_bidopen,
|
|
|
+ time_bidstart,
|
|
|
+ time_commencement,
|
|
|
+ time_completion,
|
|
|
+ time_earnest_money_start,
|
|
|
+ time_earnest_money_end,
|
|
|
+ time_get_file_end,
|
|
|
+ time_get_file_start,
|
|
|
+ time_publicity_end,
|
|
|
+ time_publicity_start,
|
|
|
+ time_registration_end,
|
|
|
+ time_registration_start,
|
|
|
+ time_release,
|
|
|
+ dup_docid,
|
|
|
+ info_source,
|
|
|
+ nlp_enterprise,
|
|
|
+ nlp_enterprise_attachment,
|
|
|
+ update_time):
|
|
|
+ attrs_dict = {}
|
|
|
+
|
|
|
+ attrs_dict[project_uuid] = uuid
|
|
|
+ attrs_dict[project_docids] = docids
|
|
|
+ attrs_dict[project_zhao_biao_page_time] = zhao_biao_page_time
|
|
|
+ attrs_dict[project_zhong_biao_page_time] = zhong_biao_page_time
|
|
|
+ attrs_dict[project_page_time] = page_time
|
|
|
+ attrs_dict[project_area] = area
|
|
|
+ attrs_dict[project_province] = province
|
|
|
+ attrs_dict[project_city] = city
|
|
|
+ attrs_dict[project_district] = district
|
|
|
+ attrs_dict[project_info_type] = info_type
|
|
|
+ attrs_dict[project_industry] = industry
|
|
|
+ attrs_dict[project_qcodes] = qcodes
|
|
|
+ attrs_dict[project_project_name] = project_name
|
|
|
+ attrs_dict[project_project_code] = project_code
|
|
|
+ attrs_dict[project_project_codes] = project_codes
|
|
|
+ attrs_dict[project_project_addr] = project_addr
|
|
|
+ attrs_dict[project_tenderee] = tenderee
|
|
|
+ attrs_dict[project_tenderee_addr] = tenderee_addr
|
|
|
+ attrs_dict[project_tenderee_phone] = tenderee_phone
|
|
|
+ attrs_dict[project_tenderee_contact] = tenderee_contact
|
|
|
+ attrs_dict[project_agency] = agency
|
|
|
+ attrs_dict[project_agency_phone] = agency_phone
|
|
|
+ attrs_dict[project_agency_contact] = agency_contact
|
|
|
+ attrs_dict[project_sub_project_name] = sub_project_name
|
|
|
+ attrs_dict[project_sub_project_code] = sub_project_code
|
|
|
+ attrs_dict[project_bidding_budget] = bidding_budget
|
|
|
+ attrs_dict[project_win_tenderer] = win_tenderer
|
|
|
+ attrs_dict[project_win_bid_price] = win_bid_price
|
|
|
+ attrs_dict[project_win_tenderer_manager] = win_tenderer_manager
|
|
|
+ attrs_dict[project_win_tenderer_phone] = win_tenderer_phone
|
|
|
+ attrs_dict[project_second_tenderer] = second_tenderer
|
|
|
+ attrs_dict[project_second_bid_price] = second_bid_price
|
|
|
+ attrs_dict[project_second_tenderer_manager] = second_tenderer_manager
|
|
|
+ attrs_dict[project_second_tenderer_phone] = second_tenderer_phone
|
|
|
+ attrs_dict[project_third_tenderer] = third_tenderer
|
|
|
+ attrs_dict[project_third_bid_price] = third_bid_price
|
|
|
+ attrs_dict[project_third_tenderer_manager] = third_tenderer_manager
|
|
|
+ attrs_dict[project_third_tenderer_phone] = third_tenderer_phone
|
|
|
+ attrs_dict[project_procurement_system] = procurement_system
|
|
|
+ attrs_dict[project_bidway] = bidway
|
|
|
+ attrs_dict[project_dup_data] = dup_data
|
|
|
+ attrs_dict[project_docid_number] = docid_number
|
|
|
+ attrs_dict[project_project_dynamics] = project_dynamic
|
|
|
+ attrs_dict[project_product] = product
|
|
|
+ attrs_dict[project_moneysource] = moneysource
|
|
|
+ attrs_dict[project_service_time] = service_time
|
|
|
+ attrs_dict[project_time_bidclose] = time_bidclose
|
|
|
+ attrs_dict[project_time_bidopen] = time_bidopen
|
|
|
+ attrs_dict[project_time_bidstart] = time_bidstart
|
|
|
+ attrs_dict[project_time_commencement] = time_commencement
|
|
|
+ attrs_dict[project_time_completion] = time_completion
|
|
|
+ attrs_dict[project_time_earnest_money_start] = time_earnest_money_start
|
|
|
+ attrs_dict[project_time_earnest_money_end] = time_earnest_money_end
|
|
|
+ attrs_dict[project_time_get_file_end] = time_get_file_end
|
|
|
+ attrs_dict[project_time_get_file_start] = time_get_file_start
|
|
|
+ attrs_dict[project_time_publicity_end] = time_publicity_end
|
|
|
+ attrs_dict[project_time_publicity_start] = time_publicity_start
|
|
|
+ attrs_dict[project_time_registration_end] = time_registration_end
|
|
|
+ attrs_dict[project_time_registration_start] = time_registration_start
|
|
|
+ attrs_dict[project_time_release] = time_release
|
|
|
+ attrs_dict[project_dup_docid] = dup_docid
|
|
|
+ attrs_dict[project_info_source] = info_source
|
|
|
+ attrs_dict[project_nlp_enterprise] = nlp_enterprise
|
|
|
+ attrs_dict[project_nlp_enterprise_attachment] = nlp_enterprise_attachment
|
|
|
+ attrs_dict[project_update_time] = update_time
|
|
|
+
|
|
|
+
|
|
|
+ popNoneFromDict(attrs_dict)
|
|
|
+
|
|
|
+ attrs_json = json.dumps(attrs_dict,ensure_ascii=False)
|
|
|
+ if bidding_budget is None:
|
|
|
+ bidding_budget = -1
|
|
|
+
|
|
|
+ if win_bid_price is None:
|
|
|
+ win_bid_price = -1
|
|
|
+
|
|
|
+ list_codes = project_codes.split(",")
|
|
|
+ page_time_stamp = self.ToTimeStamp.evaluate(page_time)
|
|
|
+ if len(list_codes)==0:
|
|
|
+ list_codes.append("")
|
|
|
+ list_product = product.split(",")
|
|
|
+ if len(list_product)==0:
|
|
|
+ list_product.append("")
|
|
|
+ for _i in range(min(max(len(list_codes),len(list_product)),20)):
|
|
|
+ _project_code = list_codes[_i%len(list_codes)]
|
|
|
+ _product = list_product[_i%len(list_product)]
|
|
|
+ self.forward(uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
|
|
|
+
|
|
|
+def appendKeyvalueCount(list_projects,keys=[project_tenderee,project_agency,project_win_tenderer,project_win_bid_price,project_bidding_budget,project_product]):
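+ # Count how many of the given key fields carry a value on each project; dumplicate_projects uses this count to sort so that the richest project becomes the merge target.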
|
|
|
+ for _proj in list_projects:
|
|
|
+ _count = 0
|
|
|
+ for k in keys:
|
|
|
+ v = _proj.get(k,"")
|
|
|
+ if isinstance(v,str):
|
|
|
+ if not (v is None or v==""):
|
|
|
+ _count += 1
|
|
|
+ elif isinstance(v,(int,float)):
|
|
|
+ if v>0:
|
|
|
+ _count += 1
|
|
|
+ _proj["keyvaluecount"] = _count
|
|
|
+
|
|
|
+
|
|
|
+def dumplicate_projects(list_projects,b_log=False):
|
|
|
+ '''
|
|
|
+ Deduplicate and merge multi-package projects
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+ appendKeyvalueCount(list_projects)
|
|
|
+ list_projects.sort(key=lambda x:x.get(project_page_time,""))
|
|
|
+ list_projects.sort(key=lambda x:x.get("keyvaluecount",0),reverse=True)
|
|
|
+ cluster_projects = list_projects
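+ # Fold each project into the first already-kept project it is compatible with (check_merge_rule); repeat full passes until a pass produces no merge.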
|
|
|
+ while 1:
|
|
|
+ _update = False
|
|
|
+ list_p = []
|
|
|
+ log("================")
|
|
|
+ for _p in cluster_projects:
|
|
|
+ log("docids:%s"%(_p.get(project_docids,"")))
|
|
|
+
|
|
|
+ for _pp in cluster_projects:
|
|
|
+ _find = False
|
|
|
+ for _p in list_p:
|
|
|
+ if check_merge_rule(_p,_pp,b_log):
|
|
|
+ update_projects_by_project(_pp,[_p])
|
|
|
+ _find = True
|
|
|
+ _update = True
|
|
|
+ break
|
|
|
+ if not _find:
|
|
|
+ list_p.append(_pp)
|
|
|
+
|
|
|
+ if len(cluster_projects)==len(list_p):
|
|
|
+ break
|
|
|
+ cluster_projects = list_p
|
|
|
+ return cluster_projects
|
|
|
+
|
|
|
+def update_projects_by_project(project_dict,projects):
|
|
|
+
|
|
|
+ _dict = {}
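+ # Fields from project_dict are used to fill target fields that are missing or effectively empty ('', '未知', '全国', 0); list-like fields (docids, codes, product, dynamics, nlp_enterprise) are merged separately below.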
|
|
|
+ # Update the common attributes
|
|
|
+ for k,v in project_dict.items():
|
|
|
+ if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment):
|
|
|
+ continue
|
|
|
+ for _proj in projects:
|
|
|
+ if k not in _proj:
|
|
|
+ _dict[k] = v
|
|
|
+ else:
|
|
|
+ _v = _proj.get(k)
|
|
|
+ if type(v)==type(_v):
|
|
|
+ if isinstance(_v,str):
|
|
|
+ if _v in ('',"未知","全国"):
|
|
|
+ _dict[k] = v
|
|
|
+ elif isinstance(_v,(int,float)):
|
|
|
+ if _v==0:
|
|
|
+ _dict[k] = v
|
|
|
+ for _proj in projects:
|
|
|
+ _proj.update(_dict)
|
|
|
+
|
|
|
+ # Concatenate the list-like attributes
|
|
|
+ append_dict = {}
|
|
|
+ set_docid = set()
|
|
|
+ set_product = set()
|
|
|
+ set_code = set()
|
|
|
+ set_uuid = set()
|
|
|
+ set_delete_uuid = set()
|
|
|
+ set_nlp_enterprise = set()
|
|
|
+ set_nlp_enterprise_attachment = set()
|
|
|
+ for _proj in projects:
|
|
|
+ _docids = _proj.get(project_docids,"")
|
|
|
+ _codes = _proj.get(project_project_codes,"")
|
|
|
+ _product = _proj.get(project_product,"")
|
|
|
+ _uuid = _proj.get(project_uuid,"")
|
|
|
+ delete_uuid = _proj.get(project_delete_uuid,"")
|
|
|
+ set_docid = set_docid | set(_docids.split(","))
|
|
|
+ set_code = set_code | set(_codes.split(","))
|
|
|
+ set_product = set_product | set(_product.split(","))
|
|
|
+ set_uuid = set_uuid | set(_uuid.split(","))
|
|
|
+ set_delete_uuid = set_delete_uuid | set(delete_uuid.split(","))
|
|
|
+ try:
|
|
|
+ set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
|
|
|
+ set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+ set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
|
|
|
+ set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
|
|
|
+ set_product = set_product | set(project_dict.get(project_product,"").split(","))
|
|
|
+
|
|
|
+ set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
|
|
|
+ set_delete_uuid = set_delete_uuid | set(project_dict.get(project_delete_uuid,"").split(","))
|
|
|
+
|
|
|
+ try:
|
|
|
+ set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
|
|
|
+ set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+
|
|
|
+ append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
|
|
|
+ append_dict[project_docid_number] = len(set_docid)
|
|
|
+ append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""][:30])
|
|
|
+ append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""][:30])
|
|
|
+ append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
|
|
|
+ append_dict[project_delete_uuid] = ",".join([a for a in list(set_delete_uuid) if a!=""])
|
|
|
+ append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
|
|
|
+ append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
|
|
|
+
|
|
|
+ dict_dynamic = {}
|
|
|
+ set_docid = set()
|
|
|
+ for _proj in projects:
|
|
|
+ _dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
|
|
|
+ for _dy in _dynamic:
|
|
|
+ _docid = _dy.get("docid")
|
|
|
+ dict_dynamic[_docid] = _dy
|
|
|
+ _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
|
|
|
+ for _dy in _dynamic:
|
|
|
+ _docid = _dy.get("docid")
|
|
|
+ dict_dynamic[_docid] = _dy
|
|
|
+ list_dynamics = []
|
|
|
+ for k,v in dict_dynamic.items():
|
|
|
+ list_dynamics.append(v)
|
|
|
+ list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
|
|
|
+
|
|
|
+ append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
|
|
|
+
|
|
|
+ for _proj in projects:
|
|
|
+ _proj.update(append_dict)
|
|
|
+
|
|
|
+def getTimeStamp(page_time):
|
|
|
+ try:
|
|
|
+ return time.mktime(time.strptime(page_time,'%Y-%m-%d'))
|
|
|
+ except Exception as e:
|
|
|
+ return 0
|
|
|
+
|
|
|
+def timeAdd(_time,days):
|
|
|
+ try:
|
|
|
+ a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
|
|
|
+
|
|
|
+ _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
|
|
|
+ return _time1
|
|
|
+ except Exception as e:
|
|
|
+ return None
|
|
|
+
|
|
|
+def check_time_merge(json_time_less,json_time_greater,b_log,set_time_key=set([project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start])):
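+ # Returns -1 if any shared time field differs by more than 2 days, 1 if at least one shared time field agrees within 2 days, 0 when there is nothing to compare.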
|
|
|
+ same_count = 0
|
|
|
+ if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
|
|
|
+ if isinstance(json_time_less,dict):
|
|
|
+ time_less = json_time_less
|
|
|
+ else:
|
|
|
+ time_less = json.loads(json_time_less)
|
|
|
+ if isinstance(json_time_greater,dict):
|
|
|
+ time_greater = json_time_greater
|
|
|
+ else:
|
|
|
+ time_greater = json.loads(json_time_greater)
|
|
|
+ for k,v in time_less.items():
|
|
|
+ if k in set_time_key:
|
|
|
+ if getLength(v)>0:
|
|
|
+ v1 = time_greater.get(k,"")
|
|
|
+ if getLength(v1)>0:
|
|
|
+ _dis = getTimeStamp(v)-getTimeStamp(v1)
|
|
|
+ if _dis>86400*2 or _dis<-86400*2:
|
|
|
+ if b_log:
|
|
|
+ log("check time failed %s-%s-%s"%(str(k),str(v),str(v1)))
|
|
|
+ return -1
|
|
|
+ else:
|
|
|
+ same_count += 1
|
|
|
+ if same_count>0:
|
|
|
+ return 1
|
|
|
+ return 0
|
|
|
+
|
|
|
+def check_product_merge(product,product_to_merge,b_log):
|
|
|
+ #check product
|
|
|
+ set_product = set([a for a in product.split(",") if a!=""])
|
|
|
+ set_product_to_merge = set([a for a in product_to_merge.split(",") if a!=""])
|
|
|
+ if len(set_product)>0 and len(set_product_to_merge)>0:
|
|
|
+ if len(set_product&set_product_to_merge)==0:
|
|
|
+ if b_log:
|
|
|
+ log("check product failed %s===%s"%(str(product),str(product_to_merge)))
|
|
|
+ return -1
|
|
|
+ return 1
|
|
|
+ return 0
|
|
|
+
|
|
|
+
|
|
|
+def check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit):
|
|
|
+ page_time_stamp = getTimeStamp(page_time)
|
|
|
+ page_time_to_merge_stamp = getTimeStamp(page_time_to_merge)
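+ # getTimeStamp returns 0 (not None) on parse failure, so the None guard below always passes; an unparsable page_time is effectively treated as 1970-01-01.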
|
|
|
+ if page_time_stamp is not None and page_time_to_merge_stamp is not None:
|
|
|
+ _dis = max(page_time_stamp,page_time_to_merge_stamp)-min(page_time_stamp,page_time_to_merge_stamp)
|
|
|
+ if _dis>time_limit:
|
|
|
+ if b_log:
|
|
|
+ log("check page_time_dis failed %s===%s"%(str(page_time),str(page_time_to_merge)))
|
|
|
+ return -1
|
|
|
+ if _dis<time_limit//8:
|
|
|
+ return 1
|
|
|
+ return 0
|
|
|
+
|
|
|
+def check_project_name_merge(project_name,project_name_to_merge,b_log):
|
|
|
+ # Project-name similarity check; currently disabled by the unconditional return below, the similarity logic is kept for reference.
|
|
|
+ return 0
|
|
|
+ if len(project_name)>15 and len(project_name_to_merge)>15:
|
|
|
+ _sim = getSimilarityOfString(project_name,project_name_to_merge)
|
|
|
+ if _sim<0.7:
|
|
|
+ if b_log:
|
|
|
+ log("check project_name failed %s===%s"%(str(project_name),str(project_name_to_merge)))
|
|
|
+ return -1
|
|
|
+ return 1
|
|
|
+
|
|
|
+def check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,b_log):
|
|
|
+ if (len(zhong_biao_page_time)>0 and len(zhao_biao_page_time_to_merge)>0 and zhong_biao_page_time<zhao_biao_page_time_to_merge) or (len(zhong_biao_page_time_to_merge)>0 and len(zhao_biao_page_time)>0 and zhong_biao_page_time_to_merge<zhao_biao_page_time):
|
|
|
+ if b_log:
|
|
|
+ log("check zhaobiao zhongbiao page_time failed %s=%s===%s=%s"%(str(zhao_biao_page_time),str(zhong_biao_page_time),str(zhao_biao_page_time_to_merge),str(zhong_biao_page_time_to_merge)))
|
|
|
+ return -1
|
|
|
+ return 1
|
|
|
+
|
|
|
+def check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log):
|
|
|
+ #check sub_project_name
|
|
|
+ _set = set([a for a in [sub_project_name.replace("Project",""),sub_project_name_to_merge.replace("Project","")] if a!=""])
|
|
|
+ if len(_set)>1:
|
|
|
+ if b_log:
|
|
|
+ log("check sub_project_name failed %s===%s"%(str(sub_project_name),str(sub_project_name_to_merge)))
|
|
|
+ return -1
|
|
|
+ return 1
|
|
|
+
|
|
|
+def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log):
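+ # Returns -1 on a conflicting tenderee, agency or win_tenderer; 1 when at least two role values are available across the three roles; 0 otherwise.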
|
|
|
+ _set1 = set([a for a in [tenderee,tenderee_to_merge] if a!=""])
|
|
|
+ if len(_set1)>1:
|
|
|
+ if tenderee in enterprise_to_merge or tenderee_to_merge in enterprise:
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ if getSimilarityOfString(tenderee,tenderee_to_merge)==1:
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ if b_log:
|
|
|
+ log("check tenderee failed %s===%s"%(str(tenderee),str(tenderee_to_merge)))
|
|
|
+ return -1
|
|
|
+ _set2 = set([a for a in [agency,agency_to_merge] if a!=""])
|
|
|
+ if len(_set2)>1:
|
|
|
+ if getSimilarityOfString(agency,agency_to_merge)==1:
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ if b_log:
|
|
|
+ log("check agency failed %s===%s"%(str(agency),str(agency_to_merge)))
|
|
|
+ return -1
|
|
|
+ _set3 = set([a for a in [win_tenderer,win_tenderer_to_merge] if a!=""])
|
|
|
+ if len(_set3)>1:
|
|
|
+ if win_tenderer in enterprise_to_merge or win_tenderer_to_merge in enterprise:
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ if getSimilarityOfString(win_tenderer,win_tenderer_to_merge)==1:
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ if b_log:
|
|
|
+ log("check win_tenderer failed %s===%s"%(str(win_tenderer),str(win_tenderer_to_merge)))
|
|
|
+ return -1
|
|
|
+ if len(_set1)+len(_set2)+len(_set3)>=2:
|
|
|
+ return 1
|
|
|
+ return 0
|
|
|
+
|
|
|
+def check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log):
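+ # Returns -1 on conflicting budgets or win prices, or when the single win price exceeds the budget or is below 30% of it; 1 when both sides share a positive budget or win price; 0 otherwise.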
|
|
|
+ _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
|
|
|
+ if len(_set)>1:
|
|
|
+ if b_log:
|
|
|
+ log("check bidding_budget failed %s===%s"%(str(bidding_budget),str(bidding_budget_to_merge)))
|
|
|
+ return -1
|
|
|
+
|
|
|
+ _set1 = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
|
|
|
+ if len(_set1)>1:
|
|
|
+ if b_log:
|
|
|
+ log("check win_bid_price failed %s===%s"%(str(win_bid_price),str(win_bid_price_to_merge)))
|
|
|
+ return -1
|
|
|
+ #check money
|
|
|
+
|
|
|
+ if len(_set)==1 and len(_set1)==1:
|
|
|
+ max_win_bid_price = max(_set1)
|
|
|
+ max_bidding_budget = max(_set)
|
|
|
+ ratio = max_win_bid_price/max_bidding_budget
|
|
|
+ if max_win_bid_price>max_bidding_budget:
|
|
|
+ if b_log:
|
|
|
+ log("check money failed %s===%s"%(str(max(_set1)),str(max(_set))))
|
|
|
+ return -1
|
|
|
+ else:
|
|
|
+ if ratio<0.3:
|
|
|
+ return -1
|
|
|
+ if (bidding_budget>0 and bidding_budget_to_merge>0) or (win_bid_price>0 and win_bid_price_to_merge>0):
|
|
|
+ return 1
|
|
|
+ return 0
|
|
|
+
|
|
|
+def check_project_codes_merge(list_code,list_code_to_merge,b_log):
|
|
|
+ #check project_codes
|
|
|
+ has_same = False
|
|
|
+ has_similar = False
|
|
|
+ for _c in list_code:
|
|
|
+ for _c1 in list_code_to_merge:
|
|
|
+ _simi = getSimilarityOfString(_c,_c1,3)
|
|
|
+ if _simi==1:
|
|
|
+ has_same = True
|
|
|
+ elif _simi>0.6:
|
|
|
+ has_similar = True
|
|
|
+ else:
|
|
|
+ if len(_c)==len(_c1) and len(_c)>8 and _c!=_c1:
|
|
|
+ has_similar = True
|
|
|
+
|
|
|
+ if not has_same and has_similar:
|
|
|
+ if b_log:
|
|
|
+ log("check code failed %s===%s"%(str(list_code),str(list_code_to_merge)))
|
|
|
+ return -1
|
|
|
+ if has_same:
|
|
|
+ return 1
|
|
|
+ return 0
|
|
|
+
|
|
|
+def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=False):
|
|
|
+ page_time = _proj.get(project_page_time,"")
|
|
|
+ project_codes = _proj.get(project_project_codes,"")
|
|
|
+ project_name = _proj.get(project_project_name,"")
|
|
|
+ tenderee = _proj.get(project_tenderee,"")
|
|
|
+ agency = _proj.get(project_agency,"")
|
|
|
+ product = _proj.get(project_product,"")
|
|
|
+ sub_project_name = _proj.get(project_sub_project_name,"")
|
|
|
+ bidding_budget = float(_proj.get(project_bidding_budget,-1))
|
|
|
+ win_tenderer = _proj.get(project_win_tenderer,"")
|
|
|
+ win_bid_price = float(_proj.get(project_win_bid_price,-1))
|
|
|
+ project_code = _proj.get(project_project_code,"")
|
|
|
+ zhao_biao_page_time = _proj.get(project_zhao_biao_page_time,"")
|
|
|
+ zhong_biao_page_time = _proj.get(project_zhong_biao_page_time,"")
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ enterprise = _proj.get("enterprise")
|
|
|
+ if enterprise is None:
|
|
|
+ try:
|
|
|
+ enterprise = set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
|
|
|
+ enterprise |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
|
|
|
+ _proj["enterprise"] = enterprise
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
+
|
|
|
+ list_code = [a for a in project_codes.split(",") if a!='']
|
|
|
+ if project_code!="":
|
|
|
+ list_code.append(project_code)
|
|
|
+ list_code = [a for a in list_code if a is not None]
|
|
|
+
|
|
|
+ page_time_to_merge = _dict.get(project_page_time,"")
|
|
|
+ project_codes_to_merge = _dict.get(project_project_codes,"")
|
|
|
+ project_name_to_merge = _dict.get(project_project_name,"")
|
|
|
+ tenderee_to_merge = _dict.get(project_tenderee,"")
|
|
|
+ agency_to_merge = _dict.get(project_agency,"")
|
|
|
+ product_to_merge = _dict.get(project_product,"")
|
|
|
+ sub_project_name_to_merge = _dict.get(project_sub_project_name,"")
|
|
|
+ bidding_budget_to_merge = float(_dict.get(project_bidding_budget,-1))
|
|
|
+ win_tenderer_to_merge = _dict.get(project_win_tenderer,"")
|
|
|
+ win_bid_price_to_merge = float(_dict.get(project_win_bid_price,-1))
|
|
|
+ project_code_to_merge = _dict.get(project_project_code,"")
|
|
|
+
|
|
|
+ zhao_biao_page_time_to_merge = _dict.get(project_zhao_biao_page_time,"")
|
|
|
+ zhong_biao_page_time_to_merge = _dict.get(project_zhong_biao_page_time,"")
|
|
|
+
|
|
|
+ list_code_to_merge = [a for a in project_codes_to_merge.split(",") if a!='']
|
|
|
+ if project_code_to_merge!="":
|
|
|
+ list_code_to_merge.append(project_code_to_merge)
|
|
|
+
|
|
|
+ list_code_to_merge = [a for a in list_code_to_merge if a is not None]
|
|
|
+
|
|
|
+
|
|
|
+ enterprise_to_merge = _dict.get("enterprise")
|
|
|
+ if enterprise_to_merge is None:
|
|
|
+ try:
|
|
|
+ enterprise_to_merge = set(json.loads(_dict.get(project_nlp_enterprise,"[]")))
|
|
|
+ enterprise_to_merge |= set(json.loads(_dict.get(project_nlp_enterprise_attachment,"[]")))
|
|
|
+ _dict["enterprise"] = enterprise_to_merge
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
+
|
|
|
+
|
|
|
+ check_dict = {0:0,1:0,-1:0}
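+ # Each sub-check votes -1 (conflict), 0 (no evidence) or 1 (support); a -1 from any of the hard checks below (zhao/zhong page time, money, roles, codes, product) rejects the merge immediately.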
|
|
|
+
|
|
|
+ _zhaozhong_check = check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,b_log)
|
|
|
+ check_dict[_zhaozhong_check] += 1
|
|
|
+ if check_dict[-1]>0:
|
|
|
+ if return_prob:
|
|
|
+ return False,0
|
|
|
+ return False
|
|
|
+
|
|
|
+ _money_check = check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log)
|
|
|
+ check_dict[_money_check] += 1
|
|
|
+ if check_dict[-1]>0:
|
|
|
+ if return_prob:
|
|
|
+ return False,0
|
|
|
+ return False
|
|
|
+
|
|
|
+ _roles_check = check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log)
|
|
|
+ check_dict[_roles_check] += 1
|
|
|
+ if check_dict[-1]>0:
|
|
|
+ if return_prob:
|
|
|
+ return False,0
|
|
|
+ return False
|
|
|
+
|
|
|
+ _codes_check = check_project_codes_merge(list_code,list_code_to_merge,b_log)
|
|
|
+ check_dict[_codes_check] += 1
|
|
|
+ if check_dict[-1]>0:
|
|
|
+ if return_prob:
|
|
|
+ return False,0
|
|
|
+ return False
|
|
|
+
|
|
|
+ _product_check = check_product_merge(product,product_to_merge,b_log)
|
|
|
+ check_dict[_product_check] += 1
|
|
|
+ if check_dict[-1]>0:
|
|
|
+ if return_prob:
|
|
|
+ return False,0
|
|
|
+ return False
|
|
|
+
|
|
|
+ _time_check = check_time_merge(_proj,_dict,b_log)
|
|
|
+ check_dict[_time_check] += 1
|
|
|
+
|
|
|
+ _sub_project_name_check = check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log)
|
|
|
+ check_dict[_sub_project_name_check] += 1
|
|
|
+
|
|
|
+ _project_name_check = check_project_name_merge(project_name,project_name_to_merge,b_log)
|
|
|
+ check_dict[_project_name_check] += 1
|
|
|
+
|
|
|
+ _page_time_check = check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit)
|
|
|
+ check_dict[_page_time_check] += 1
|
|
|
+
|
|
|
+ _prob = check_dict[1]/(check_dict[-1]+check_dict[0]+check_dict[1])
|
|
|
+ if check_dict[-1]>0:
|
|
|
+ if check_dict[-1]==1:
|
|
|
+ if (_codes_check==1 and _roles_check==1 and _product_check==1) or (_roles_check==1 and _money_check==1 and _product_check==1):
|
|
|
+ if return_prob:
|
|
|
+ return True,_prob
|
|
|
+ return True
|
|
|
+ if return_prob:
|
|
|
+ return False,0
|
|
|
+ return False
|
|
|
+
|
|
|
+ if return_prob:
|
|
|
+ return True,_prob
|
|
|
+ return True
|
|
|
+
|
|
|
+@annotate('string,bigint,string->string')
|
|
|
+class f_group_merge_projects(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ global json
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [[]]
|
|
|
+
|
|
|
+ def iterate(self, buffer,_uuid,page_time_stamp,attrs_json):
|
|
|
+ buffer[0].append([_uuid,page_time_stamp,attrs_json])
|
|
|
+ buffer[0] = buffer[0][:1000]
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0][:1000])
|
|
|
+ buffer[0] = buffer[0][:1000]
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ set_uuid = set()
|
|
|
+ list_data = []
|
|
|
+ log("111:\n%s"%(str(buffer)))
|
|
|
+ for _uuid,page_time_stamp,attrs_json in buffer[0]:
|
|
|
+ if _uuid in set_uuid:
|
|
|
+ continue
|
|
|
+ try:
|
|
|
+ attrs = json.loads(attrs_json)
|
|
|
+ list_data.append([_uuid,page_time_stamp,attrs])
|
|
|
+ set_uuid.add(_uuid)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+ list_group_data = []
|
|
|
+ list_group = split_with_time(list_data,1)
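+ # split_with_time buckets the candidates by page_time_stamp; within each bucket (capped at 50 entries) every pair is tested with check_merge_rule and the matching uuid pairs are collected.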
|
|
|
+
|
|
|
+ for _group in list_group:
|
|
|
+ list_group_pair = []
|
|
|
+ _group = _group[:50]
|
|
|
+ for _i in range(len(_group)):
|
|
|
+ for _j in range(_i+1,len(_group)):
|
|
|
+ _p_uuid,_,_p = _group[_i]
|
|
|
+ _pp_uuid,_,_pp = _group[_j]
|
|
|
+ if check_merge_rule(_p,_pp,True):
|
|
|
+ list_group_pair.append([_p_uuid,_pp_uuid])
|
|
|
+ if len(list_group_pair)>0:
|
|
|
+ list_group_data.append(list_group_pair)
|
|
|
+
|
|
|
+ return json.dumps(list_group_data)
|
|
|
+
|
|
|
+@annotate('string -> string,string')
|
|
|
+class f_extract_uuid_groups(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Split multiple groups into multiple records
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,json_groups):
|
|
|
+ if json_groups is not None:
|
|
|
+ list_group = json.loads(json_groups)
|
|
|
+ for l_group in list_group:
|
|
|
+ for _group in l_group:
|
|
|
+ self.forward(_group[0],_group[1])
|
|
|
+ self.forward(_group[1],_group[0])
|
|
|
+
|
|
|
+@annotate('string,string->string')
|
|
|
+class f_group_uuids(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ global json
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [[]]
|
|
|
+
|
|
|
+ def iterate(self, buffer,uuid_1,uuid_2):
|
|
|
+ buffer[0].append([uuid_1,uuid_2])
|
|
|
+ buffer[0] = buffer[0][:1000]
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0][:1000])
|
|
|
+ buffer[0] = buffer[0][:1000]
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ set_uuid = set()
|
|
|
+ for uuid_1,uuid_2 in buffer[0]:
|
|
|
+ set_uuid.add(uuid_1)
|
|
|
+ set_uuid.add(uuid_2)
|
|
|
+
|
|
|
+ list_uuid = list(set_uuid)
|
|
|
+ list_uuid.sort(key=lambda x:x)
|
|
|
+
|
|
|
+ return ",".join(list_uuid)
|
|
|
+
|
|
|
+@annotate('string -> string,string')
|
|
|
+class f_extract_union_group(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Split multiple groups into multiple records
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,str_uuids):
|
|
|
+ if str_uuids is not None:
|
|
|
+ list_uuid = [a for a in str_uuids.split(",") if a!=""]
|
|
|
+ if len(list_uuid)>0:
|
|
|
+ for i in range(len(list_uuid)):
|
|
|
+ for j in range(i,len(list_uuid)):
|
|
|
+ self.forward(list_uuid[i],list_uuid[j])
|
|
|
+ self.forward(list_uuid[j],list_uuid[i])
|
|
|
+
|
|
|
+@annotate('string -> string,string')
|
|
|
+class f_extract_group_uuids(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Split multiple groups into multiple records
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,str_uuids):
|
|
|
+ if str_uuids is not None:
|
|
|
+ list_uuid = [a for a in str_uuids.split(",") if a!=""]
|
|
|
+ if len(list_uuid)>0:
|
|
|
+ main_uuid = list_uuid[0]
|
|
|
+ for _uuid in list_uuid:
|
|
|
+ self.forward(main_uuid,_uuid)
|
|
|
+
|
|
|
+class MyEncoder(json.JSONEncoder):
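+ # JSON encoder for numpy arrays/floats and bytes; assumes numpy is imported as np elsewhere in this module.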
|
|
|
+ def default(self, obj):
|
|
|
+ if isinstance(obj, np.ndarray):
|
|
|
+ return obj.tolist()
|
|
|
+ elif isinstance(obj, bytes):
|
|
|
+ return str(obj, encoding='utf-8')
|
|
|
+ elif isinstance(obj, (np.float_, np.float16, np.float32,
|
|
|
+ np.float64)):
|
|
|
+ return float(obj)
|
|
|
+ elif isinstance(obj,str):
|
|
|
+ return obj
|
|
|
+ return json.JSONEncoder.default(self, obj)
|
|
|
+
|
|
|
+def to_project_json(projects):
|
|
|
+
|
|
|
+ list_proj = []
|
|
|
+ for _proj in projects:
|
|
|
+ _uuid = _proj.get(project_uuid,"")
|
|
|
+ if "enterprise" in _proj:
|
|
|
+ _proj.pop("enterprise")
|
|
|
+ list_uuid = [a for a in _uuid.split(",") if a!=""]
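+ # The first uuid becomes keep_uuid (the surviving project row); the remaining uuids are listed in delete_uuid.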
|
|
|
+ if len(list_uuid)>0:
|
|
|
+ _proj["keep_uuid"] = list_uuid[0]
|
|
|
+ _proj["delete_uuid"] = ",".join(list_uuid[1:])
|
|
|
+ else:
|
|
|
+ _proj["keep_uuid"] = _proj.get("keep_uuid","")
|
|
|
+ _proj["delete_uuid"] = _proj.get("delete_uuid","")
|
|
|
+ list_proj.append(_proj)
|
|
|
+ if project_uuid in _proj:
|
|
|
+ _proj.pop(project_uuid)
|
|
|
+ return json.dumps(list_proj,cls=MyEncoder,ensure_ascii=False)
|
|
|
+
|
|
|
+def get_page_time_dis(page_time,n_page_time):
|
|
|
+ _dis = -1
|
|
|
+ try:
|
|
|
+ page_time_stamp = time.mktime(time.strptime(page_time,'%Y-%m-%d'))
|
|
|
+ n_page_time_stamp = time.mktime(time.strptime(n_page_time,'%Y-%m-%d'))
|
|
|
+ _dis = (max(page_time_stamp,n_page_time_stamp)-min(page_time_stamp,n_page_time_stamp))//86400
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+
|
|
|
+ return _dis
|
|
|
+
|
|
|
+def check_page_time_dup(page_time,n_page_time):
|
|
|
+ _dis = get_page_time_dis(page_time,n_page_time)
|
|
|
+ if _dis>=0 and _dis<=10:
|
|
|
+ return True
|
|
|
+ return False
|
|
|
+
|
|
|
+
|
|
|
+def dumplicate_document_in_merge(list_projects):
|
|
|
+ '''
|
|
|
+ Deduplicate documents while merging projects
|
|
|
+ :param list_projects:
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+
|
|
|
+ for _proj in list_projects:
|
|
|
+ try:
|
|
|
+ dict_channel_proj = {}
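+ # Deduplicate visible documents per docchannel when their page_time is within 10 days: for single-package pairs keep the higher extract_count (smaller docid on a tie); otherwise keep the document already recorded for that channel.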
|
|
|
+ _project_dynamics = _proj.get(project_project_dynamics,"[]")
|
|
|
+ list_dynamics = json.loads(_project_dynamics)
|
|
|
+ set_dup_docid = set()
|
|
|
+ _time = time.time()
|
|
|
+ for _d in list_dynamics:
|
|
|
+ docid = _d.get(document_docid)
|
|
|
+ _status = _d.get(document_status,201)
|
|
|
+ is_multipack = _d.get("is_multipack",True)
|
|
|
+ extract_count = _d.get(document_tmp_extract_count,0)
|
|
|
+ docchannel = _d.get(document_docchannel,0)
|
|
|
+ page_time = _d.get(document_page_time,"")
|
|
|
+ if _status>=201 and _status<=300 and docchannel>0:
|
|
|
+ if docchannel in dict_channel_proj:
|
|
|
+ n_d = dict_channel_proj[docchannel]
|
|
|
+ n_docid = n_d.get(document_docid)
|
|
|
+ n_is_multipack = n_d.get("is_multipack",True)
|
|
|
+ n_extract_count = n_d.get(document_tmp_extract_count,0)
|
|
|
+ n_page_time = n_d.get(document_page_time,"")
|
|
|
+ if docid==n_docid:
|
|
|
+ continue
|
|
|
+ if not check_page_time_dup(page_time,n_page_time):
|
|
|
+ continue
|
|
|
+ if not is_multipack and not n_is_multipack:
|
|
|
+ if extract_count>n_extract_count:
|
|
|
+ set_dup_docid.add(str(n_docid))
|
|
|
+ dict_channel_proj[docchannel] = _d
|
|
|
+ elif extract_count==n_extract_count:
|
|
|
+ if int(n_docid)>int(docid):
|
|
|
+ set_dup_docid.add(str(n_docid))
|
|
|
+ dict_channel_proj[docchannel] = _d
|
|
|
+ elif int(n_docid)<int(docid):
|
|
|
+ set_dup_docid.add(str(docid))
|
|
|
+ else:
|
|
|
+ set_dup_docid.add(str(docid))
|
|
|
+ else:
|
|
|
+ dict_channel_proj[docchannel] = _d
|
|
|
+
|
|
|
+ docids = _proj.get(project_docids,"")
|
|
|
+ set_docids = set([a for a in docids.split(",") if a!=""])
|
|
|
+ set_docids = set_docids-set_dup_docid
|
|
|
+ if len(set_docids)==0:
|
|
|
+ log("projects set_docids length is zero %s"%(docids))
|
|
|
+ else:
|
|
|
+ _proj[project_docids] = ",".join(list(set_docids))
|
|
|
+ _proj[project_docid_number] = len(set_docids)
|
|
|
+ _proj[project_dup_docid] = ",".join(list(set_dup_docid))
|
|
|
+ log("dumplicate_document docid%s dynamic %d takes%.3f"%(str(docid),len(list_dynamics),time.time()-_time))
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
+
|
|
|
+@annotate('string,string->string')
|
|
|
+class f_dumplicate_projects(BaseUDAF):
|
|
|
+ '''
|
|
|
+ Merge a group into a single record
|
|
|
+ '''
|
|
|
+ def __init__(self):
|
|
|
+ import json
|
|
|
+ import sys
|
|
|
+ global json,sys
|
|
|
+
|
|
|
+ def new_buffer(self):
|
|
|
+ return [[]]
|
|
|
+
|
|
|
+ def iterate(self, buffer,_uuid,attrs_json):
|
|
|
+ buffer[0].append([_uuid,attrs_json])
|
|
|
+ buffer[0] = buffer[0][:1000]
|
|
|
+
|
|
|
+ def merge(self, buffer, pbuffer):
|
|
|
+ buffer[0].extend(pbuffer[0][:1000])
|
|
|
+ buffer[0] = buffer[0][:1000]
|
|
|
+
|
|
|
+ def terminate(self, buffer):
|
|
|
+ set_uuid = set()
|
|
|
+ list_data = []
|
|
|
+ for uuid_1,attrs_json in buffer[0]:
|
|
|
+ if uuid_1 in set_uuid:
|
|
|
+ continue
|
|
|
+ list_data.append(json.loads(attrs_json))
|
|
|
+ set_uuid.add(uuid_1)
|
|
|
+
|
|
|
+ list_projects = dumplicate_projects(list_data,True)
|
|
|
+ dumplicate_document_in_merge(list_projects)
|
|
|
+
|
|
|
+ log("===========2")
|
|
|
+ project_json = to_project_json(list_projects)
|
|
|
+
|
|
|
+ return project_json
|
|
|
+
|
|
|
+@annotate('string -> string')
|
|
|
+class f_generate_project_with_attrs_json(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Split multiple groups into multiple records
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,attrs_json):
|
|
|
+ if attrs_json is not None:
|
|
|
+ _group = json.loads(attrs_json)
|
|
|
+ self.forward(json.dumps([_group],ensure_ascii=False))
|
|
|
+
|
|
|
+@annotate('string -> string')
|
|
|
+class f_generate_project_with_delete_uuid(BaseUDTF):
|
|
|
+ '''
|
|
|
+ Split multiple groups into multiple records
|
|
|
+ '''
|
|
|
+
|
|
|
+ def __init__(self):
|
|
|
+ import logging
|
|
|
+ import json
|
|
|
+ global json,logging
|
|
|
+ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
+
|
|
|
+ def process(self,delete_uuid):
|
|
|
+ if delete_uuid is not None:
|
|
|
+ _group = {project_delete_uuid:delete_uuid,
|
|
|
+ "to_delete":True}
|
|
|
+ self.forward(json.dumps([_group],ensure_ascii=False))
|
|
|
+
|
|
|
+def test_remerge():
|
|
|
+ a = f_remege_limit_num_contain_bychannel()
|
|
|
+ buffer = a.new_buffer()
|
|
|
+ tmp_s = '''
|
|
|
+ 266523906 266539038 2022-09-08 1662566400 SDGP371525000202201000421_A 冠县第二实验小学平台教育信息化设备采购智慧屏 冠县第二实验小学平台教育信息化设备采购智慧屏成交公告 冠县第二实验小学平台教育信息化设备智慧屏 冠县第二实验小学 聊城市采购中心 山东润博网络有限公司 246890.0 101 0 12 "{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
|
|
|
+ 266523906 266523906 2022-09-15 1663171200 SDGP371525000202201000421_A 冠县第二实验小学平台教育信息化设备采购智慧屏 冠县第二实验小学平台教育信息化设备采购智慧屏成交公告 冠县第二实验小学平台教育信息化设备智慧屏 冠县第二实验小学 聊城市采购中心 山东润博网络有限公司 246890.0 101 999 12 "{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
|
|
|
+
|
|
|
+ '''
|
|
|
+ for _s in tmp_s.split("\n"):
|
|
|
+ ls = _s.split("\t")
|
|
|
+ if len(ls)!=17:
|
|
|
+ continue
|
|
|
+ _confid = 1 if ls[14] =="" else ls[14]
|
|
|
+ a.iterate(buffer,ls[1],ls[13],int(ls[3]),ls[8],ls[10],ls[11],ls[12],ls[7],ls[5],ls[4],_confid,ls[15],ls[16][1:-1])
|
|
|
+ # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
|
|
|
+ # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
|
|
|
+ # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
|
|
|
+ print(a.terminate(buffer))
|
|
|
+ print(1)
|
|
|
+
|
|
|
+ print(getSimilarityOfString('37168100014015220220012_40785671','SDGP371681000202201000912'))
|
|
|
+
|
|
|
+
|
|
|
+def test_merge_rule():
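+    # Builds two hand-written project records (o_a from a single result
+    # announcement, o_b from a contract notice plus a result announcement)
+    # and prints what check_merge_rule decides for the pair.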
|
|
|
+ o_a = {
|
|
|
+ "bidding_budget":2022,
|
|
|
+ "bidding_budget_unit":"",
|
|
|
+ "second_bid_price":0,
|
|
|
+ "second_bid_price_unit":"",
|
|
|
+ "second_service_time":"",
|
|
|
+        "second_tenderer":"丹江口市金智恒贸易有限公司",
|
|
|
+ "sub_project_code":"",
|
|
|
+ "sub_project_name":"Project",
|
|
|
+ "win_bid_price":4950,
|
|
|
+ "win_bid_price_unit":"万元",
|
|
|
+ "win_service_time":"",
|
|
|
+ "win_tenderer":"丹江口市方谊电脑网络有限公司",
|
|
|
+ "win_tenderer_manager":"汤蕙冰",
|
|
|
+ "win_tenderer_phone":"07195232489",
|
|
|
+ "district":"丹江口",
|
|
|
+ "city":"十堰",
|
|
|
+ "province":"湖北",
|
|
|
+ "area":"华中",
|
|
|
+ "industry":"通用设备",
|
|
|
+ "info_type":"计算机设备",
|
|
|
+ "info_source":"政府采购",
|
|
|
+ "qcodes": "",
|
|
|
+ "project_name":"丹江口市交通运输局财务专用电脑采购",
|
|
|
+ "project_code":"丹采计备【2022】XY0002号",
|
|
|
+ "tenderee":"丹江口市交通运输局",
|
|
|
+ "tenderee_addr": "",
|
|
|
+ "tenderee_phone":"0719-5222536",
|
|
|
+ "agency":"丹江口市交通运输局",
|
|
|
+ "agency_phone":"0719-5222536",
|
|
|
+ "procurement_system":"交通系统",
|
|
|
+ "time_bidopen":"2022-04-02",
|
|
|
+ "extract_count":0,
|
|
|
+ "project_dynamic":"[{\"docid\": 230964885, \"doctitle\": \"丹江口市交通运输局财务专用电脑采购中标(成交)结果公告\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2022-04-03\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 0}]",
|
|
|
+ "docid_number":1,
|
|
|
+ "docids":"230964885",
|
|
|
+ "zhong_biao_page_time":"2022-04-03",
|
|
|
+ "project_codes":"2022001,BJ2022040280753,丹采计备【2022】XY0002号",
|
|
|
+ "page_time":"2022-04-03",
|
|
|
+ "product":"躁魉鼙锼鹅缝,交通运输躅台式电脑舍,台式计算机(强制节能),财务专用电脑,台式电脑,办公设备",
|
|
|
+ "nlp_enterprise":"[]",
|
|
|
+ "nlp_enterprise_attachment":"[]",
|
|
|
+ "delete_uuid":"5aa174e2-859b-4ea9-8d64-5f2174886084",
|
|
|
+ "keyvaluecount":6,
|
|
|
+ "dup_docid":"",
|
|
|
+ "keep_uuid":""
|
|
|
+ }
|
|
|
+ o_b = {
|
|
|
+ "bidding_budget":0,
|
|
|
+ "bidding_budget_unit":"",
|
|
|
+ "sub_project_code":"",
|
|
|
+ "sub_project_name":"Project",
|
|
|
+ "win_bid_price":4950,
|
|
|
+ "win_bid_price_unit":"万元",
|
|
|
+ "win_service_time":"",
|
|
|
+ "win_tenderer":"丹江口市方谊电脑网络有限公司",
|
|
|
+ "district":"丹江口",
|
|
|
+ "city":"十堰",
|
|
|
+ "province":"湖北",
|
|
|
+ "area":"华中",
|
|
|
+ "industry":"通用设备",
|
|
|
+ "info_type":"计算机设备",
|
|
|
+ "info_source":"工程建设",
|
|
|
+ "qcodes": "",
|
|
|
+ "project_name":"丹江口市交通运输局财务专用电脑采购",
|
|
|
+ "project_code":"丹采计备【2022】XY0002号",
|
|
|
+ "tenderee":"丹江口市交通运输局",
|
|
|
+ "tenderee_addr": "",
|
|
|
+ "tenderee_phone":"07195222536",
|
|
|
+ "tenderee_contact":"洪书梅",
|
|
|
+ "agency":"丹江口市交通运输局",
|
|
|
+ "agency_phone":"07195222536",
|
|
|
+ "agency_contact":"洪书梅",
|
|
|
+ "procurement_system":"交通系统",
|
|
|
+ "time_bidopen":"2022-04-02",
|
|
|
+ "extract_count":0,
|
|
|
+ "project_dynamic":"[{\"docid\": 232857494, \"doctitle\": \"丹江口市交通运输局交通运输局财务专用电脑采购合同公告\", \"docchannel\": 120, \"bidway\": \"询价\", \"page_time\": \"2022-04-12\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 0}, {\"docid\": 234180491, \"doctitle\": \"丹江口市交通运输局财务专用电脑采购中标(成交)结果公告\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2022-04-19\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 0}]",
|
|
|
+ "docid_number":2,
|
|
|
+ "docids":"232857494,234180491",
|
|
|
+ "zhong_biao_page_time":"2022-04-19",
|
|
|
+ "project_codes":"2022001,丹采计备【2022】XY0002号,20220402271923",
|
|
|
+ "page_time":"2022-04-19",
|
|
|
+ "product":"财务专用电脑,台式电脑",
|
|
|
+ "nlp_enterprise":"[]",
|
|
|
+ "nlp_enterprise_attachment":"[]",
|
|
|
+ "delete_uuid":"b2a2594c-764d-46c2-9717-80307b63937c",
|
|
|
+ "keyvaluecount":5,
|
|
|
+ "win_tenderer_manager":"",
|
|
|
+ "win_tenderer_phone":"13329854499",
|
|
|
+ "bidway":"询价",
|
|
|
+ "time_release":"2022-04-12",
|
|
|
+ "dup_docid":"",
|
|
|
+ "keep_uuid":""
|
|
|
+ }
|
|
|
+
|
|
|
+ print(check_merge_rule(o_a,o_b,True))
|
|
|
+
|
|
|
+if __name__ == '__main__':
|
|
|
+ test_merge_rule()
|