
Merge remote-tracking branch 'origin/master'

znj 3 years ago
parent
commit
4e56c2005f

+ 1 - 0
BiddingKG/dl/common/Utils.py

@@ -42,6 +42,7 @@ def getLazyLoad():
 
 
 
+
 model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
 model_word = None
 lock_model_word = RLock()

+ 0 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -1144,7 +1144,6 @@ def segment(soup,final=True):
             LOOP_BEGIN += LOOP_LEN
         text = _text
 
-
     return text
 
 '''

BIN
BiddingKG/dl/interface/channel_savedmodel/channel.pb


BIN
BiddingKG/dl/interface/channel_savedmodel/doctype.pb


+ 14 - 7
BiddingKG/dl/interface/extract.py

@@ -3,6 +3,7 @@ Created on January 4, 2019
 
 @author: User
 '''
+import os
 
 from bs4 import BeautifulSoup, Comment
 import copy
@@ -15,6 +16,7 @@ import time
 
 _time1 = time.time()
 sys.path.append(os.path.abspath("../.."))
+
 from BiddingKG.dl.common.Utils import *
 import BiddingKG.dl.entityLink.entityLink as entityLink
 import BiddingKG.dl.interface.predictor as predictor
@@ -128,12 +130,17 @@ def test(name,content):
 
 if __name__=="__main__":
     import pandas as pd
+
     df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')
-    # for i in range(50):
-    i = 246
-    doc_id = df.loc[i, 'docid']
-    text = df.loc[i, 'dochtmlcon']
-    title = df.loc[i, 'doctitle']
-    rs = predict(doc_id,text,title)
-    print(rs)
+    new_prem = []
+    for i in range(len(df)):
+        # i = 246
+        doc_id = df.loc[i, 'docid']
+        text = df.loc[i, 'dochtmlcon']
+        title = df.loc[i, 'doctitle']
+        rs = predict(doc_id,text,title)
+        # print(rs)
+        new_prem.append(rs)
+    df['new_prem'] = pd.Series(new_prem)
+    df.to_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0813.xlsx')
     pass
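
Note: in the new batch loop above, df['new_prem'] = pd.Series(new_prem) lines up with the rows by position only while df keeps its default RangeIndex (which df.loc[i, ...] over range(len(df)) already assumes). A minimal, safer variant of that assignment, using the same names as the diff:

    import pandas as pd

    # pin the predictions to the DataFrame's own index so a filtered or
    # re-sorted df cannot silently misalign rows
    df['new_prem'] = pd.Series(new_prem, index=df.index)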

+ 1 - 0
BiddingKG/dl/interface/predictor.py

@@ -1050,6 +1050,7 @@ class FormPredictor():
             return self.model_form_context.predict(form_datas)
         else:
             return self.getModel(type).predict(form_datas)
+
     
 
 #Role rules

+ 1 - 1
BiddingKG/dl/role/train.py

@@ -283,7 +283,7 @@ def generate_data():
 
 if __name__=="__main__":
     # loadTrainData()
-    # train()
+    train()
     # relabel()
     # generate_data()
     test()

+ 2 - 0
BiddingKG/dl/test/12.py

@@ -0,0 +1,2 @@
+
+import filetype
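
Note: the new test stub only imports the library so far. A minimal sketch of what it would presumably exercise, using filetype's documented guess() entry point (the sample path is hypothetical):

    import filetype

    kind = filetype.guess("some_attachment.bin")  # hypothetical sample file
    if kind is None:
        print("cannot determine file type")
    else:
        print(kind.extension, kind.mime)  # e.g. 'pdf', 'application/pdf'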

+ 62 - 7
BiddingKG/maxcompute/attachmentRec.py

@@ -193,13 +193,68 @@ class f_getRandomStr(object):
     def __init__(self):
         import random
         global random
+        self.result_s = ""
+
 
     def evaluate(self,count):
-        result_s = ""
-        for i in range(count):
-            result_s+= "a"
-        for i in range(5):
-            index = random.randint(0,len(result_s)-1)
-            result_s[index] = 'b'
-        return result_s
+
+        if self.result_s=="":
+            list_c = [chr(ord('a')+i) for i in range(26)]
+            result_s = ""
+            for i in range(count):
+                index = random.randint(0,len(list_c)-1)
+                result_s += list_c[index]
+            self.result_s = result_s
+        for i in range(count//200):
+            index = random.randint(0,len(self.result_s)-1)
+            index_1 = random.randint(0,len(self.result_s)-1)
+            self.result_s = self.result_s[:index]+self.result_s[index_1:index_1+1]+self.result_s[index+1:]
+        return self.result_s
+
+@annotate('string->string')
+class f_extract_pageAttachments(BaseUDTF):
+
+    def __init__(self):
+        include_package_path("envs_py37.env.zip")
+        import json
+        from uuid import uuid4
+        from bs4 import BeautifulSoup
+        import logging
+        global json,BeautifulSoup
+
+    def process(self,_html):
+        if _html is not None:
+            page_attachments = self.extract_pageAttachments(_html)
+            if len(page_attachments)>0:
+                self.forward(json.dumps(page_attachments,ensure_ascii=False))
+
+    def extract_pageAttachments(self,_html):
+        fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"]
+        _soup = BeautifulSoup(_html,"lxml")
+        list_a = _soup.find_all("a")
+        list_img = _soup.find_all("img")
+        page_attachments = []
+        for _a in list_a:
+            _text = _a.get_text()
+            _url = _a.attrs.get("href","")
+            if _url.find("http://www.bidizhaobiao.com")>=0:
+                continue
+            is_attach = False
+            for suf in fileSuffix:
+                if _text.find(suf)>=0 or _url.find(suf)>=0:
+                    is_attach = True
+            if is_attach:
+                page_attachments.append({"fileLink":_url,"fileTitle":_text})
+        for _a in list_img:
+            _text = _a.get_text()
+            _url = _a.attrs.get("src","")
+            if _url.find("http://www.bidizhaobiao.com")>=0:
+                continue
+            is_attach = False
+            for suf in fileSuffix:
+                if _text.find(suf)>=0 or _url.find(suf)>=0:
+                    is_attach = True
+            if is_attach:
+                page_attachments.append({"fileLink":_url,"fileTitle":_text})
+        return page_attachments
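
Note: the two loops above over list_a and list_img differ only in which attribute carries the URL (href vs src). A behavior-preserving sketch of a shared helper; the name _collect is invented, not part of this commit:

    def _collect(tags, url_attr, fileSuffix, skip_host="http://www.bidizhaobiao.com"):
        # shared suffix-matching logic for both <a href=...> and <img src=...>
        attachments = []
        for _tag in tags:
            _text = _tag.get_text()
            _url = _tag.attrs.get(url_attr, "")
            if _url.find(skip_host) >= 0:
                continue
            if any(_text.find(suf) >= 0 or _url.find(suf) >= 0 for suf in fileSuffix):
                attachments.append({"fileLink": _url, "fileTitle": _text})
        return attachments

    # page_attachments = _collect(list_a, "href", fileSuffix) + _collect(list_img, "src", fileSuffix)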
 

+ 5 - 2
BiddingKG/maxcompute/documentDumplicate.py

@@ -44,8 +44,11 @@ class f_decode_extract(BaseUDTF):
         web_source_no = _other.get("webSourceNo","")
         docchannel = _other.get("docchannel",0)
         if re.search(self.time_pattern,page_time) is not None:
-            timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
-            page_time_stamp = int(time.mktime(timeArray))
+            try:
+                timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
+                page_time_stamp = int(time.mktime(timeArray))
+            except Exception as e:
+                pass
         list_code = _extract.get("code",[])
         if len(list_code)>0:
             project_code = list_code[0]
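
Note: the new try/except is warranted even though page_time already matched self.time_pattern (the pattern itself is not shown in this hunk): time.strptime raises ValueError whenever the [:11] slice carries characters beyond the date itself. A small demonstration with hypothetical inputs:

    import time

    for page_time in ["2021-06-15T", "2021-6-5 08"]:
        try:
            time.strptime(page_time[:11], "%Y-%m-%d")
        except ValueError as e:
            print(page_time, "->", e)  # e.g. "unconverted data remains: T"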

+ 323 - 15
BiddingKG/maxcompute/documentMerge.py

@@ -1,7 +1,78 @@
 #coding:UTF8
+
+
 from odps.udf import annotate
-from odps.udf import BaseUDAF
-from odps.udf import BaseUDTF
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF,BaseUDAF
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+import json
+
+
+def log(msg):
+    logging.info(msg)
+
+
+# Configure the pandas dependency package
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+
+    return os.path.dirname(dir_names[0])
+
+# A RuntimeError like "xxx has been blocked by sandbox" may occur here;
+# libraries containing C extensions get blocked by the sandbox, which can be lifted with: set odps.isolation.session.enable = true
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialize the business data package; because of upload size limits, Python version mismatches, inconsistent archive unpacking and similar issues, it has to be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
+import platform
 
 
 def getSet(list_dict,key):
@@ -188,6 +259,7 @@ class f_remege_limit_num_contain(BaseUDAF):
         contain_keys = ["contain_column1","contain_column2"]
 
         logging.info(the_group)
+        logging.info(str(re_merge)+str(re_merge_sim))
         if re_merge or re_merge_sim:
             the_group.sort(key=lambda x:x["confidence"],reverse=True)
             the_group.sort(key=lambda x:x["page_time_stamp"])
@@ -275,7 +347,7 @@ class f_remege_limit_num_contain(BaseUDAF):
                 #     final_group.append(list(set(_group["docid"])))
         else:
             final_group = [list(set([item["docid"] for item in the_group]))]
-
+        log(str(final_group))
         return json.dumps(final_group)
 
 @annotate('string -> string')
@@ -434,8 +506,17 @@ def check_columns(tenderee_less,tenderee_greater,
     code_sim = getSimilarityOfString(project_code_less,project_code_greater)
     if code_sim>0.6 and code_sim<1:
         return False
+
+    # Same batch but different code numbers
+    if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
+        _split_code_less = project_code_less.split("-")
+        _split_code_greater = project_code_greater.split("-")
+        if len(_split_code_less)>1 and len(_split_code_greater)>1:
+            if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
+                return False
+
     _set_win_tenderer = set()
-    if win_tenderer_less is not None and tenderee_less!="":
+    if win_tenderer_less is not None and win_tenderer_less!="":
         _set_win_tenderer.add(win_tenderer_less)
     if win_tenderer_greater is not None and win_tenderer_greater!="":
         _set_win_tenderer.add(win_tenderer_greater)
@@ -443,18 +524,20 @@ def check_columns(tenderee_less,tenderee_greater,
         return False
     _set_win_bid_price = set()
     if win_bid_price_less is not None and win_bid_price_less!="":
-        _set_win_bid_price.add(win_bid_price_less)
+        _set_win_bid_price.add(float(win_bid_price_less))
     if win_bid_price_greater is not None and win_bid_price_greater!="":
-        _set_win_bid_price.add(win_bid_price_greater)
+        _set_win_bid_price.add(float(win_bid_price_greater))
     if len(_set_win_bid_price)>1:
         return False
     _set_bidding_budget = set()
     if bidding_budget_less is not None and bidding_budget_less!="":
-        _set_bidding_budget.add(bidding_budget_less)
+        _set_bidding_budget.add(float(bidding_budget_less))
     if bidding_budget_greater is not None and bidding_budget_greater!="":
-        _set_bidding_budget.add(bidding_budget_greater)
+        _set_bidding_budget.add(float(bidding_budget_greater))
     if len(_set_bidding_budget)>1:
         return False
+
+
     return True
 
 def getSimLevel(str1,str2):
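
Note on the check_columns changes above: routing both prices through float() makes "100" and "100.0" compare equal where the old string set kept them apart, but it assumes the stored amounts are parseable numbers; and the new same-batch test rejects pairs such as "XJ2021-001" vs "XJ2021-002" that share the prefix before the first "-" while differing overall. A minimal sketch of the float-set idea in isolation (helper name is illustrative):

    def same_or_absent(a, b):
        # mirror the diff: ignore None/empty values, compare the rest numerically
        values = set()
        for v in (a, b):
            if v is not None and v != "":
                values.add(float(v))
        return len(values) <= 1

    assert same_or_absent("100", "100.0")    # equal once cast to float
    assert same_or_absent(None, "3500")      # a missing side is not a conflict
    assert not same_or_absent("100", "200")  # a real mismatch still rejects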
@@ -482,7 +565,26 @@ import math
 def featurnCount(_count,max_count=100):
     return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
 
-@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
+def getLength(_str):
+    return len(_str if _str is not None else "")
+
+
+@annotate("string->bigint")
+class f_get_min_counts(object):
+
+
+    def evaluate(self,json_context):
+        _context = json.loads(json_context)
+
+        min_counts = 100
+
+        for item in _context:
+            if item["counts"]<min_counts:
+                min_counts = item["counts"]
+        return min_counts
+
+
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
 class f_merge_featureMatrix(BaseUDTF):
 
     def __init__(self):
@@ -494,15 +596,20 @@ class f_merge_featureMatrix(BaseUDTF):
                                     agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                                     win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                                     bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
-        # if not check_columns(tenderee_less,tenderee_greater,
-        #                      agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
-        #                      win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
-        #                      bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater)
-        #     return
+        if not check_columns(tenderee_less,tenderee_greater,
+                             agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
+                             win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
+                             bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
+            return
 
         _context = json.loads(json_context)
+
+        min_counts = 100
+
         dict_context = {}
         for item in _context:
+            if item["counts"]<min_counts:
+                min_counts = item["counts"]
             dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
         context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
         list_matrix = []
@@ -526,7 +633,208 @@ class f_merge_featureMatrix(BaseUDTF):
         list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
         list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
         list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
-        self.forward(json.dumps(list_matrix))
+
+        # set_tenderer = set()
+        # if tenderee_less is not None and tenderee_less!="":
+        #     set_tenderer.add(tenderee_less)
+        # if tenderee_greater is not None and tenderee_greater!="":
+        #     set_tenderer.add(tenderee_greater)
+        #
+        # set_win_tenderer = set()
+        # if win_tenderer_less is not None and win_tenderer_less!="":
+        #     set_win_tenderer.add(win_tenderer_less)
+        # if win_tenderer_greater is not None and win_tenderer_greater!="":
+        #     set_win_tenderer.add(win_tenderer_greater)
+        #
+        # set_bidding_budget = set()
+        # if bidding_budget_less is not None and bidding_budget_less!="":
+        #     set_bidding_budget.add(bidding_budget_less)
+        # if bidding_budget_greater is not None and bidding_budget_greater!="":
+        #     set_bidding_budget.add(bidding_budget_greater)
+        #
+        # set_win_bid_price = set()
+        # if win_bid_price_less is not None and win_bid_price_less!="":
+        #     set_win_bid_price.add(win_bid_price_less)
+        # if win_bid_price_greater is not None and win_bid_price_greater!="":
+        #     set_win_bid_price.add(win_bid_price_greater)
+
+        json_matrix = json.dumps(list_matrix)
+
+        same_project_code = False
+        if project_code_less==project_code_greater and getLength(project_code_less)>0:
+            same_project_code = True
+
+        same_project_name = False
+        if project_name_less==project_name_greater and getLength(project_name_less)>0:
+            same_project_name = True
+
+        same_doctitle_refine = False
+        if doctitle_refine_less==doctitle_refine_greater and getLength(doctitle_refine_less)>0:
+            same_doctitle_refine = True
+
+        same_tenderee = False
+        if tenderee_less==tenderee_greater and getLength(tenderee_less)>0:
+            same_tenderee = True
+
+        same_agency = False
+        if agency_less==agency_greater and getLength(agency_less)>0:
+            same_agency = True
+
+        same_bidding_budget = False
+        if bidding_budget_less==bidding_budget_greater and getLength(bidding_budget_less)>0:
+            same_bidding_budget = True
+
+        same_win_tenderer = False
+        if win_tenderer_less==win_tenderer_greater and getLength(win_tenderer_less)>0:
+            same_win_tenderer = True
+
+        same_win_bid_price = False
+        if win_bid_price_less==win_bid_price_greater and getLength(win_bid_price_less)>0:
+            same_win_bid_price = True
+
+        contain_doctitle = False
+        if getLength(doctitle_refine_less)>0 and getLength(doctitle_refine_greater)>0 and (doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
+            contain_doctitle = True
+
+        contain_project_name = False
+        if getLength(project_name_less)>0 and getLength(project_name_greater)>0 and (project_name_less in project_name_greater or project_name_greater in project_name_less):
+            contain_project_name = True
+
+
+        total_money_less = (0 if getLength(bidding_budget_less)==0 else float(bidding_budget_less)) + (0 if getLength(win_bid_price_less)==0 else float(win_bid_price_less))
+        total_money_greater = (0 if getLength(bidding_budget_greater)==0 else float(bidding_budget_greater)) + (0 if getLength(win_bid_price_greater)==0 else float(win_bid_price_greater))
+
+
+        if min_counts<10:
+            _prob = 0.9
+            if same_project_code and same_win_tenderer and same_tenderee:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_project_name and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_doctitle_refine and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_name and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_doctitle_refine and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_doctitle_refine and same_bidding_budget and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_project_code and same_project_name:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_project_code and same_doctitle_refine:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_bidding_budget and same_project_code:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_bidding_budget and same_doctitle_refine:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_bidding_budget and same_project_name:
+                self.forward(json_matrix,_prob)
+                return
+            if same_doctitle_refine and same_project_code and same_project_name:
+                self.forward(json_matrix,_prob)
+                return
+
+        if min_counts<=5:
+            _prob = 0.8
+            if same_project_code and same_tenderee:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_name and same_project_code:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_doctitle_refine:
+                self.forward(json_matrix,_prob)
+                return
+            if total_money_less==total_money_greater and total_money_less>100000:
+                if same_win_tenderer and (same_win_bid_price or same_bidding_budget):
+                    self.forward(json_matrix,_prob)
+                    return
+            if same_project_code and same_bidding_budget:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_win_bid_price:
+                self.forward(json_matrix,_prob)
+                return
+            if same_bidding_budget and same_win_bid_price and (contain_project_name or contain_doctitle):
+                self.forward(json_matrix,_prob)
+                return
+
+
+        if min_counts<=3:
+            _prob = 0.7
+            if same_project_name or same_project_code or same_doctitle_refine or contain_doctitle or contain_project_name:
+                self.forward(json_matrix,_prob)
+                return
+
+        self.forward(json_matrix,0)
+
+
+class MergePredictor():
+
+    def __init__(self):
+        self.input_size = 46
+        self.output_size = 2
+        self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
+        self.bias = np.array([16.79706382751465, -13.713337898254395])
+        # self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+
+    def activation(self,vec,_type):
+        if _type=="relu":
+            _vec = np.array(vec)
+            return _vec*(_vec>0)
+        if _type=="tanh":
+            return np.tanh(vec)
+        if _type=="softmax":
+            _vec = np.array(vec)
+            _exp = np.exp(_vec)
+            return _exp/np.sum(_exp)
+
+    def predict(self,input):
+        _out = self.activation(self.activation(np.matmul(np.array(input).reshape(-1,self.input_size),self.matrix)+self.bias,"tanh"),"softmax")
+        # print(self.model.predict(np.array(input).reshape(-1,46)))
+        return _out
+
+@annotate('string,double -> double')
+class f_getMergeProb(BaseUDTF):
+
+    def __init__(self):
+        import json
+        include_package_path("numpy-1.18.zip")
+        import numpy as np
+        global json,np
+        self.mp = MergePredictor()
+
+
+    def process(self,json_matrix,pre_prob):
+        if not pre_prob>0.5:
+            _matrix = json.loads(json_matrix)
+            _prob = self.mp.predict(_matrix)[0][1]
+        else:
+            _prob = pre_prob
+        if _prob>0.5:
+            self.forward(float(_prob))
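
Note: MergePredictor is a hand-rolled 46-to-2 dense layer (tanh, then softmax), so for a single input row its output is a probability pair. A minimal smoke test, assuming numpy is importable as np at module scope (inside the UDF it arrives via include_package_path):

    import numpy as np

    mp = MergePredictor()
    x = np.zeros(46)           # one all-zero feature vector
    probs = mp.predict(x)      # shape (1, 2)
    print(probs, probs.sum())  # the single row sums to 1.0

The softmax here normalizes over the whole array, which is only correct for one row at a time.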
 
 
 

+ 4 - 3
BiddingKG/maxcompute/documentMerge/train.py

@@ -104,7 +104,7 @@ class MergePredictor():
         self.output_size = 2
         self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
         self.bias = np.array([16.79706382751465, -13.713337898254395])
-        self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+        # self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
 
     def activation(self,vec,_type):
         if _type=="relu":
@@ -118,8 +118,9 @@ class MergePredictor():
             return _exp/np.sum(_exp)
 
     def predict(self,input):
-        print(self.activation(self.activation(np.matmul(np.array(input).reshape(-1,46),self.matrix)+self.bias,"tanh"),"softmax"))
-        print(self.model.predict(np.array(input).reshape(-1,46)))
+        _out = self.activation(self.activation(np.matmul(np.array(input).reshape(-1,self.input_size),self.matrix)+self.bias,"tanh"),"softmax")
+        # print(self.model.predict(np.array(input).reshape(-1,46)))
+        return _out
 
 import tensorflow as tf
 def getVariable():

+ 48 - 0
BiddingKG/maxcompute/extract_check.py

@@ -550,3 +550,51 @@ class f_splitAttach(BaseUDTF):
                 _find.decompose()
             doctextcon = _soup.get_text()
         self.forward(doctextcon,attachmenttextcon)
+
+def getTitleFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"data":filemd5})
+    _title = ""
+    if _find is not None:
+        _title = _find.get_text()
+    return _title
+
+def getSourceLinkFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"filelink":filemd5})
+    filelink = ""
+    if _find is None:
+        _find = _soup.find("img",attrs={"filelink":filemd5})
+        if _find is not None:
+            filelink = _find.attrs.get("src","")
+    else:
+        filelink = _find.attrs.get("href","")
+    return filelink
+
+def turnAttachmentsFromHtml(dochtmlcon,page_attachments):
+    new_attachments = json.loads(page_attachments)
+    for _atta in new_attachments:
+        fileMd5 = _atta.get("fileMd5")
+        if fileMd5 is not None:
+            fileTitle = getTitleFromHtml(fileMd5,dochtmlcon)
+            fileLink = getSourceLinkFromHtml(fileMd5,dochtmlcon)
+            _atta["fileTitle"] = fileTitle
+            _atta["fileLink"] = fileLink
+    print(new_attachments)
+    return json.dumps(new_attachments,ensure_ascii=False)
+
+@annotate('string,string->string')
+class f_turnPageattachments(object):
+
+
+    def evaluate(self,dochtmlcon,page_attachments):
+        new_page_attachments = None
+        if page_attachments is not None:
+            if "fileMd5" in page_attachments:
+                new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
+        return new_page_attachments
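
Note: the gate "fileMd5" in page_attachments is a substring test on the raw JSON string, which is cheap but also fires when the key text merely appears inside a value. A stricter sketch (illustrative, not part of this commit):

    import json

    def has_fileMd5(page_attachments):
        # parse first, then look for the key on the attachment objects themselves
        try:
            return any("fileMd5" in _atta for _atta in json.loads(page_attachments))
        except (ValueError, TypeError):
            return False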
+
+
+

+ 163 - 0
BiddingKG/maxcompute/test.py

@@ -0,0 +1,163 @@
+#coding:utf8
+
+_html = '''
+                <div id="pcontent" class="pcontent"><html><body><div>
+<div>
+   车里垃圾填埋场改造PPP项目消石灰采购招标公告 
+ </div>
+<div>
+   车里垃圾填埋场改造PPP项目消石灰采购招标公告 
+ </div>
+<div>
+   【信息发布时间:2021-06-15 19:27:13】 
+ </div>
+<div>
+<p><span>车里垃圾填埋场改造PPP项目消石灰采购招标公告</span></p>
+<p> </p>
+<p> <span>中节能(福州)环保能源有限公司根据采购工作安排,就下述项目委托北京国电工程招标有限公司组织国内公开招标,现诚邀符合资格要求的潜在投标人参加本项目投标。</span></p>
+<p><span>一、</span><span>标段(包)名称:车里垃圾填埋场改造PPP项目消石灰采购 </span></p>
+<p><span>二、</span><span>标段(包)编号:20210502030124690001001</span></p>
+<p><span>三、</span><span>采购方式:公开招标</span></p>
+<p><span>四、</span><span>招标人(采购组织人):中节能(福州)环保能源有限公司</span></p>
+<p><span>五、</span><span>代理机构:北京国电工程招标有限公司</span></p>
+<p><span>六、</span><span>项目概况:</span></p>
+<p><span>(1)</span><span>资金来源:自筹</span></p>
+<p><span>(2)</span><span>项目概况:本工程为新建项目,日处理生活垃圾1500吨,年处理生活垃圾54.75万吨。配置2×750t/d机械炉排炉+2×15MW次高压中温高转速凝汽式汽轮机配2×18MW发电机组,焚烧线年运行8000小时。</span></p>
+<p><span>七、</span><span>招标货物内容与数量:项目所用消石灰,合同有效期为合同签订后1年。暂定年用量为3500吨,具体以招标人确定的实际用量为准,双方将根据招标人确认的实际用量据实结算,具体范围及要求详见技术规范书。</span></p>
+<p><span>八、</span><span>招标货物主要技术规格要求:详见技术规范书。</span></p>
+<p><span>九、</span><span>招标货物交货期(工期)要求:合同有效期为合同签订后1年。合同签定后,招标人根据实际需求情况通知投标人供货,投标人收到招标人供货通知后3个日历日内完成供货。每批交货数量由招标人临时确定。投标人按时送货到现场,紧急情况时,投标人应24小时内供货到现场。</span></p>
+<p><span>十、</span><span>招标货物交付地点:福州市长乐区航城街道车里垃圾填埋场西侧中节能(福州)环保能源有限公司厂内指定地点。</span></p>
+<p><span>十一、</span><span>对投标人的资格要求:(一)本项目不接受联合体参与。 (二)其他资格要求: 1.投标人须在中华人民共和国境内依法注册并具有独立法人资格,主要经营范围包括本次招标内容。 2.投标人为生产厂家时,需提供相关生产资质。本标段接受代理商,代理商需提供2家及以上所供厂家授权证明; 3.投标人具有良好的商业信誉及健全的财务会计制度,企业运营正常,未处于歇业、被责令停业或破产等非正常状态,且资产未被重组、接管和冻结,须提供2017年度、2018年度和2019年度经会计师事务所审计的财务报告。 4.投标人必须提供生产许可证或提供具有检测资质的实验室出具的消石灰检测报告,化验报告时间为2021年1月1日至今并符合本采购技术指标要求。 5.信誉要求:未列入“国家企业信用信息公示系统”网站(http://www.gsxt.gov.cn)经营异常名录信息和严重违法失信企业名单(黑名单)信息;未列入“信用中国”网站(https://www.creditchina.gov.cn)失信被执行人名单;近三年内投标人或其法定代表人在“中国裁判文书网”(http://wenshu.court.gov.cn/)无行贿犯罪记录。 6.业绩要求:投标人在2017年1月1日(含)至今至少具有1项(含)以上符合技术规范书中所要求的理化性质的消石灰供货业绩(提供合同关键页复印件,包含项目名称、供货范围和签字盖章页等内容,时间以合同签订时间为准)。 7.最高限价:无。 8.其他要求:与招标人存在利害关系可能影响采购公正性的法人、其他组织,不得参加投标;单位负责人为同一人或者存在控股、管理关系的不同单位,不得参加同一标段的采购或者未划分标段的同一采购项目的投标。</span></p>
+<p><span>十二、</span><span>是否允许联合体投标:本标段不接受联合体投标</span></p>
+<p><span>十三、</span><span>资格审查方式:资格后审</span></p>
+<p><span>十四、</span><span>招标文件的获取:</span></p>
+<p><span>(1)</span><span>招标文件获取截止时间:2021年06月15日到2021年06月22日23时59分59秒</span></p>
+<p><span>(2)</span><span>招标文件获取方式:通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)选择本项目进行报名,完成平台服务费缴纳后,直接下载招标文件。</span></p>
+<p><span>十五、</span><span>投标文件的递交:投标人须在投标截止时间前通过中国节能环保集团有限公司电子采购平台投标文件递交菜单(http://www.ebidding.cecep.cn/TPBidder)线上递交投标文件。</span></p>
+<p><span>十六、</span><span>开标时间(暨投标截止时间)及地点:</span></p>
+<p><span>(1)</span><span>开标时间:2021年07月06日 10时00分</span></p>
+<p><span>(2)</span><span>开标地点:通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)在线开标。</span></p>
+<p><span>(3)</span><span>开标方式:通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)在线开标。投标人须持数字证书(CA)在线参加开标会议并进行远程解锁及开标确认操作。</span></p>
+<p><span>十七、投标保证金:</span></p>
+<p><span>(1)</span><span>投标保证金金额:人民币40000.0元</span></p>
+<p><span>(2)</span><span>接收投标保证金账户信息:</span></p>
+<p><span>(3)</span> <span>保证金账号:收款账户名称:中国节能环保集团有限公司绿色供应链管理服务分公司<br/> 开户银行:上海浦东发展银行股份有限公司北京海淀园支行<br/> 收款账号:0154801391398</span></p>
+<p><span>十八、平台操作说明:</span></p>
+<p><span>(1)</span> <span>凡是拟参与中国节能环保集团有限公司电子采购平台投标活动的投标人需先在中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)上</span><span>完成</span><span>注册</span><span>审核后,方可办理在线报名、缴纳平台服务费后获取下载招标文件。</span></p>
+<p><span>(2)</span> <span>投标人须在注册同时提交</span><span>数字证书(CA)办理</span><span>资料</span><span>,</span><span>电子</span><span>采购平台所使用的数字证书(CA)办理方式见中国节能环保集团电子采购平台网站服务指南栏目《中国节能环保集团电子采购平台数字证书办理须知》(http://www.ebidding.cecep.cn/</span><span>)。投标人须</span><span>使用</span><span>《中国节能投标文件制作软件》配合</span><span>数字证书(CA)</span><span>完成投标文件编制上传,并在开标时使用</span><span>数字证书(CA)</span><span>完成开标解锁和开标结果确认等后续环节工作。</span></p>
+<p><span>(3)</span> <span>电子采购平台技术服务热线:400</span><span>-</span><span>928</span><span>-</span><span>0095</span></p>
+<p><span>十九、</span><span>公告发布媒介:本招标公告通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)和中国招标投标公共服务平台(http://www.cebpubservice.com/)对外公开发布。</span></p>
+<p><span>二十、</span><span>联系方式:</span></p>
+<table>
+<tbody>
+<tr>
+<td> <p><span>招标人(采购组织人):中节能(福州)环保能源有限公司</span></p> </td>
+<td> <p><span>代理机构:北京国电工程招标有限公司</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>地址:福州市长乐区航城街道车里垃圾填埋场西侧</span></p> </td>
+<td> <p><span>地址:北京市石景山区银河大街6号院1号楼北楼一层西侧</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>联系人:</span></p> </td>
+<td> <p><span>联系人:袁超</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>电话:</span></p> </td>
+<td> <p><span>电话:010-68777764</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>邮箱:</span></p> </td>
+<td> <p><span>邮箱:yuanchaocweme@163.com</span></p> </td>
+</tr>
+</tbody>
+</table>
+<p> </p>
+<p> </p> 附件: 
+  <a filelink="5b92dbe377ba9e517888105e875298cd" href="http://www.ebidding.cecep.cn/EpointWebBuilder/WebbuilderMIS/attach/downloadZtbAttach.jspx?attachGuid=108a8688-19a0-4ff7-8b22-1e2626ea21e1&amp;appUrlFlag=ztbAttach&amp;siteGuid=7eb5f7f1-9041-43ad-8e13-8fcb82ea831a" title="招标公告.pdf">招标公告.pdf</a>
+<a data="5b92dbe377ba9e517888105e875298cd" href="http://www.bidizhaobiao.com/file/20210615/2021-06-15/DX006570/1623756674672.pdf" style="display:none">招标公告.pdf</a>
+</div>
+<span> 项目概况 </span>
+<ul>
+<li> 采购方式:公开招标</li>
+<li> 资格审查:资格后审</li>
+<li> 文件领取截止时间:2021-06-22</li>
+<li> 文件递交截止时间:2021-07-06 10:00:00</li>
+</ul>
+<span> 公告内容 </span>
+<div>
+   附件: 
+  <a filelink="5b92dbe377ba9e517888105e875298cd" href="http://www.ebidding.cecep.cn/EpointWebBuilder/WebbuilderMIS/attach/downloadZtbAttach.jspx?attachGuid=108a8688-19a0-4ff7-8b22-1e2626ea21e1&amp;appUrlFlag=ztbAttach&amp;siteGuid=7eb5f7f1-9041-43ad-8e13-8fcb82ea831a" title="招标公告.pdf">招标公告.pdf</a>
+<a data="5b92dbe377ba9e517888105e875298cd" href="http://www.bidizhaobiao.com/file/20210615/2021-06-15/DX006570/1623756675715.pdf" style="display:none">招标公告.pdf</a>
+</div>
+<p></p>
+<button onclick="baoming()"> 我要报名 </button>
+<button onclick="zhuce()"> 我要注册 </button>
+</div></body></html>
+<div style="display:none;" class="richTextFetch">
+</div></div>
+                
+      
+'''
+from bs4 import BeautifulSoup
+import json
+def process(dochtmlcon):
+    doctextcon = ""
+    attachmenttextcon = ""
+
+    if dochtmlcon is not None:
+        _soup = BeautifulSoup(dochtmlcon,"lxml")
+
+        _find = _soup.find("div",attrs={"class":"richTextFetch"})
+        if _find is not None:
+            attachmenttextcon = _find.get_text()
+            _find.decompose()
+        doctextcon = _soup.get_text()
+    print(doctextcon)
+    print("==========")
+    print(attachmenttextcon)
+
+def getTitleFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"data":filemd5})
+    _title = ""
+    if _find is not None:
+        _title = _find.get_text()
+    return _title
+
+def getSourceLinkFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"filelink":filemd5})
+    filelink = ""
+    if _find is None:
+        _find = _soup.find("img",attrs={"filelink":filemd5})
+        if _find is not None:
+            filelink = _find.attrs.get("src","")
+    else:
+        filelink = _find.attrs.get("href","")
+    return filelink
+
+def turnAttachmentsFromHtml(dochtmlcon,page_attachments):
+    new_attachments = json.loads(page_attachments)
+    for _atta in new_attachments:
+        fileMd5 = _atta.get("fileMd5")
+        if fileMd5 is not None:
+            fileTitle = getTitleFromHtml(fileMd5,dochtmlcon)
+            fileLink = getSourceLinkFromHtml(fileMd5,dochtmlcon)
+            _atta["fileTitle"] = fileTitle
+            _atta["fileLink"] = fileLink
+    print(new_attachments)
+    return json.dumps(new_attachments,ensure_ascii=False)
+
+def evaluate(dochtmlcon,page_attachments):
+    new_page_attachments = None
+    if page_attachments is not None:
+        if "fileMd5" in page_attachments:
+            new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
+    return new_page_attachments
+
+if __name__=="__main__":
+    # process(_html)
+    print(evaluate(_html,'[{"fileTitle":"招标公告.pdf","fileMd5":"5b92dbe377ba9e517888105e875298cd"},{"fileTitle":"招标公告.pdf","fileMd5":"5b92dbe377ba9e517888105e875298cd"}]'))

+ 0 - 0
BiddingKG/readme.md