Browse Source

云上去重,预处理使用中表格识别使用inItem

luojiehua 3 years ago
parent
commit
f4a98872fc

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (BiddingKG)" />
+      <configuration sdkName="Python 3.5 (dl_nlp)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>
 </module>

+ 5 - 5
BiddingKG/app.py

@@ -198,12 +198,12 @@ class MyProcessor(allspark.BaseProcessor):
 if __name__ == '__main__':
     # parameter worker_threads indicates concurrency of processing
     #本地运行
-    allspark.default_properties().put("rpc.keepalive", 120000)
-
-
-    runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15030")
+    # allspark.default_properties().put("rpc.keepalive", 180000)
+    #
+    #
+    # runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15030")
     #PAI平台运行
-    # runner = MyProcessor()
+    runner = MyProcessor()
 
 
     runner.run()

+ 3 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -8,7 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
-from BiddingKG.dl.table_head.predict import predict
+# from BiddingKG.dl.table_head.predict import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -995,6 +995,8 @@ def tableToText(soup):
         if in_attachment:
             if tbody.name=='table':
                 _tbody = tbody.find('tbody')
+                if _tbody is None:
+                    _tbody = tbody
             else:
                 _tbody = tbody
             _td_len_list = []

+ 1 - 1
BiddingKG/dl/table_head/models/model.py

@@ -2,7 +2,7 @@ import sys
 import os
 import numpy as np
 from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
-from keras_preprocessing.sequence import pad_sequences
+from keras.preprocessing.sequence import pad_sequences
 sys.path.append(os.path.dirname(__file__))
 
 from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \

+ 1 - 1
BiddingKG/dl/table_head/pre_process.py

@@ -1,7 +1,6 @@
 import os
 import random
 import sys
-import psycopg2
 import numpy as np
 sys.path.append(os.path.dirname(__file__) + "/../")
 from common.Utils import embedding_word, embedding_word_forward
@@ -26,6 +25,7 @@ def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
 
 
 def postgresql_util(sql, limit):
+    import psycopg2
     conn = psycopg2.connect(dbname="table_head_label", user="postgres", password="postgres",
                             host="192.168.2.103")
     cursor = conn.cursor()

+ 1 - 1
BiddingKG/dl/test/test4.py

@@ -30,7 +30,7 @@ def test(name,content):
     myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
 
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    _url = "http://192.168.2.102:15030/article_extract"
+    _url = "http://192.168.2.102:8080/article_extract"
     _resp = requests.post(_url, json=user, headers=myheaders, verify=True)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
     resp_json = _resp.content.decode("utf-8")

+ 9 - 7
BiddingKG/extract.app.json

@@ -1,18 +1,20 @@
 {
   "generate_token": "true",
   "metadata": {
-    "cpu": 7,
-    "instance": 4,
-    "memory": 18000,
+    "cpu": 4,
+    "instance": 7,
+    "memory": 11000,
     "region": "cn-hangzhou",
     "resource": "eas-r-9oq7xupatg8yoiyuvk",
     "rpc": {
-      "batching": "true",
-      "keepalive": 60000,
-      "max_batch_size": 40
+      "batching": "false",
+      "keepalive": 180000,
+      "max_queue_size": 100,
+      "io_threads": 4,
+      "worker_threads": 5
     }
   },
-  "workers":7,
+  "workers":5,
   "name": "content_extract",
   "processor_entry": "./BiddingKG/app.py",
   "processor_path": "oss://eas-model-hangzhou/1255640119316927/BiddingKG_eas.zip",

+ 152 - 24
BiddingKG/maxcompute/documentDumplicate.py

@@ -74,7 +74,11 @@ class f_decode_extract(BaseUDTF):
         time_release = _extract.get("time_release")
 
         # docchannel = _other.get("docchannel",0)
-        docchannel = self.dict_channel.get(_extract.get("docchannel",""),0)
+        docchannel_name = _extract.get("docchannel",{}).get("docchannel")
+        doctype_name = _extract.get("docchannel",{}).get("doctype")
+        if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
+            docchannel_name = doctype_name
+        docchannel = self.dict_channel.get(docchannel_name,0)
         if re.search(self.time_pattern,page_time) is not None:
             try:
                 timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
@@ -94,20 +98,36 @@ class f_decode_extract(BaseUDTF):
                 if bidding_budget=="":
                     bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
             for _role in dict_pack[_key]["roleList"]:
-                extract_count += 1
-                if _role[2]!='' and float(_role[2])>0:
+                if isinstance(_role,list):
                     extract_count += 1
-                if _role[0]=="tenderee":
-                    tenderee = _role[1]
-                if _role[0]=="win_tenderer":
-                    if  win_tenderer=="":
-                        win_tenderer = _role[1]
                     if _role[2]!='' and float(_role[2])>0:
                         extract_count += 1
-                        if win_bid_price=="":
-                            win_bid_price = str(float(_role[2]))
-                if _role[0]=="agency":
-                    agency = _role[1]
+                    if _role[0]=="tenderee":
+                        tenderee = _role[1]
+                    if _role[0]=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role[1]
+                        if _role[2]!='' and float(_role[2])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role[2]))
+                    if _role[0]=="agency":
+                        agency = _role[1]
+                if isinstance(_role,dict):
+                    extract_count += 1
+                    if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                        extract_count += 1
+                    if _role["role_name"]=="tenderee":
+                        tenderee = _role["role_text"]
+                    if _role["role_name"]=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role["role_text"]
+                        if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role["role_money"]["money"]))
+                    if _role["role_name"]=="agency":
+                        agency = _role["role_text"]
 
 
         if project_code!="":
@@ -151,25 +171,43 @@ class f_get_extractCount(object):
         win_tenderer = ""
         win_bid_price = ""
         for _key in dict_pack.keys():
-            if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
+            if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
                 extract_count += 1
                 if bidding_budget=="":
                     bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
             for _role in dict_pack[_key]["roleList"]:
-                extract_count += 1
-                if _role[2]!='' and float(_role[2])>0:
+                if isinstance(_role,list):
                     extract_count += 1
-                if _role[0]=="tenderee":
-                    tenderee = _role[1]
-                if _role[0]=="win_tenderer":
-                    if  win_tenderer=="":
-                        win_tenderer = _role[1]
                     if _role[2]!='' and float(_role[2])>0:
                         extract_count += 1
-                        if win_bid_price=="":
-                            win_bid_price = str(float(_role[2]))
-                if _role[0]=="agency":
-                    agency = _role[1]
+                    if _role[0]=="tenderee":
+                        tenderee = _role[1]
+                    if _role[0]=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role[1]
+                        if _role[2]!='' and float(_role[2])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role[2]))
+                    if _role[0]=="agency":
+                        agency = _role[1]
+                if isinstance(_role,dict):
+                    extract_count += 1
+                    if "role_money" in _role:
+                        if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
+                            extract_count += 1
+                    if _role.get("role_name")=="tenderee":
+                        tenderee = _role["role_text"]
+                    if _role.get("role_name")=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role["role_text"]
+                        if "role_money" in _role:
+                            if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                                extract_count += 1
+                                if win_bid_price=="":
+                                    win_bid_price = str(float(_role["role_money"]["money"]))
+                    if _role["role_name"]=="agency":
+                        agency = _role["role_text"]
 
 
         if project_code!="":
@@ -1085,3 +1123,93 @@ class f_is_legal(object):
             return 0
         return 1
 
+@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
+class f_autorule_group(BaseUDAF):
+    '''
+    Re-check a group after dedup merging: when the group has more than 5 members, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
+    when the group has 5 or fewer members, tenderee, win_tenderer and bidding_budget may each take only one value within the group (terminate() also checks win_bid_price in both cases).
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re  # publish the imports as module globals for the other methods; NOTE(review): `global` after the import is a SyntaxError on Python >= 3.6
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]  # single-slot buffer; slot 0 collects one dict per input row
+
+    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
+                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
+        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,  # store the row as a dict keyed by column name
+                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
+                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
+                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
+                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
+                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0][:100])  # take at most 100 rows from the partial buffer
+        buffer[0] = buffer[0][:100]  # cap the merged buffer at 100 rows
+
+    def getSameKeys(self,_dict1,_dict2):
+        list_keys = []
+        for k,v in _dict1.items():
+            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:  # fields excluded from the comparison
+                continue
+            v2 = _dict2.get(k,"")
+            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:  # count a key only when both values are non-empty and equal
+                list_keys.append(k)
+        list_keys.sort(key=lambda x:x)  # alphabetical order so the same key set always yields the same rule string
+        return "=".join(list_keys)  # rule name, e.g. "tenderee=win_tenderer"
+
+    def terminate(self, buffer):
+        list_group = []  # NOTE(review): never read in this method
+        the_group = buffer[0]
+        the_group.sort(key=lambda x:x["confidence"],reverse=True)  # highest confidence first
+        if len(the_group)>5:  # groups of more than 5 rows additionally check doctitle
+            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
+        else:
+            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
+
+
+        # confidence: compute one cut index per key with getDiffIndex (defined outside this view)
+        list_key_index = []
+        for _k in keys:
+            if _k=="doctitle":
+                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))  # doctitle passes an explicit confidence of 30
+            else:
+                list_key_index.append(getDiffIndex(the_group,_k))
+        final_group = []
+
+        _index = min(list_key_index)  # NOTE(review): presumably the first row index at which some key diverges - confirm against getDiffIndex
+        if _index>1:  # a cut index of 0 or 1 leaves final_group empty, so no rules are emitted
+            for item in the_group[:_index]:
+                final_group.append(item)
+
+        list_rules = []
+        for i in range(len(final_group)):  # one rule per unordered pair of kept rows
+            for j in range(i+1,len(final_group)):
+                _dict1 = final_group[i]
+                _dict2 = final_group[j]
+                _rule = self.getSameKeys(_dict1,_dict2)
+                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
+
+        return json.dumps(list_rules)  # JSON list of [rule, docid_i, docid_j]
+
+@annotate('string -> string,bigint,bigint')
+class f_autorule_group_extract(BaseUDTF):
+    '''
+    Extract the groups from the final result
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging  # publish the imports as module globals for process(); NOTE(review): `global` after the import is a SyntaxError on Python >= 3.6
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+    def process(self,rules_json):
+        list_rules = json.loads(rules_json)  # expects a JSON list whose items have at least three elements
+        for _rule in list_rules:
+            self.forward(_rule[0],_rule[1],_rule[2])  # one output row per item: the first three elements in order

+ 39 - 17
BiddingKG/maxcompute/documentMerge.py

@@ -86,7 +86,10 @@ def getSet(list_dict,key):
                     _set.add(str(item[key]))
     return _set
 
-def split_with_time(list_dict,sort_key,timedelta=86400*120):
+def split_with_time(list_dict,sort_key,timedelta=86400*120,more_than_one=True):
+    group_num = 1
+    if more_than_one:
+        group_num = 2
     if len(list_dict)>0:
         if sort_key in list_dict[0]:
             list_dict.sort(key=lambda x:x[sort_key])
@@ -102,7 +105,7 @@ def split_with_time(list_dict,sort_key,timedelta=86400*120):
                     if len(_group)>1:
                         list_group.append(_group)
                     _begin = i + 1
-            if len(list_dict)>1:
+            if len(list_dict)>=group_num:
                 _group = []
                 for j in range(_begin,len(list_dict)):
                     _group.append(list_dict[j])
@@ -442,12 +445,6 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
         return _result
 
 
-
-
-
-
-
-
     def terminate(self, buffer):
         list_group = []
         the_group = buffer[0]
@@ -461,6 +458,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
         re_merge = False
         for _key in keys:
             if len(getSet(the_group,_key))>1:
+                log("has_more_than_one:%s"%str(getSet(the_group,_key)))
                 re_merge = True
                 break
         #判断是否相似而不相同
@@ -565,8 +563,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
                 #     final_group.append(list(set(_group["docid"])))
         else:
             final_group = [list(set([item["docid"] for item in the_group]))]
-        log(str(final_group))
-
+        log("%s--%s"%("final_group",str(final_group)))
 
         #每个channel选择一篇公告
         final_group_channel = []
@@ -586,16 +583,16 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
 
             #根据日期进行切分
             new_dict_channel_id = {}
-            print(dict_channel_id)
+            log("%s:%s"%("dict_channel_id",str(dict_channel_id)))
             for k,v in dict_channel_id.items():
-                list_time_docids = split_with_time(v,"page_time_stamp",86400*6)
-                print(list_time_docids)
+                list_time_docids = split_with_time(v,"page_time_stamp",86400*6,more_than_one=False)
+                log(list_time_docids)
                 for _l in list_time_docids:
                     list_t = self.splitByTimezone(_l,"json_dicttime")
                     for _t in list_t:
                         otherChannel += 1
                         new_dict_channel_id[otherChannel] = _t
-            print(new_dict_channel_id)
+            log("%s:%s"%("new_dict_channel_id",str(new_dict_channel_id)))
             channel_dict = {}
             for k,v in new_dict_channel_id.items():
                 v.sort(key=lambda x:x["docid"])
@@ -1231,11 +1228,36 @@ class f_encode_time(object):
 
         return _encode
 
+
 if __name__ == '__main__':
     a = f_remege_limit_num_contain_bychannel()
     buffer = a.new_buffer()
-    a.iterate(buffer,1,1,86400*1,"1","1","1","1","1","1","1",5,5,None)
-    a.iterate(buffer,3,1,86400*4,"1","1","1","1","1","1","1",5,5,'{"a":"dbb"}')
-    a.iterate(buffer,5,1,86400*10,"1","1","1","1","1","1","1",5,5,"{}")
+    tmp_s = '''
+    225405503	230202661	2022-04-02	1648828800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230202661	2022-04-02	1648828800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230101787	2022-03-31	1648656000	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230101787	2022-03-31	1648656000	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230093569	2022-03-31	1648656000	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230093569	2022-03-31	1648656000	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+
+    '''
+    for _s in tmp_s.split("\n"):
+        ls = _s.split("\t")
+        if len(ls)!=17:
+            continue
+        _confid = 1 if ls[14] =="" else ls[14]
+        a.iterate(buffer,ls[1],ls[13],int(ls[3]),ls[8],ls[10],ls[11],ls[12],ls[7],ls[5],ls[4],_confid,ls[15],ls[16][1:-1])
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
     print(a.terminate(buffer))
     print(1)

+ 46 - 1
BiddingKG/maxcompute/enterpriseFix.py

@@ -1,5 +1,9 @@
 #coding:utf8
 from odps.udf import annotate,BaseUDAF,BaseUDTF
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import json
+import traceback
 
 @annotate('string->string')
 class getYearMonth(object):
@@ -127,4 +131,45 @@ class f_turn_circle(object):
         if name is not None:
             return name.replace("(","(").replace(")",")")
         else:
-            return ""
+            return ""
+
+@annotate('string,string->string,bigint')
+class f_dumplicate_contacts(BaseUDTF):
+
+    def __init__(self):
+        pass
+
+    def process(self,name,contacts):
+        if contacts is None:
+            self.forward(contacts,1)  # nothing to deduplicate: pass the None through with flag 1
+            return
+        try:
+            list_contacts = json.loads(contacts)  # contacts is parsed as a JSON list of dicts
+            _set = set()  # "person-mobile-phone" keys already kept
+            _phone_set = set()  # every non-empty mobile/phone number already kept
+            new_list_contacts = []
+            list_contacts.sort(key=lambda x:len(x.get("contact_person","")),reverse=True)  # longest contact_person first, so named entries are seen before unnamed ones
+            for _conta in list_contacts:
+                contact_person = _conta.get("contact_person","")
+                mobile_no = _conta.get("mobile_no","")
+                phone_no = _conta.get("phone_no","")
+                if contact_person=="" and (mobile_no in _phone_set or phone_no in _phone_set):  # drop unnamed entries whose number was already kept
+                    continue
+                _key = "%s-%s-%s"%(contact_person,mobile_no,phone_no)
+                if _key in _set:  # drop exact (person, mobile, phone) duplicates
+                    continue
+                if mobile_no!="":
+                    _phone_set.add(mobile_no)
+                if phone_no!="":
+                    _phone_set.add(phone_no)
+                new_list_contacts.append(_conta)
+                _set.add(_key)
+            if len(new_list_contacts)!=len(list_contacts):  # log the name whenever at least one entry was removed
+                logging.info(name)
+            new_list_contacts.sort(key=lambda x:x.get("level",0),reverse=True)  # highest level first; a missing level counts as 0
+            self.forward(json.dumps(new_list_contacts,ensure_ascii=False),1)  # flag 1: processed successfully
+        except Exception as e:  # any parse or processing failure: log the raw input and emit (None, 0)
+            traceback.print_exc()
+            logging.info(contacts)
+            self.forward(None,0)
+

+ 2 - 2
BiddingKG/maxcompute/evaluates.py

@@ -76,8 +76,8 @@ def multiLoadEnv():
         # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
         #改为zip引入
         log("=======")
-        include_package_path("BiddingKG.baseline.zip")
-        # include_package_path("BiddingKG.backup.zip")
+        # include_package_path("BiddingKG.baseline.zip")
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():