Bladeren bron

云上去重,预处理使用中表格识别使用inItem

luojiehua 3 jaren geleden
bovenliggende
commit
f4a98872fc

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
   <component name="FacetManager">
     <facet type="Python" name="Python">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (BiddingKG)" />
+      <configuration sdkName="Python 3.5 (dl_nlp)" />
     </facet>
     </facet>
   </component>
   </component>
   <component name="NewModuleRootManager">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>
   </component>
 </module>
 </module>

+ 5 - 5
BiddingKG/app.py

@@ -198,12 +198,12 @@ class MyProcessor(allspark.BaseProcessor):
 if __name__ == '__main__':
 if __name__ == '__main__':
     # paramter worker_threads indicates concurrency of processing
     # paramter worker_threads indicates concurrency of processing
     #本地运行
     #本地运行
-    allspark.default_properties().put("rpc.keepalive", 120000)
-
-
-    runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15030")
+    # allspark.default_properties().put("rpc.keepalive", 180000)
+    #
+    #
+    # runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15030")
     #PAI平台运行
     #PAI平台运行
-    # runner = MyProcessor()
+    runner = MyProcessor()
 
 
 
 
     runner.run()
     runner.run()

+ 3 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -8,7 +8,7 @@ import time
 import codecs
 import codecs
 
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
-from BiddingKG.dl.table_head.predict import predict
+# from BiddingKG.dl.table_head.predict import predict
 
 
 sys.setrecursionlimit(1000000)
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
 sys.path.append(os.path.abspath("../.."))
@@ -995,6 +995,8 @@ def tableToText(soup):
         if in_attachment:
         if in_attachment:
             if tbody.name=='table':
             if tbody.name=='table':
                 _tbody = tbody.find('tbody')
                 _tbody = tbody.find('tbody')
+                if _tbody is None:
+                    _tbody = tbody
             else:
             else:
                 _tbody = tbody
                 _tbody = tbody
             _td_len_list = []
             _td_len_list = []

+ 1 - 1
BiddingKG/dl/table_head/models/model.py

@@ -2,7 +2,7 @@ import sys
 import os
 import os
 import numpy as np
 import numpy as np
 from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
 from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
-from keras_preprocessing.sequence import pad_sequences
+from keras.preprocessing.sequence import pad_sequences
 sys.path.append(os.path.dirname(__file__))
 sys.path.append(os.path.dirname(__file__))
 
 
 from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \
 from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \

+ 1 - 1
BiddingKG/dl/table_head/pre_process.py

@@ -1,7 +1,6 @@
 import os
 import os
 import random
 import random
 import sys
 import sys
-import psycopg2
 import numpy as np
 import numpy as np
 sys.path.append(os.path.dirname(__file__) + "/../")
 sys.path.append(os.path.dirname(__file__) + "/../")
 from common.Utils import embedding_word, embedding_word_forward
 from common.Utils import embedding_word, embedding_word_forward
@@ -26,6 +25,7 @@ def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
 
 
 
 
 def postgresql_util(sql, limit):
 def postgresql_util(sql, limit):
+    import psycopg2
     conn = psycopg2.connect(dbname="table_head_label", user="postgres", password="postgres",
     conn = psycopg2.connect(dbname="table_head_label", user="postgres", password="postgres",
                             host="192.168.2.103")
                             host="192.168.2.103")
     cursor = conn.cursor()
     cursor = conn.cursor()

+ 1 - 1
BiddingKG/dl/test/test4.py

@@ -30,7 +30,7 @@ def test(name,content):
     myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
     myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
 
 
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    _url = "http://192.168.2.102:15030/article_extract"
+    _url = "http://192.168.2.102:8080/article_extract"
     _resp = requests.post(_url, json=user, headers=myheaders, verify=True)
     _resp = requests.post(_url, json=user, headers=myheaders, verify=True)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
     resp_json = _resp.content.decode("utf-8")
     resp_json = _resp.content.decode("utf-8")

+ 9 - 7
BiddingKG/extract.app.json

@@ -1,18 +1,20 @@
 {
 {
   "generate_token": "true",
   "generate_token": "true",
   "metadata": {
   "metadata": {
-    "cpu": 7,
-    "instance": 4,
-    "memory": 18000,
+    "cpu": 4,
+    "instance": 7,
+    "memory": 11000,
     "region": "cn-hangzhou",
     "region": "cn-hangzhou",
     "resource": "eas-r-9oq7xupatg8yoiyuvk",
     "resource": "eas-r-9oq7xupatg8yoiyuvk",
     "rpc": {
     "rpc": {
-      "batching": "true",
-      "keepalive": 60000,
-      "max_batch_size": 40
+      "batching": "false",
+      "keepalive": 180000,
+      "max_queue_size": 100,
+      "io_threads": 4,
+      "worker_threads": 5
     }
     }
   },
   },
-  "workers":7,
+  "workers":5,
   "name": "content_extract",
   "name": "content_extract",
   "processor_entry": "./BiddingKG/app.py",
   "processor_entry": "./BiddingKG/app.py",
   "processor_path": "oss://eas-model-hangzhou/1255640119316927/BiddingKG_eas.zip",
   "processor_path": "oss://eas-model-hangzhou/1255640119316927/BiddingKG_eas.zip",

+ 152 - 24
BiddingKG/maxcompute/documentDumplicate.py

@@ -74,7 +74,11 @@ class f_decode_extract(BaseUDTF):
         time_release = _extract.get("time_release")
         time_release = _extract.get("time_release")
 
 
         # docchannel = _other.get("docchannel",0)
         # docchannel = _other.get("docchannel",0)
-        docchannel = self.dict_channel.get(_extract.get("docchannel",""),0)
+        docchannel_name = _extract.get("docchannel",{}).get("docchannel")
+        doctype_name = _extract.get("docchannel",{}).get("doctype")
+        if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
+            docchannel_name = doctype_name
+        docchannel = self.dict_channel.get(docchannel_name,0)
         if re.search(self.time_pattern,page_time) is not None:
         if re.search(self.time_pattern,page_time) is not None:
             try:
             try:
                 timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
                 timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
@@ -94,20 +98,36 @@ class f_decode_extract(BaseUDTF):
                 if bidding_budget=="":
                 if bidding_budget=="":
                     bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
                     bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
             for _role in dict_pack[_key]["roleList"]:
             for _role in dict_pack[_key]["roleList"]:
-                extract_count += 1
-                if _role[2]!='' and float(_role[2])>0:
+                if isinstance(_role,list):
                     extract_count += 1
                     extract_count += 1
-                if _role[0]=="tenderee":
-                    tenderee = _role[1]
-                if _role[0]=="win_tenderer":
-                    if  win_tenderer=="":
-                        win_tenderer = _role[1]
                     if _role[2]!='' and float(_role[2])>0:
                     if _role[2]!='' and float(_role[2])>0:
                         extract_count += 1
                         extract_count += 1
-                        if win_bid_price=="":
-                            win_bid_price = str(float(_role[2]))
-                if _role[0]=="agency":
-                    agency = _role[1]
+                    if _role[0]=="tenderee":
+                        tenderee = _role[1]
+                    if _role[0]=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role[1]
+                        if _role[2]!='' and float(_role[2])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role[2]))
+                    if _role[0]=="agency":
+                        agency = _role[1]
+                if isinstance(_role,dict):
+                    extract_count += 1
+                    if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                        extract_count += 1
+                    if _role["role_name"]=="tenderee":
+                        tenderee = _role["role_text"]
+                    if _role["role_name"]=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role["role_text"]
+                        if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role["role_money"]["money"]))
+                    if _role["role_name"]=="agency":
+                        agency = _role["role_text"]
 
 
 
 
         if project_code!="":
         if project_code!="":
@@ -151,25 +171,43 @@ class f_get_extractCount(object):
         win_tenderer = ""
         win_tenderer = ""
         win_bid_price = ""
         win_bid_price = ""
         for _key in dict_pack.keys():
         for _key in dict_pack.keys():
-            if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
+            if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
                 extract_count += 1
                 extract_count += 1
                 if bidding_budget=="":
                 if bidding_budget=="":
                     bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
                     bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
             for _role in dict_pack[_key]["roleList"]:
             for _role in dict_pack[_key]["roleList"]:
-                extract_count += 1
-                if _role[2]!='' and float(_role[2])>0:
+                if isinstance(_role,list):
                     extract_count += 1
                     extract_count += 1
-                if _role[0]=="tenderee":
-                    tenderee = _role[1]
-                if _role[0]=="win_tenderer":
-                    if  win_tenderer=="":
-                        win_tenderer = _role[1]
                     if _role[2]!='' and float(_role[2])>0:
                     if _role[2]!='' and float(_role[2])>0:
                         extract_count += 1
                         extract_count += 1
-                        if win_bid_price=="":
-                            win_bid_price = str(float(_role[2]))
-                if _role[0]=="agency":
-                    agency = _role[1]
+                    if _role[0]=="tenderee":
+                        tenderee = _role[1]
+                    if _role[0]=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role[1]
+                        if _role[2]!='' and float(_role[2])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role[2]))
+                    if _role[0]=="agency":
+                        agency = _role[1]
+                if isinstance(_role,dict):
+                    extract_count += 1
+                    if "role_money" in _role:
+                        if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
+                            extract_count += 1
+                    if _role.get("role_name")=="tenderee":
+                        tenderee = _role["role_text"]
+                    if _role.get("role_name")=="win_tenderer":
+                        if  win_tenderer=="":
+                            win_tenderer = _role["role_text"]
+                        if "role_money" in _role:
+                            if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                                extract_count += 1
+                                if win_bid_price=="":
+                                    win_bid_price = str(float(_role["role_money"]["money"]))
+                    if _role["role_name"]=="agency":
+                        agency = _role["role_text"]
 
 
 
 
         if project_code!="":
         if project_code!="":
@@ -1085,3 +1123,93 @@ class f_is_legal(object):
             return 0
             return 0
         return 1
         return 1
 
 
+@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
+class f_autorule_group(BaseUDAF):
+    '''
+    去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
+    组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
+                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
+        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
+                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
+                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
+                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
+                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
+                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0][:100])
+        buffer[0] = buffer[0][:100]
+
+    def getSameKeys(self,_dict1,_dict2):
+        list_keys = []
+        for k,v in _dict1.items():
+            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
+                continue
+            v2 = _dict2.get(k,"")
+            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
+                list_keys.append(k)
+        list_keys.sort(key=lambda x:x)
+        return "=".join(list_keys)
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+        the_group.sort(key=lambda x:x["confidence"],reverse=True)
+        if len(the_group)>5:
+            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
+        else:
+            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
+
+
+        #置信度
+        list_key_index = []
+        for _k in keys:
+            if _k=="doctitle":
+                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
+            else:
+                list_key_index.append(getDiffIndex(the_group,_k))
+        final_group = []
+
+        _index = min(list_key_index)
+        if _index>1:
+            for item in the_group[:_index]:
+                final_group.append(item)
+
+        list_rules = []
+        for i in range(len(final_group)):
+            for j in range(i+1,len(final_group)):
+                _dict1 = final_group[i]
+                _dict2 = final_group[j]
+                _rule = self.getSameKeys(_dict1,_dict2)
+                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
+
+        return json.dumps(list_rules)
+
+@annotate('string -> string,bigint,bigint')
+class f_autorule_group_extract(BaseUDTF):
+    '''
+    从最后的结果中获取组
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+    def process(self,rules_json):
+        list_rules = json.loads(rules_json)
+        for _rule in list_rules:
+            self.forward(_rule[0],_rule[1],_rule[2])

+ 39 - 17
BiddingKG/maxcompute/documentMerge.py

@@ -86,7 +86,10 @@ def getSet(list_dict,key):
                     _set.add(str(item[key]))
                     _set.add(str(item[key]))
     return _set
     return _set
 
 
-def split_with_time(list_dict,sort_key,timedelta=86400*120):
+def split_with_time(list_dict,sort_key,timedelta=86400*120,more_than_one=True):
+    group_num = 1
+    if more_than_one:
+        group_num = 2
     if len(list_dict)>0:
     if len(list_dict)>0:
         if sort_key in list_dict[0]:
         if sort_key in list_dict[0]:
             list_dict.sort(key=lambda x:x[sort_key])
             list_dict.sort(key=lambda x:x[sort_key])
@@ -102,7 +105,7 @@ def split_with_time(list_dict,sort_key,timedelta=86400*120):
                     if len(_group)>1:
                     if len(_group)>1:
                         list_group.append(_group)
                         list_group.append(_group)
                     _begin = i + 1
                     _begin = i + 1
-            if len(list_dict)>1:
+            if len(list_dict)>=group_num:
                 _group = []
                 _group = []
                 for j in range(_begin,len(list_dict)):
                 for j in range(_begin,len(list_dict)):
                     _group.append(list_dict[j])
                     _group.append(list_dict[j])
@@ -442,12 +445,6 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
         return _result
         return _result
 
 
 
 
-
-
-
-
-
-
     def terminate(self, buffer):
     def terminate(self, buffer):
         list_group = []
         list_group = []
         the_group = buffer[0]
         the_group = buffer[0]
@@ -461,6 +458,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
         re_merge = False
         re_merge = False
         for _key in keys:
         for _key in keys:
             if len(getSet(the_group,_key))>1:
             if len(getSet(the_group,_key))>1:
+                log("has_more_than_one:%s"%str(getSet(the_group,_key)))
                 re_merge = True
                 re_merge = True
                 break
                 break
         #判断是否相似而不相同
         #判断是否相似而不相同
@@ -565,8 +563,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
                 #     final_group.append(list(set(_group["docid"])))
                 #     final_group.append(list(set(_group["docid"])))
         else:
         else:
             final_group = [list(set([item["docid"] for item in the_group]))]
             final_group = [list(set([item["docid"] for item in the_group]))]
-        log(str(final_group))
-
+        log("%s--%s"%("final_group",str(final_group)))
 
 
         #每个channel选择一篇公告
         #每个channel选择一篇公告
         final_group_channel = []
         final_group_channel = []
@@ -586,16 +583,16 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
 
 
             #根据日期进行切分
             #根据日期进行切分
             new_dict_channel_id = {}
             new_dict_channel_id = {}
-            print(dict_channel_id)
+            log("%s:%s"%("dict_channel_id",str(dict_channel_id)))
             for k,v in dict_channel_id.items():
             for k,v in dict_channel_id.items():
-                list_time_docids = split_with_time(v,"page_time_stamp",86400*6)
-                print(list_time_docids)
+                list_time_docids = split_with_time(v,"page_time_stamp",86400*6,more_than_one=False)
+                log(list_time_docids)
                 for _l in list_time_docids:
                 for _l in list_time_docids:
                     list_t = self.splitByTimezone(_l,"json_dicttime")
                     list_t = self.splitByTimezone(_l,"json_dicttime")
                     for _t in list_t:
                     for _t in list_t:
                         otherChannel += 1
                         otherChannel += 1
                         new_dict_channel_id[otherChannel] = _t
                         new_dict_channel_id[otherChannel] = _t
-            print(new_dict_channel_id)
+            log("%s:%s"%("new_dict_channel_id",str(new_dict_channel_id)))
             channel_dict = {}
             channel_dict = {}
             for k,v in new_dict_channel_id.items():
             for k,v in new_dict_channel_id.items():
                 v.sort(key=lambda x:x["docid"])
                 v.sort(key=lambda x:x["docid"])
@@ -1231,11 +1228,36 @@ class f_encode_time(object):
 
 
         return _encode
         return _encode
 
 
+
 if __name__ == '__main__':
 if __name__ == '__main__':
     a = f_remege_limit_num_contain_bychannel()
     a = f_remege_limit_num_contain_bychannel()
     buffer = a.new_buffer()
     buffer = a.new_buffer()
-    a.iterate(buffer,1,1,86400*1,"1","1","1","1","1","1","1",5,5,None)
-    a.iterate(buffer,3,1,86400*4,"1","1","1","1","1","1","1",5,5,'{"a":"dbb"}')
-    a.iterate(buffer,5,1,86400*10,"1","1","1","1","1","1","1",5,5,"{}")
+    tmp_s = '''
+    225405503	230202661	2022-04-02	1648828800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230202661	2022-04-02	1648828800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230101787	2022-03-31	1648656000	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230101787	2022-03-31	1648656000	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230093569	2022-03-31	1648656000	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	230093569	2022-03-31	1648656000	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+
+    '''
+    for _s in tmp_s.split("\n"):
+        ls = _s.split("\t")
+        if len(ls)!=17:
+            continue
+        _confid = 1 if ls[14] =="" else ls[14]
+        a.iterate(buffer,ls[1],ls[13],int(ls[3]),ls[8],ls[10],ls[11],ls[12],ls[7],ls[5],ls[4],_confid,ls[15],ls[16][1:-1])
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
     print(a.terminate(buffer))
     print(a.terminate(buffer))
     print(1)
     print(1)

+ 46 - 1
BiddingKG/maxcompute/enterpriseFix.py

@@ -1,5 +1,9 @@
 #coding:utf8
 #coding:utf8
 from odps.udf import annotate,BaseUDAF,BaseUDTF
 from odps.udf import annotate,BaseUDAF,BaseUDTF
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import json
+import traceback
 
 
 @annotate('string->string')
 @annotate('string->string')
 class getYearMonth(object):
 class getYearMonth(object):
@@ -127,4 +131,45 @@ class f_turn_circle(object):
         if name is not None:
         if name is not None:
             return name.replace("(","(").replace(")",")")
             return name.replace("(","(").replace(")",")")
         else:
         else:
-            return ""
+            return ""
+
+@annotate('string,string->string,bigint')
+class f_dumplicate_contacts(BaseUDTF):
+    """UDTF that removes duplicate entries from a JSON list of contacts.
+
+    Input columns are ``(name, contacts)``.  ``contacts`` is expected to be a
+    JSON array of objects; the keys read here are ``contact_person``,
+    ``mobile_no``, ``phone_no`` and ``level``.  ``name`` is only used for
+    logging.
+
+    Exactly one row ``(json_string, status)`` is forwarded per input row:
+
+    * ``(contacts, 1)`` unchanged when ``contacts`` is NULL,
+    * ``(deduplicated_json, 1)`` on success,
+    * ``(None, 0)`` when the value cannot be parsed or processed.
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def _field(contact, key, default=""):
+        """Read ``key`` from a contact, treating JSON null like a missing key."""
+        value = contact.get(key, default)
+        if value is None:
+            return default
+        return value
+
+    def _deduplicate(self, list_contacts):
+        """Return the de-duplicated contacts, ordered by ``level`` descending.
+
+        A contact is dropped when its (person, mobile, phone) combination was
+        already kept, or when it has no contact person and one of its numbers
+        already belongs to a kept contact.
+
+        Raises ValueError when ``list_contacts`` is not a list.
+        """
+        if not isinstance(list_contacts, list):
+            raise ValueError("contacts must be a JSON array")
+
+        seen_keys = set()
+        seen_phones = set()
+        kept = []
+
+        # Longest names first (stable sort), so named contacts claim their
+        # phone numbers before the unnamed entries are examined.
+        ordered = sorted(list_contacts,
+                         key=lambda c: len(self._field(c, "contact_person")),
+                         reverse=True)
+        for contact in ordered:
+            contact_person = self._field(contact, "contact_person")
+            mobile_no = self._field(contact, "mobile_no")
+            phone_no = self._field(contact, "phone_no")
+
+            # An unnamed entry adds nothing if its number is already known.
+            if contact_person == "" and (mobile_no in seen_phones
+                                         or phone_no in seen_phones):
+                continue
+
+            key = "%s-%s-%s" % (contact_person, mobile_no, phone_no)
+            if key in seen_keys:
+                continue
+
+            # Empty numbers are never recorded; otherwise every unnamed
+            # contact without a number would look like a duplicate.
+            if mobile_no != "":
+                seen_phones.add(mobile_no)
+            if phone_no != "":
+                seen_phones.add(phone_no)
+            kept.append(contact)
+            seen_keys.add(key)
+
+        kept.sort(key=lambda c: self._field(c, "level", 0), reverse=True)
+        return kept
+
+    def process(self,name,contacts):
+        """Forward the de-duplicated contacts of one row (see class docstring)."""
+        if contacts is None:
+            self.forward(contacts,1)
+            return
+        try:
+            list_contacts = json.loads(contacts)
+            new_list_contacts = self._deduplicate(list_contacts)
+        except Exception:
+            # Best effort per row: record the offending value and flag the
+            # row with status 0 instead of raising out of process().
+            traceback.print_exc()
+            logging.info(contacts)
+            self.forward(None,0)
+            return
+
+        # Forwarding happens outside the try block so that a row is never
+        # emitted twice (once as success and once as failure).
+        if len(new_list_contacts)!=len(list_contacts):
+            logging.info(name)
+        self.forward(json.dumps(new_list_contacts,ensure_ascii=False),1)
+

+ 2 - 2
BiddingKG/maxcompute/evaluates.py

@@ -76,8 +76,8 @@ def multiLoadEnv():
         # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
         # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
         #改为zip引入
         #改为zip引入
         log("=======")
         log("=======")
-        include_package_path("BiddingKG.baseline.zip")
-        # include_package_path("BiddingKG.backup.zip")
+        # include_package_path("BiddingKG.baseline.zip")
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
 
     def load_vector():
     def load_vector():