Browse source

Fix proposed-construction (拟在建) project logic; deduplicate and merge announcements

luojiehua 1 year ago
parent
commit
083ef5aeda

+ 5 - 12
BaseDataMaintenance/common/ERNIE_utils.py

@@ -19,23 +19,17 @@ def get_access_token():
     return response.json().get("access_token")
 
 def main():
-    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + get_access_token()
+    # _token = get_access_token()
+    _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
 
     payload = json.dumps({
         "messages": [
             {
                 "role": "user",
                 "content": '''
-                名称: 亚低温治疗仪
-
-品牌:GSZ
-
-规格型号:233
-
-数量:1台
-
-单价: 170000.00元
-以上的GSZ是什么牌子
+                假设分类是建筑建材-建筑涂料的相关产品词“面漆”
+                请拓展其相关行业产品词,列举30个
                 '''
             }
         ]
@@ -49,5 +43,4 @@ def main():
     print(response.text)
 
 if __name__ == '__main__':
-    print(get_access_token())
     main()

+ 3 - 1
BaseDataMaintenance/dataMonitor/data_monitor.py

@@ -491,13 +491,15 @@ class BaseDataMonitor():
                                                                             columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
         if total_count>=1000:
-            _cmd = 'cat %s | grep -c "%s.*upgrate True save"'%(flow_dumplicate_log_path,self.get_last_tenmin_time())
+            _cmd = 'cat %s | grep -c "%s.*merge_project whole_time"'%(flow_dumplicate_log_path,self.get_last_tenmin_time())
             process_count = self.cmd_execute(_cmd)
             atAll = False
             if process_count=="":
                 process_count = 0
             if int(process_count)==0:
                 atAll = True
+            if int(process_count)<100:
+                self.cmd_execute("ps -ef | grep dumplicate | grep -v grep|cut -c 9-15|xargs kill -9")
             _msg = "数据流报警:待去重公告数为:%d,最近十分钟去重数为:%s"%(total_count,str(process_count))
             sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=atAll)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
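Note: the hunk above makes the monitor count "merge_project whole_time" log lines from the last ten minutes and, when the dedup backlog exceeds 1000 but fewer than 100 merges completed, kill the stalled dumplicate workers so the scheduler respawns them. A rough standalone sketch of that watchdog behaviour, with the log path, time prefix, and backlog count passed in as illustrative parameters:

    import subprocess

    def dedup_watchdog(log_path, last_tenmin_time, backlog_count):
        """Rough sketch: restart stalled dedup workers when throughput drops (illustrative names)."""
        if backlog_count < 1000:
            return 0
        # Count completed merge cycles logged in the last ten minutes.
        cmd = 'grep -c "%s.*merge_project whole_time" %s' % (last_tenmin_time, log_path)
        out = subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout.strip()
        processed = int(out) if out.isdigit() else 0
        if processed < 100:
            # Throughput too low: kill the dumplicate workers so the cron jobs respawn them.
            subprocess.run("ps -ef | grep dumplicate | grep -v grep | cut -c 9-15 | xargs kill -9",
                           shell=True)
        return processed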

+ 4 - 4
BaseDataMaintenance/dataSource/setttings.py

@@ -43,10 +43,10 @@ oracle_host = "192.168.0.150"
 oracle_port = 1522
 # oracle_user = "bxkc_data_readonly"
 # oracle_pass = "P7WUrgcz0@#j8pjg"
-oracle_user = "bxkc_write"
-oracle_pass = "aBrTKNl9SaPk@Yy3"
-# oracle_user = "bxkc_db"
-# oracle_pass = "TKVF#3idC4UQlDVy"
+# oracle_user = "bxkc_write"
+# oracle_pass = "aBrTKNl9SaPk@Yy3"
+oracle_user = "bxkc_db"
+oracle_pass = "xb9F#24Hd#5rStr9"
 oracle_db = "yanphone"
 
 ots_AccessKeyId = 'LTAI5tFuoxHm8Uxrr5nT8wTZ'

+ 98 - 45
BaseDataMaintenance/maintenance/dataflow.py

@@ -2214,7 +2214,7 @@ class Dataflow_dumplicate(Dataflow):
         _dict["project_name"] = _extract.get("name","")
         _dict["dict_time"] = self.get_dict_time(_extract)
 
-    def dumplicate_fianl_check(self,base_list):
+    def dumplicate_fianl_check(self,base_list,b_log=False):
         the_group = base_list
         the_group.sort(key=lambda x:x["confidence"],reverse=True)
 
@@ -2232,17 +2232,16 @@ class Dataflow_dumplicate(Dataflow):
                 continue
             for _j in range(min(_i,10)):
                 _dict2 = base_list[_j]
-                _prob = self.dumplicate_check(_dict1,_dict2,_dict2.get("min_counts",10),b_log=False)
-                # print("_prob:",_prob)
+                _prob = self.dumplicate_check(_dict1,_dict2,_dict1.get("min_counts",10),b_log=b_log)
+                print("_prob:",_prob)
                 if _prob<=0.1:
                     _pass = False
                     break
-            log("checking index:%d"%(_i))
+            log("checking index:%d %s %.2f"%(_i,str(_pass),_prob))
             _index = _i
             if not _pass:
                 _index -= 1
                 break
-
         if _index>=1:
             # #对重复入库的进行去重
             # _l = the_group[:_index+1]
@@ -2258,7 +2257,8 @@ class Dataflow_dumplicate(Dataflow):
             return the_group[:_index+1]
         return []
 
-    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=True):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
+        b_log=True
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -2370,7 +2370,7 @@ class Dataflow_dumplicate(Dataflow):
             same_count += 1
         if getLength(project_name_less)>0 and project_name_less==project_name_greater:
             same_count += 1
-        if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
+        if getLength(doctitle_refine_less)>0 and (doctitle_refine_less==doctitle_refine_greater or doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
             same_count += 1
         base_prob = 0
         if min_counts<3:
@@ -2849,7 +2849,7 @@ class Dataflow_dumplicate(Dataflow):
 
     def flow_dumpcate_comsumer(self):
         from multiprocessing import Process
-        process_count = 2
+        process_count = 3
         thread_count = 20
         list_process = []
         def start_thread():
@@ -3543,7 +3543,7 @@ class Dataflow_dumplicate(Dataflow):
         if tenderee!="" and len(list_product)>0:
             _query = [TermQuery(project_tenderee,tenderee),
                       should_q_product]
-            list_query.append([_query,2])
+            list_query.append([_query,1])
 
         if tenderee!="" and project_name!="":
             _query = [TermQuery(project_tenderee,tenderee),
@@ -3553,7 +3553,7 @@ class Dataflow_dumplicate(Dataflow):
         if tenderee!="" and agency!="":
             _query = [TermQuery(project_tenderee,tenderee),
                       TermQuery(project_agency,agency)]
-            list_query.append([_query,1])
+            list_query.append([_query,0])
 
         if tenderee!="" and float(bidding_budget)>0:
             _query = [TermQuery(project_tenderee,tenderee),
@@ -3574,12 +3574,12 @@ class Dataflow_dumplicate(Dataflow):
         if agency!="" and win_tenderer!="":
             _query = [TermQuery(project_agency,agency),
                       TermQuery(project_win_tenderer,win_tenderer)]
-            list_query.append([_query,2])
+            list_query.append([_query,0])
 
         if agency!="" and len(list_product)>0:
             _query = [TermQuery(project_agency,agency),
                       should_q_product]
-            list_query.append([_query,2])
+            list_query.append([_query,1])
 
         if win_tenderer!="" and len(list_code)>0:
             _query = [TermQuery(project_win_tenderer,win_tenderer),
@@ -3608,7 +3608,7 @@ class Dataflow_dumplicate(Dataflow):
         if len(list_code)>0:
             _query = [
                       should_q_code]
-            list_query.append([_query,1])
+            list_query.append([_query,2])
 
             _query = [
                 should_q_cod]
@@ -3623,11 +3623,11 @@ class Dataflow_dumplicate(Dataflow):
         if len(list_product)>0 and should_q_area is not None:
             _query = [should_q_area,
                       should_q_product]
-            list_query.append([_query,1])
+            list_query.append([_query,0])
 
         generate_time = time.time()-_time
         whole_time = time.time()-whole_time_start
-        log("projects merge rules whole_time:%.3f prepare_time:%.3f log_time:%.3f generate_time:%.3f"%(whole_time,prepare_time,log_time,generate_time))
+        # log("projects merge rules whole_time:%.3f prepare_time:%.3f log_time:%.3f generate_time:%.3f"%(whole_time,prepare_time,log_time,generate_time))
         return list_query
 
 
@@ -3649,6 +3649,7 @@ class Dataflow_dumplicate(Dataflow):
             must_not_q = []
             for _uuid in list(set_uuid):
                 must_not_q.append(TermQuery("uuid",_uuid))
+                print("must_not_q uuid:%s"%(_uuid))
 
 
             projects_merge_count = 0
@@ -3675,13 +3676,25 @@ class Dataflow_dumplicate(Dataflow):
                 bidding_budget = _proj.get(project_bidding_budget,-1)
                 win_tenderer = _proj.get(project_win_tenderer,"")
                 win_bid_price = _proj.get(project_win_bid_price,-1)
+                _dynamic = _proj.get(project_project_dynamics,"[]")
+                is_yanshou = False
+                list_dynamic = json.loads(_dynamic)
+                for _d in list_dynamic:
+                    _title = _d.get("doctitle","")
+                    if re.search("验收公[示告]",_title) is not None:
+                        is_yanshou = True
+                        break
 
                 province = _proj.get(project_province,"")
                 city = _proj.get(project_city,"")
                 district = _proj.get(project_district,"")
 
-                page_time_less = timeAdd(page_time,-150)
-                page_time_greater = timeAdd(page_time,120)
+                if is_yanshou:
+                    page_time_less = timeAdd(page_time,-750)
+                    page_time_greater = timeAdd(page_time,720)
+                else:
+                    page_time_less = timeAdd(page_time,-450)
+                    page_time_greater = timeAdd(page_time,420)
                 sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else None
                 _time = time.time()
                 list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district)
@@ -3693,14 +3706,14 @@ class Dataflow_dumplicate(Dataflow):
                 search_table_index = "project2_index_formerge"
                 project_cls = Project
 
-                print("page_time,min_date",page_time,min_date)
-                if page_time>=min_date:
-                    search_table = "project2_tmp"
-                    search_table_index = "project2_tmp_index"
-                    project_cls = Project_tmp
+                # print("page_time,min_date",page_time,min_date)
+                # if page_time>=min_date:
+                #     search_table = "project2_tmp"
+                #     search_table_index = "project2_tmp_index"
+                #     project_cls = Project_tmp
 
 
-                _step = 4
+                _step = 2
                 _begin = 0
                 must_queries = []
 
@@ -3709,22 +3722,26 @@ class Dataflow_dumplicate(Dataflow):
                     must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
                                 ]
 
-                print("page_time_less,page_time_greater",page_time,page_time_less,page_time_greater)
                 #sub_project_name非必要条件
                 # if sub_project_q is not None:
                 #     must_queries.append(sub_project_q)
 
                 projects_prepare_time += time.time()-_time
                 _time = time.time()
+                sort_type = SortOrder.DESC
                 while _begin<len(list_must_query):
+                    if sort_type==SortOrder.DESC:
+                        sort_type=SortOrder.ASC
+                    if sort_type==SortOrder.ASC:
+                        sort_type=SortOrder.DESC
                     list_should_q = []
-                    _limit = 20
+                    _limit = 10
                     for must_q,_count in list_must_query[_begin:_begin+_step]:
                         must_q1 = list(must_q)
                         must_q1.extend(must_queries)
                         list_should_q.append(BoolQuery(must_queries=must_q1))
 
-                        # _limit += _count*5
+                        _limit += _count*5
                     _query = BoolQuery(
                                        should_queries=list_should_q,
                                        must_not_queries=must_not_q[:100]
@@ -3734,7 +3751,7 @@ class Dataflow_dumplicate(Dataflow):
                     #                                                                     columns_to_get=ColumnsToGet(column_names=[project_uuid,project_docids,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_name,project_project_code,project_project_codes,project_project_addr,project_tenderee,project_tenderee_addr,project_tenderee_phone,project_tenderee_contact,project_agency,project_agency_phone,project_agency_contact,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_project_dynamics,project_product,project_moneysource,project_service_time,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_dup_docid,project_info_source,project_nlp_enterprise,project_nlp_enterprise_attachment],return_type=ColumnReturnType.SPECIFIED))
 
                     rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search(search_table,search_table_index,
-                                                                                              SearchQuery(_query,limit=_limit),
+                                                                                              SearchQuery(_query,sort=Sort(sorters=[FieldSort(project_page_time,sort_type)]),limit=_limit),
                                                                                               columns_to_get=ColumnsToGet(column_names=check_columns,return_type=ColumnReturnType.SPECIFIED))
                     list_data = getRow_ots(rows)
 
@@ -3829,8 +3846,9 @@ class Dataflow_dumplicate(Dataflow):
             list_projects = self.merge_projects(list_projects,b_log)
             # log("merge projects takes:%.3f"%(time.time()-_time))
 
+
             _time = time.time()
-            dumplicate_document_in_merge(list_projects)
+            list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
             # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
 
             _time = time.time()
@@ -3838,7 +3856,7 @@ class Dataflow_dumplicate(Dataflow):
             # log("json projects takes:%.3f"%(time.time()-_time))
             if b_log:
                 log("project_json:%s"%project_json)
-            return project_json
+            return project_json,list_merge_dump
         except Exception as e:
             raise RuntimeError("error on dumplicate")
 
@@ -3858,13 +3876,29 @@ class Dataflow_dumplicate(Dataflow):
             else:
                 if _save==1:
                     set_fingerprint.add(fingerprint_less)
-        print("_fingerprint",_fingerprint)
-        print(set_fingerprint)
         if _fingerprint in set_fingerprint:
             return True
         return False
 
 
+    def check_page_time(self,item):
+        page_time = item.get(document_page_time,"")
+        has_before = False
+        has_after = False
+        if len(page_time)>0:
+            l_page_time = timeAdd(page_time,days=-90)
+            dict_time = item.get("dict_time",{})
+            for k,v in dict_time.items():
+                if v is not None and len(v)>0:
+                    if l_page_time>v:
+                        has_before = True
+                    if v>page_time:
+                        has_after = True
+        if not has_after and has_before:
+            log("check page_time false %s==%s-%s"%(l_page_time,k,v))
+            return False
+        return True
+
 
     def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
         try:
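Note: the check_page_time method added above rejects a document whose extracted dates all fall more than 90 days before page_time with nothing after it, which usually signals a wrongly dated announcement. A minimal sketch of the same rule, assuming ISO "YYYY-MM-DD" strings; time_add here is an illustrative stand-in for the repo's timeAdd helper:

    from datetime import datetime, timedelta

    def time_add(date_str, days):
        # Stand-in for the repo's timeAdd helper.
        return (datetime.strptime(date_str, "%Y-%m-%d") + timedelta(days=days)).strftime("%Y-%m-%d")

    def check_page_time(page_time, dict_time):
        """False when every extracted date sits more than 90 days before page_time."""
        if not page_time:
            return True
        lower = time_add(page_time, -90)
        values = [v for v in dict_time.values() if v]
        has_before = any(v < lower for v in values)
        has_after = any(v > page_time for v in values)
        return has_after or not has_before

    # e.g. check_page_time("2023-11-01", {"time_release": "2023-05-01"}) -> False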
@@ -3901,9 +3935,10 @@ class Dataflow_dumplicate(Dataflow):
 
 
 
+            b_log = False if upgrade else True
             _time = time.time()
             # log("%d start final check with length:%d"%(item["docid"],len(base_list)))
-            final_list = self.dumplicate_fianl_check(base_list)
+            final_list = self.dumplicate_fianl_check(base_list,b_log)
 
             exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),table_name)
             # log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
@@ -3929,7 +3964,7 @@ class Dataflow_dumplicate(Dataflow):
             remove_list = []
 
 
-            if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
+            if self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid)):
                 dtmp.setValue(document_tmp_save,1,True)
                 # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
                 dmp_docid = ",".join([str(a) for a in list(dup_docid)])
@@ -3953,26 +3988,25 @@ class Dataflow_dumplicate(Dataflow):
 
             list_docids = list(dup_docid)
             list_docids.append(best_docid)
-            b_log = False if upgrade else True
 
             if item.get(document_update_document)=="true":
                 dtmp.setValue(document_tmp_save,1,True)
 
+            list_merge_dump = []
             if exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0:
                 log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                 dtmp.setValue(document_tmp_projects,"[]",True)
             else:
-                dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log),True)
+                project_json,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
+                dtmp.setValue(document_tmp_projects,project_json,True)
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
 
             if upgrade:
-                if table_name=="document_tmp":
-                    self.changeSaveStatus(remove_list)
-
                 # print(dtmp.getProperties())
                 dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
                 dtmp.setValue(document_tmp_best_docid,best_docid,True)
                 _flag = dtmp.update_row(self.ots_client)
+
                 if not _flag:
                     for i in range(10):
                         list_proj_json = dtmp.getProperties().get(document_tmp_projects)
@@ -3981,6 +4015,11 @@ class Dataflow_dumplicate(Dataflow):
                             dtmp.setValue(document_tmp_projects,json.dumps(list_proj[:len(list_proj)//2]),True)
                             if dtmp.update_row(self.ots_client):
                                 break
+                if table_name=="document_tmp":
+                    self.changeSaveStatus(remove_list)
+                    self.changeSaveStatus(list_merge_dump)
+
+
 
 
             # log("dump takes %.2f"%(time.time()-start_time))
@@ -4053,7 +4092,7 @@ class Dataflow_dumplicate(Dataflow):
         schedule = BlockingScheduler()
         schedule.add_job(self.flow_dumplicate,"cron",second="*/10")
         schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/30")
-        schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
+        schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/15")
         schedule.add_job(self.flow_remove,"cron",hour="20")
         schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
         # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
@@ -4061,13 +4100,25 @@ class Dataflow_dumplicate(Dataflow):
 
     def changeSaveStatus(self,list_dict):
         for _dict in list_dict:
-            if _dict.get(document_tmp_save,1)==1:
-                _d = {"partitionkey":_dict["partitionkey"],
-                      "docid":_dict["docid"],
+            if isinstance(_dict,dict):
+                if _dict.get(document_tmp_save,1)==1:
+                    _d = {"partitionkey":_dict["partitionkey"],
+                          "docid":_dict["docid"],
+                          document_tmp_save:0
+                          }
+                    _d_tmp = Document_tmp(_d)
+                    if _d_tmp.exists_row(self.ots_client):
+                        _d_tmp.update_row(self.ots_client)
+            elif isinstance(_dict,int):
+                _d = {"partitionkey":_dict%500+1,
+                      "docid":_dict,
                       document_tmp_save:0
                       }
                 _d_tmp = Document_tmp(_d)
-                _d_tmp.update_row(self.ots_client)
+                if _d_tmp.fix_columns(self.ots_client,["status"],True):
+                    if _d_tmp.getProperties().get("status")==1:
+                        _d_tmp.setValue("status",0,True)
+                        _d_tmp.update_row(self.ots_client)
 
 
 
@@ -4175,8 +4226,10 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(339737931)
-    # df_dump.test_merge([292315564],[287890754])
+    df_dump.test_dumplicate(380763870
+                            )
+    # df_dump.test_merge([372841008
+    #                     ],[370595571])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
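Note: the changeSaveStatus change earlier in this file also accepts bare docids (the list returned by dumplicate_document_in_merge); for those it rebuilds the primary key from the docid and only downgrades rows whose status is still 1. A tiny sketch of that key derivation and guard, with the OTS row object reduced to callbacks (hypothetical helper names, not the repo's API):

    def to_primary_key(docid):
        # Tables here are partitioned by partitionkey = docid % 500 + 1.
        return {"partitionkey": docid % 500 + 1, "docid": docid}

    def downgrade_save(docid, fetch_status, update_row):
        """fetch_status(pk) -> current status or None; update_row(pk, changes) persists them."""
        pk = to_primary_key(docid)
        if fetch_status(pk) == 1:      # only touch rows still marked as kept
            update_row(pk, {"status": 0, "save": 0})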

+ 1 - 1
BaseDataMaintenance/maintenance/product/product_parameter.py

@@ -199,7 +199,7 @@ class Product_Attachment_Processor():
             list_product = list(set(list_product))
             dp = Document_product(item)
             if attachments is None or attachments=="" or len(list_product)==0:
-                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile)
+                dp.setValue(DOCUMENT_PRODUCT_PARAMETER_STATUS,parameter_status_no_bidfile,True)
                 dp.update_row(self.ots_client)
                 return
             list_attachment = json.loads(attachments)

+ 37 - 4
BaseDataMaintenance/maintenance/proposedBuilding/DataSynchronization.py

@@ -37,7 +37,7 @@ class DataSynchronization():
         columns = ["uuid","crtime","json_list_group"]
 
         rows, next_token, total_count, is_all_succeed = ots_client.search(self.proposedBuilding_table, self.proposedBuilding_table_index,
-                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.DESC)]), limit=100, get_total_count=True),
+                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                           ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
         list_data = getRow_ots(rows)
         for _data in list_data:
@@ -196,11 +196,43 @@ class DataSynchronization():
         mt=MultiThreadHandler(task_queue,_handle,None,30)
         mt.run()
 
+    def drop_data(self):
+        ots_client = getConnect_ots()
+
+        bool_query = BoolQuery(must_queries=[ExistsQuery("crtime")])
+
+        task_queue = queue.Queue()
+
+        rows, next_token, total_count, is_all_succeed = ots_client.search(self.proposedBuilding_table, self.proposedBuilding_table_index,
+                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.ASC)]), limit=100, get_total_count=True),
+                                                                          ColumnsToGet(return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+        for _data in list_data:
+            _proposed = proposedBuilding_tmp(_data)
+            task_queue.put(_proposed,True)
+            print(total_count,task_queue.qsize())
+        _count = len(list_data)
+        while next_token:
+            rows, next_token, total_count, is_all_succeed = ots_client.search(self.proposedBuilding_table, self.proposedBuilding_table_index,
+                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
+                                                                              ColumnsToGet(return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            for _data in list_data:
+                _proposed = proposedBuilding_tmp(_data)
+                task_queue.put(_proposed,True)
+                print(total_count,task_queue.qsize())
+
+        def _handle(item,result_queue):
+            item.delete_row(ots_client)
+
+        mt = MultiThreadHandler(task_queue,_handle,None,30)
+        mt.run()
+
     def scheduler(self):
         from BaseDataMaintenance.maintenance.major_project.unionDocument import MajorUnion
         mu = MajorUnion()
         _scheduler = BlockingScheduler()
-        _scheduler.add_job(self.maxcompute2ots,"cron",minute="*/8")
+        _scheduler.add_job(self.maxcompute2ots,"cron",minute="*/1")
         _scheduler.add_job(self.turn_stage,"cron",hour="*/5")
         _scheduler.add_job(mu.comsumer,"cron",minute="*/8")
         _scheduler.start()
@@ -215,8 +247,9 @@ def startSychro():
 if __name__=="__main__":
     ds = DataSynchronization()
     # # ds.scheduler()
-    # # ds.maxcompute2ots()
+    # ds.maxcompute2ots()
     # ds.turn_stage()
-    ds.fix_progress()
+    # ds.fix_progress()
+    ds.drop_data()
 
 

+ 3 - 1
BaseDataMaintenance/maxcompute/1.py

@@ -2084,7 +2084,9 @@ if __name__ == '__main__':
     # _str1 = "SXXY-ZBP-GG-2020002"
     # _str2 = "SXXY-ZBP-GG-2020002"
     # print(getSimilarityOfString(_str1,_str2))
-    print(check_doctitle("南京市秦淮新河沿线泰山公寓、天虹山庄、福润雅居南区小区环境综合整治","(雨花台区)秦淮新河沿线泰山公寓、天虹山庄、福润雅居南区小区环境综合整治勘察设计"))
+    # print(check_doctitle("南京市秦淮新河沿线泰山公寓、天虹山庄、福润雅居南区小区环境综合整治","(雨花台区)秦淮新河沿线泰山公寓、天虹山庄、福润雅居南区小区环境综合整治勘察设计"))
+    print(type({52,101,118,119,120}))
+    print((1 if 1==1 else 2) + (1 if 1==1 else 2))
     # print(check_product(None,None))
     # print(check_code("4451020073383382206021325","4451020073383382206021322"))
     # print(check_money("550.0","440.0","",""))

+ 11 - 3
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -889,7 +889,7 @@ code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
 num_pattern = re.compile("^\d+(?:\.\d+)?$")
 num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
-building_pattern = "工程招标代理|工程设计|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
+building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
 def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
     if code_greater is None:
@@ -990,10 +990,18 @@ def check_product(product_less,product_greater,split_char=","):
 
         _product_l = product_less.split(split_char)
         _product_g = product_greater.split(split_char)
+        same_count = 0
+        if len(_product_g)>len(_product_l):
+            a = _product_g
+            _product_g = _product_l
+            _product_l = a
         for _l in _product_l:
             for _g in _product_g:
                 if getSimilarityOfString(_l,_g)>=0.8:
-                    return True
+                    same_count += 1
+                    break
+        if same_count/len(_product_l)>0.5:
+            return True
         return False
     return True
 
@@ -1024,7 +1032,7 @@ def check_time(json_time_less,json_time_greater):
             if getLength(v)>0:
                 v1 = time_greater.get(k,"")
                 if getLength(v1)>0:
-                    if v!=v1:
+                    if v[:10]!=v1[:10]:
                         return False
     return True
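Note: the check_product change above replaces "any single pair similar" with a majority rule: iterate over the longer product list and require more than half of its items to have a close match in the other list. A self-contained sketch of that rule; difflib is used here only as a stand-in for the repo's getSimilarityOfString:

    from difflib import SequenceMatcher

    def similarity(a, b):
        # Stand-in for getSimilarityOfString.
        return SequenceMatcher(None, a, b).ratio()

    def check_product(product_less, product_greater, split_char=","):
        if not product_less or not product_greater:
            return True
        a, b = product_less.split(split_char), product_greater.split(split_char)
        longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
        same_count = sum(1 for x in longer if any(similarity(x, y) >= 0.8 for y in shorter))
        return same_count / len(longer) > 0.5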
 

+ 156 - 76
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -1707,12 +1707,33 @@ def generate_packages_properties(list_docs):
     for _doc in list_docs:
         _dict = {}
         sub_docs = _doc.get("sub_docs")
+
+
         if sub_docs is not None:
             for _d in sub_docs:
                 sub_project_code = _d.get(project_sub_project_code,"")
                 sub_project_name = _d.get(project_sub_project_name,"")
                 win_tenderer = _d.get(project_win_tenderer,"")
                 win_bid_price = _d.get(project_win_bid_price,"")
+
+                if sub_project_name=="Project":
+
+                    win_exists = False
+                    win_price_exists = False
+                    win_sum = 0
+                    for _d1 in sub_docs:
+                        if _d.get(project_sub_project_name,"")=="Project":
+                            continue
+                        if _d1.get(project_win_tenderer,"")==win_tenderer:
+                            win_exists = True
+                        if _d1.get(project_win_tenderer,"")==win_tenderer and _d1.get(project_win_bid_price,"")!="":
+                            win_sum += float(_d1.get(project_win_bid_price,0))
+                        if _d1.get(project_win_bid_price,"")==win_bid_price:
+                            win_price_exists = True
+                    if win_exists and (win_price_exists or win_bid_price=="" or float(win_bid_price)==0 or float(win_bid_price)==win_sum):
+                        continue
+
+
                 _key = "%s-%s-%s-%s"%(sub_project_code,sub_project_name,win_tenderer,win_bid_price)
                 if _key in set_key:
                     continue
@@ -2110,6 +2131,7 @@ def dumplicate_projects(list_projects,b_log=False):
     list_projects.sort(key=lambda x:x.get("keyvaluecount",0),reverse=True)
     cluster_projects = list_projects[:50]
     _count = 10
+    print("dumplicate projects rest",len(cluster_projects))
     while _count>0:
         _count -= 1
         _update = False
@@ -2140,7 +2162,7 @@ def dumplicate_projects(list_projects,b_log=False):
             break
         cluster_projects = list_p
 
-
+    print("dumplicate projects rest",len(cluster_projects))
     return cluster_projects
 
 def update_projects_by_project(project_dict,projects):
@@ -2277,7 +2299,7 @@ def check_time_merge(json_time_less,json_time_greater,b_log,set_time_key=set([pr
                 if getLength(v)>0:
                     v1 = time_greater.get(k,"")
                     if getLength(v1)>0:
-                        _dis = getTimeStamp(v)-getTimeStamp(v1)
+                        _dis = getTimeStamp(v[:10])-getTimeStamp(v1[:10])
                         if _dis>86400*5 or _dis<-86400*5:
                             if b_log:
                                 log("check time failed %s-%s-%s"%(str(k),str(v),str(v1)))
@@ -2395,7 +2417,7 @@ def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,
         if tenderee in enterprise_to_merge or tenderee_to_merge in enterprise:
             pass
         else:
-            if getSimilarityOfString(tenderee,tenderee_to_merge)==1:
+            if getSimilarityOfString(re.sub("[省市]",'',tenderee),re.sub("[省市]",'',tenderee_to_merge))==1:
                 pass
             else:
                 if b_log:
@@ -2403,18 +2425,21 @@ def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,
                 return -1
     _set2 = set([a for a in [agency,agency_to_merge] if a!=""])
     if len(_set2)>1:
-        if getSimilarityOfString(agency,agency_to_merge)==1:
+        if agency in enterprise_to_merge or agency_to_merge in enterprise:
             pass
         else:
-            if b_log:
-                log("check agency failed %s===%s"%(str(agency),str(agency_to_merge)))
-            return -1
+            if getSimilarityOfString(re.sub("[省市]",'',agency),re.sub("[省市]",'',agency_to_merge))==1:
+                pass
+            else:
+                if b_log:
+                    log("check agency failed %s===%s"%(str(agency),str(agency_to_merge)))
+                return -1
     _set3 = set([a for a in [win_tenderer,win_tenderer_to_merge] if a!=""])
     if len(_set3)>1:
         if win_tenderer in enterprise_to_merge or win_tenderer_to_merge in enterprise:
             pass
         else:
-            if getSimilarityOfString(win_tenderer,win_tenderer_to_merge)==1:
+            if getSimilarityOfString(re.sub("[省市]",'',win_tenderer),re.sub("[省市]",'',win_tenderer_to_merge))==1:
                 pass
             else:
                 if b_log:
@@ -2445,16 +2470,23 @@ def check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_b
         return -1
 
     _set1 = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
+
     if len(_set1)>1:
         if b_log:
             log("check win_bid_price failed %s===%s"%(str(win_bid_price),str(win_bid_price_to_merge)))
         return -1
     #check money
+    if len(_set)==1 and len(_set1)==0:
+        if (bidding_budget>0 and bidding_budget_to_merge>0):
+            return 1
+
 
     if len(_set)==1 and len(_set1)==1:
         max_win_bid_price = max(_set1)
         max_bidding_budget = max(_set)
         radio = max_win_bid_price/max_bidding_budget
+        if (bidding_budget>0 and bidding_budget_to_merge>0) or (win_bid_price>0 and win_bid_price_to_merge>0):
+            return 1
         #允许中标金额大于预算10%
         if max_win_bid_price>max_bidding_budget*(1.1):
             if b_log:
@@ -2464,9 +2496,8 @@ def check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_b
             if radio<0.3:
                 if b_log:
                     log("check money failed radio<0.3 %s===%s"%(str(max(_set1)),str(max(_set))))
-                return -1
-        if (bidding_budget>0 and bidding_budget_to_merge>0) or (win_bid_price>0 and win_bid_price_to_merge>0):
-            return 1
+                return 0
+                # return -1
     return 0
 
 def check_project_codes_merge(list_code,list_code_to_merge,b_log):
@@ -2475,6 +2506,8 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
     has_similar = False
     for _c in list_code[:100]:
         for _c1 in list_code_to_merge[:100]:
+            _c = str(_c).replace("【","[").replace("】","]")
+            _c1 = str(_c1).replace("【","[").replace("】","]")
             _simi = getSimilarityOfString(_c,_c1,3)
             if _simi==1:
                 has_same = True
@@ -2493,7 +2526,7 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
     return 0
 
 
-def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=False,simple_check=False):
+def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=False,simple_check=False):
     docids = _proj.get(project_docids,"")
     page_time = _proj.get(project_page_time,"")
     project_codes = _proj.get(project_project_codes,"")
@@ -2511,6 +2544,8 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
 
     project_dynamics = _proj.get(project_project_dynamics)
 
+
+
     enterprise = _proj.get("enterprise")
     if enterprise is None:
         try:
@@ -2541,8 +2576,16 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
     zhao_biao_page_time_to_merge = _dict.get(project_zhao_biao_page_time,"")
     zhong_biao_page_time_to_merge = _dict.get(project_zhong_biao_page_time,"")
 
+
     project_dynamics_to_merge = _dict.get(project_project_dynamics)
 
+    is_few = False
+    if (0 if project_codes=="" else 1) + (0 if project_name=="" else 1) + (0 if bidding_budget<0 else 1) +(0 if tenderee=="" else 1) + (0 if win_bid_price<0 else 1) + (0 if win_tenderer=="" else 1)<=1:
+        is_few = True
+    if (0 if project_codes_to_merge=="" else 1) + (0 if project_name_to_merge=="" else 1) + (0 if bidding_budget_to_merge<0 else 1) +(0 if tenderee_to_merge=="" else 1) + (0 if win_bid_price_to_merge<0 else 1) + (0 if win_tenderer_to_merge=="" else 1)<=1:
+        is_few = True
+
+
     list_code_to_merge = [a for a in project_codes_to_merge.split(",") if a!='']
     if project_code_to_merge!="":
         list_code_to_merge.append(project_code_to_merge)
@@ -2571,30 +2614,45 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
             return False,0
         return False
 
+
     #事件判断-金额
     _money_check = check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log)
     check_dict[_money_check] += 1
-    if check_dict[-1]>0:
-        if return_prob:
-            return False,0
-        return False
     prob_count += _money_check
 
     #人物判断-角色
     _roles_check = check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log)
     check_dict[_roles_check] += 1
-    if check_dict[-1]>0:
-        if return_prob:
-            return False,0
-        return False
+
     prob_count += _roles_check
-    _product_check = check_product_merge(product,product_to_merge,b_log)
 
-    prob_count += _product_check*2
+
+    _product_check = check_product_merge(product,product_to_merge,b_log)
     _project_name_check = check_project_name_merge(project_name,project_name_to_merge,b_log)
-    prob_count += _project_name_check
     _title_check = check_dynamics_title_merge(project_dynamics,project_dynamics_to_merge,b_log)
-    prob_count += _title_check
+
+    #事件判断-编号
+    _codes_check = check_project_codes_merge(list_code,list_code_to_merge,b_log)
+    check_dict[_codes_check] += 1
+
+    prob_count += _codes_check
+
+    if is_few:
+        if _codes_check!=1:
+            if _title_check!=1:
+                if return_prob:
+                    return False,0
+                return False
+            if len(enterprise)>0 and len(enterprise_to_merge)>0:
+                if len(enterprise & enterprise_to_merge)==0:
+                    if return_prob:
+                        return False,0
+                    return False
+            if _product_check==-1:
+                if return_prob:
+                    return False,0
+                return False
+
     min_count = 2
     if product=="" or product_to_merge=="":
         min_count = 1
@@ -2604,12 +2662,12 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
             log("project_name,project_name_to_merge %s %s"%(project_name,project_name_to_merge))
             log("product,product_to_merge %s %s"%(product,product_to_merge))
             log("check _project_name_check+_product_check+_title_check<2 failed %d %s,%s,%s"%(_project_name_check+_product_check+_title_check,str(_project_name_check),str(_product_check),str(_title_check)))
-        if return_prob:
-            return False,0
-        return False
+        # if return_prob:
+        #     return False,0
+        # return False
+        prob_count += -1
     else:
-        check_dict[1] += 1
-        check_dict[1] += 1
+        prob_count += 2
 
     if simple_check:
         if return_prob:
@@ -2617,14 +2675,6 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
             return True,_prob
         return True
 
-    #事件判断-编号
-    _codes_check = check_project_codes_merge(list_code,list_code_to_merge,b_log)
-    check_dict[_codes_check] += 1
-    if check_dict[-1]>0:
-        if return_prob:
-            return False,0
-        return False
-    prob_count += _codes_check
 
     #时间判断-其他时间
     _time_check = check_time_merge(_proj,_dict,b_log)
@@ -2632,6 +2682,10 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
 
     #时间判断-分包编号
     _sub_project_name_check = check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log)
+    if docids==docids_to_merge and _sub_project_name_check==-1:
+        if return_prob:
+            return False,0
+        return False
     check_dict[_sub_project_name_check] += 1
     prob_count += _sub_project_name_check
 
@@ -2642,18 +2696,23 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
 
     _prob = prob_count/8
 
+    if b_log:
+        log("check %s-%s result%s"%(docids,docids_to_merge,str(check_dict)))
     if _prob<0.15:
         if b_log:
-            log("prob less than 0.15")
+            log("prob less than 0.15 prob_count:%d"%(prob_count))
         if return_prob:
             return False,_prob
         return False
 
-    if b_log:
-        log("check %s-%s result%s"%(docids,docids_to_merge,str(check_dict)))
+
     if check_dict[-1]>0:
         if check_dict[-1]==1:
-            if (_codes_check==1 and _roles_check==1 and _product_check==1) or (_roles_check==1 and _money_check==1 and _product_check==1):
+            if _roles_check==-1:
+                if return_prob:
+                    return False,0
+                return False
+            if (_codes_check==1 and _roles_check==1 and _product_check==1 and _money_check>=0) or (_roles_check==1 and _money_check==1 and _product_check==1) or (_money_check==1 and _product_check==1 and _codes_check==1) or (_money_check>=0 and _roles_check==1 and _codes_check==1 and (_title_check==1 or _project_name_check==1 or _product_check==1)):
                 if return_prob:
                     return True,_prob
                 return True
@@ -2859,18 +2918,20 @@ def get_page_time_dis(page_time,n_page_time):
 
 def check_page_time_dup(page_time,n_page_time):
     _dis = get_page_time_dis(page_time,n_page_time)
-    if _dis>=0 and _dis<=10:
+    if _dis>=0 and _dis<=20:
         return True
     return False
 
 
-def dumplicate_document_in_merge(list_projects):
+def dumplicate_document_in_merge(list_projects,dup_docid):
     '''
     合并时去重
     :param list_projects:
     :return:
     '''
 
+    dup_docid = set(dup_docid)
+    set_dup_total = set()
     for _proj in list_projects:
         try:
             docids = _proj.get(project_docids,"")
@@ -2882,48 +2943,65 @@ def dumplicate_document_in_merge(list_projects):
             _time = time.time()
             for _d in list_dynamics:
                 docid = _d.get(document_docid)
+                doctitle = _d.get(document_doctitle,"")
+                title_search = re.search("[一二三四五六七八九十1-9]+(?:次|标|包)",doctitle)
                 if str(docid) not in set_docids:
                     continue
+
+                if docid in dup_docid:
+                    continue
                 _status = _d.get(document_status,201)
                 is_multipack = _d.get("is_multipack",True)
                 extract_count = _d.get(document_tmp_extract_count,0)
                 docchannel = _d.get(document_docchannel,0)
                 page_time = _d.get(document_page_time,"")
-                if _status>=401 and _status<=450:
-                    set_dup_docid.add(str(docid))
-                # if docchannel>0:
-                #     if docchannel in dict_channel_proj:
-                #         n_d = dict_channel_proj[docchannel]
-                #         n_docid = n_d.get(document_docid)
-                #         n_is_multipack = n_d.get("is_multipack",True)
-                #         n_extract_count = n_d.get(document_tmp_extract_count,0)
-                #         n_page_time = n_d.get(document_page_time,"")
-                #         if docid==n_docid:
-                #             continue
-                #         if not check_page_time_dup(page_time,n_page_time):
-                #             continue
-                #
-                #         if extract_count>n_extract_count:
-                #             n_d[document_status] = 401
-                #             set_dup_docid.add(str(n_docid))
-                #             dict_channel_proj[docchannel] = _d
-                #         elif extract_count==n_extract_count:
-                #             if int(n_docid)>int(docid):
-                #                 n_d[document_status] = 401
-                #                 set_dup_docid.add(str(n_docid))
-                #                 dict_channel_proj[docchannel] = _d
-                #             elif int(n_docid)<int(docid):
-                #                 _d[document_status] = 401
-                #                 set_dup_docid.add(str(docid))
-                #         else:
-                #             _d[document_status] = 401
-                #             set_dup_docid.add(str(docid))
-                #         if not is_multipack and not n_is_multipack:
-                #             pass
-                #     else:
-                #         dict_channel_proj[docchannel] = _d
+                # if _status>=401 and _status<=450:
+                #     print(":1",docid)
+                #     set_dup_docid.add(str(docid))
+                if docchannel in {52,101,118,119,120} and extract_count>5:
+                    if docchannel in dict_channel_proj:
+                        n_d = dict_channel_proj[docchannel]
+                        n_docid = n_d.get(document_docid)
+                        n_is_multipack = n_d.get("is_multipack",True)
+                        n_extract_count = n_d.get(document_tmp_extract_count,0)
+                        n_page_time = n_d.get(document_page_time,"")
+                        n_doctitle = n_d.get(document_doctitle,"")
+                        if docid==n_docid:
+                            continue
+                        if not check_page_time_dup(page_time,n_page_time):
+                            continue
+                        if is_multipack or n_is_multipack:
+                            continue
+                        n_title_search = re.search("[一二三四五六七八九十1-9]+(?:次|标|包)",n_doctitle)
+                        if title_search is None and n_title_search is None:
+                            pass
+                        elif title_search is not None and n_title_search is not None and str(title_search.group())==str(n_title_search.group()):
+                            pass
+                        else:
+                            continue
+
+                        if extract_count>n_extract_count:
+                            n_d[document_status] = 401
+                            set_dup_docid.add(str(n_docid))
+                            dict_channel_proj[docchannel] = _d
+                        elif extract_count==n_extract_count:
+                            if int(n_docid)>int(docid):
+                                n_d[document_status] = 401
+                                set_dup_docid.add(str(n_docid))
+                                dict_channel_proj[docchannel] = _d
+                            elif int(n_docid)<int(docid):
+                                _d[document_status] = 401
+                                set_dup_docid.add(str(docid))
+                        else:
+                            _d[document_status] = 401
+                            set_dup_docid.add(str(docid))
+                        if not is_multipack and not n_is_multipack:
+                            pass
+                    else:
+                        dict_channel_proj[docchannel] = _d
 
             set_docids = set_docids-set_dup_docid
+            set_dup_total |= set_dup_docid
             if len(set_docids)==0:
                 log("projects set_docids length is zero %s"%(docids))
             else:
@@ -2932,8 +3010,10 @@ def dumplicate_document_in_merge(list_projects):
             _proj[project_docid_number] = len(set_docids)
             _proj[project_dup_docid] = ",".join(list(set_dup_docid))
             # log("dumplicate_document docid%s dynamic %d takes%.3f"%(str(docid),len(list_dynamics),time.time()-_time))
+
         except Exception as e:
             traceback.print_exc()
+    return list(set_dup_total)
 
 @annotate('string,string->string')
 class f_dumplicate_projects(BaseUDAF):
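Note: dumplicate_document_in_merge now dedups inside a merged project by docchannel: for channels {52,101,118,119,120} with a rich extraction (extract_count>5) and no multi-package flag, only one document per channel is kept, preferring the higher extract_count and, on ties, the smaller docid; the losers get status 401 and are returned so the caller can flip their save status. A compressed sketch of just that per-channel selection, with the page_time proximity and "第N次/标/包" title guards and the dup_docid exclusion omitted (illustrative only):

    def pick_channel_winners(dynamics, channels=(52, 101, 118, 119, 120)):
        """Return (winner_per_channel, dup_docids) for one merged project's dynamics."""
        best = {}           # docchannel -> best document dict seen so far
        dup_docids = set()
        for d in dynamics:
            ch = d.get("docchannel", 0)
            if ch not in channels or d.get("extract_count", 0) <= 5 or d.get("is_multipack", True):
                continue
            cur = best.get(ch)
            if cur is None:
                best[ch] = d
                continue
            # Prefer the richer extraction; on ties keep the smaller docid.
            if (d.get("extract_count", 0), -d.get("docid", 0)) > (cur.get("extract_count", 0), -cur.get("docid", 0)):
                dup_docids.add(cur.get("docid"))
                best[ch] = d
            else:
                dup_docids.add(d.get("docid"))
        return best, dup_docids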

+ 7 - 7
BaseDataMaintenance/model/ots/BaseModel.py

@@ -48,13 +48,6 @@ class BaseModel():
                 _list.append((_key,_v))
         return _list
 
-    def getPrimaryKey_turple(self):
-        _list = []
-        for _key in self.getPrimary_keys():
-            _list.append((_key,self.getProperties().get(_key)))
-        return _list
-
-
     @staticmethod
     def search(ots_client,table_name,key_tuple,columns_to_get):
         try:
@@ -74,6 +67,13 @@ class BaseModel():
             traceback.print_exc()
             log("get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s" % (str(e.get_http_status()), e.get_error_code(), e.get_error_message(), e.get_request_id()))
 
+
+    def getPrimaryKey_turple(self):
+        _list = []
+        for _key in self.getPrimary_keys():
+            _list.append((_key,self.getProperties().get(_key)))
+        return _list
+
     def fix_columns(self,ots_client,columns_to_fix,_flag):
         _dict = self.search(ots_client,self.table_name,self.getPrimaryKey_turple(),columns_to_fix)
         if _dict is not None:

+ 26 - 8
BaseDataMaintenance/model/ots/designed_project.py

@@ -21,13 +21,28 @@ class designed_project(BaseModel):
         for _spid in spids.split(","):
             should_q.append(TermQuery("spids",_spid))
 
-        bool_query = BoolQuery(should_queries=should_q)
-        columns = ["docids"]
-        rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
-                                                                          SearchQuery(bool_query, limit=100,get_total_count=True),
-                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
-        list_dict = getRow_ots(rows)
-        return list_dict
+        _begin = 0
+        _step = 20
+        list_dict = []
+        while 1:
+            _end = _begin +_step
+            bool_query = BoolQuery(should_queries=should_q[_begin:_end])
+            columns = ["status"]
+            rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
+                                                                              SearchQuery(bool_query, limit=100,get_total_count=True),
+                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+            list_dict.extend(getRow_ots(rows))
+            _begin = _end
+            if _begin>=len(should_q):
+                break
+        list_dict_final = []
+        set_id = set()
+        for _dict in list_dict:
+            if _dict.get("id","") in set_id:
+                continue
+            list_dict_final.append(_dict)
+            set_id.add(_dict.get("id",""))
+        return list_dict_final
 
     def getAttribute_turple(self):
         _list = []
@@ -51,7 +66,10 @@ class designed_project(BaseModel):
         if len(list_dict)>0:
             for _dict in list_dict[1:]:
                 _designed_delete = designed_project(_dict)
-                _designed_delete.delete_row(ots_client)
+
+                _designed_delete.setValue("status","404",True)
+                _designed_delete.update_project(ots_client)
+                # _designed_delete.delete_row(ots_client)
 
             _designed_update = designed_project(list_dict[0])
             properties = _designed_update.getProperties()
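Note: the search above now pages its TermQuery("spids", ...) should-queries in chunks of 20 instead of issuing one large BoolQuery, which keeps each OTS request under the should-clause limit, and then drops duplicate rows by id. A generic sketch of that chunk-then-dedup pattern, with the search call reduced to a placeholder callback (the OTS client objects are assumptions here):

    def search_in_chunks(should_queries, run_search, chunk_size=20):
        """run_search(queries) -> list of row dicts; results are deduped by 'id'."""
        results, seen = [], set()
        for i in range(0, len(should_queries), chunk_size):
            for row in run_search(should_queries[i:i + chunk_size]):
                row_id = row.get("id", "")
                if row_id in seen:
                    continue
                seen.add(row_id)
                results.append(row)
        return results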

+ 130 - 1
BaseDataMaintenance/model/ots/document.py

@@ -592,8 +592,137 @@ def delete_documents():
     print("delete count:%d"%_count)
 
 
+def turn_document_docchannel():
+    from BaseDataMaintenance.dataSource.source import getConnect_ots
+    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+    import queue
+    from threading import Thread
+    import json
+    task_queue = queue.Queue()
+    from BaseDataMaintenance.model.ots.attachment import attachment_filemd5,attachment_file_title,attachment_file_link
+    ots_client = getConnect_ots()
+    def producer(task_queue,ots_client):
+
+        bool_query = BoolQuery(
+            must_queries=[
+                TermQuery("web_source_no","DX007520-7"),
+                # TermQuery("docid",363793104)
+                # MatchPhraseQuery("doctitle","珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
+                # BoolQuery(should_queries=[
+                #                           # TermQuery("tenderee","山西利民工业有限责任公司"),
+                #                           # MatchPhraseQuery("doctitle","中国电信"),
+                #                           # MatchPhraseQuery("doctextcon","中国电信"),
+                #                           # MatchPhraseQuery("attachmenttextcon","中国电信")]),
+                #                           # RangeQuery(document_status,88,120,True,True),
+                #                           RangeQuery("page_time","2022-03-24","2022-03-25",True,False),
+                #                           ExistsQuery
+                #                                  #,TermQuery(document_docid,171146519)
+                #                                  ]
+                # )
+            ],
+            # must_not_queries=[WildcardQuery("DX004354*")]
+        )
+
+        # bool_query = BoolQuery(
+        #     # must_queries=[
+        #     #     RangeQuery("crtime","2023-08-30 15:00:00","2023-08-30 23:59:59"),
+        #     #     NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
+        #     # ],
+        #     # must_not_queries=[WildcardQuery("attachmenttextcon","*")],
+        #     should_queries=[
+        #         NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","个体工商户")),
+        #         NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer","机械设备")),
+        #     ]
+        #
+        # )
+
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                       columns_to_get=ColumnsToGet(["detail_link"],return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+        print(total_count)
+        _count = len(list_data)
+        for _data in list_data:
+            _document = Document(_data)
+            task_queue.put(_document)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(["detail_link"],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            _count += len(list_data)
+            print("%d/%d"%(_count,total_count))
+            for _data in list_data:
+                _document = Document(_data)
+                task_queue.put(_document)
+
+        # docids = [223820830,224445409]
+        # for docid in docids:
+        #     _dict = {document_docid:int(docid),
+        #              document_partitionkey:int(docid)%500+1,
+        #              }
+        #     task_queue.put(Document(_dict))
+        # import pandas as pd
+        # df = pd.read_excel("G:\\20221212error.xlsx")
+        # for docid in df["docid"]:
+        #     _dict = {document_docid:int(docid),
+        #              document_partitionkey:int(docid)%500+1,
+        #              }
+        #     task_queue.put(Document(_dict))
+        log("task_queue size:%d"%(task_queue.qsize()))
+
+    def _handle(item,result_queue,ots_client):
+        #change attach value
+        # list_attachment = json.loads(item.getProperties().get(document_attachment_path))
+        # print("docid",item.getProperties().get(document_docid))
+        # for attach in list_attachment:
+        #
+        #     filemd5 = attach.get(document_attachment_path_filemd5,"")
+        #     _document_html = item.getProperties().get(document_dochtmlcon,"")
+        #
+        #     _file_title = item.getTitleFromHtml(filemd5,_document_html)
+        #     filelink = item.getSourceLinkFromHtml(filemd5,_document_html)
+        #     attach[document_attachment_path_fileTitle] = _file_title
+        #     attach[document_attachment_path_fileLink] = filelink
+        #
+        # item.setValue(document_attachment_path,json.dumps(list_attachment,ensure_ascii=False),True)
+        # item.all_columns.remove(document_dochtmlcon)
+
+        #change status
+        # item.setValue(document_docchannel,item.getProperties().get(document_original_docchannel),True)
+        # item.setValue(document_status,random.randint(151,171),True)
+        # item.setValue(document_area,"华南",True)
+        # item.setValue(document_province,"广东",True)
+        # item.setValue(document_city,"珠海",True)
+        # item.setValue(document_district,"金湾区",True)
+        # item.setValue(document_status,1,True)
+        # print(item.getProperties())
+        # item.update_row(ots_client)
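+        # Re-tag documents whose detail_link contains the column code "/012002002/" to docchannel 101,
+        # overwriting both the current and the original channel fields.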
+        detail_link = item.getProperties().get("detail_link","")
+        if "/012002002/" in detail_link:
+            partitionkey = item.getProperties().get("partitionkey")
+            docid = item.getProperties().get("docid")
+            _dict = {document_partitionkey:partitionkey,
+                     document_docid:docid,
+                     document_docchannel:101,
+                     document_original_docchannel:101}
+            doc = Document(_dict)
+            doc.update_row(ots_client)
+            print(_dict)
+
+        # log("update %d status done"%(item.getProperties().get(document_docid)))
+        pass
+
+
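+    # Fill the queue with the producer first, then drain it with 30 worker threads sharing one OTS client.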
+    t_producer = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
+    t_producer.start()
+    t_producer.join()
+    mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client)
+    mt.run()
+
 if __name__=="__main__":
     # turn_extract_status()
-    turn_document_status()
+    # turn_document_status()
     # drop_extract2()
     # fixDocumentHtml()
+    turn_document_docchannel()

+ 31 - 14
BaseDataMaintenance/model/ots/document_tmp.py

@@ -243,7 +243,6 @@ def turn_extract_status():
     mt.run()
 
 
-
 def turn_document_tmp_status():
     from BaseDataMaintenance.dataSource.source import getConnect_ots
     from BaseDataMaintenance.common.multiThread import MultiThreadHandler
@@ -253,13 +252,22 @@ def turn_document_tmp_status():
     task_queue = queue.Queue()
     from BaseDataMaintenance.model.ots.attachment import attachment_filemd5,attachment_file_title,attachment_file_link
     ots_client = getConnect_ots()
-    def producer(task_queue,ots_client):
 
+    def producer1(task_queue,ots_client):
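+        # NOTE: 'a' is assumed to be a newline-separated string of docids defined elsewhere in this module;
+        # each id is re-queued as a Document_tmp with status 66.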
+        for l_a in a.split("\n"):
+            l_a = l_a.strip()
+            if l_a !="":
+                task_queue.put(Document_tmp({document_tmp_partitionkey:int(l_a)%500+1,
+                                             document_tmp_docid:int(l_a),
+                                             document_tmp_status:66}))
+
+    def producer(task_queue,ots_client):
 
         bool_query = BoolQuery(
             must_queries=[
-                TermQuery("fingerprint","md5=2cc044b81ec13acddcc970b71b780365")
-                # RangeQuery("status",66,71),
+                # TermQuery("fingerprint","md5=2cc044b81ec13acddcc970b71b780365")
+                TermQuery("save",1),
+                RangeQuery("status",72),
                 # BoolQuery(should_queries=[
                 #                           # TermQuery("tenderee","山西利民工业有限责任公司"),
                 #                           # MatchPhraseQuery("doctitle","中国电信"),
@@ -272,16 +280,16 @@ def turn_document_tmp_status():
                 #                                  ]
                 # )
             ],
-            must_not_queries=[
-                TermQuery("docid",288599518)
-                # ExistsQuery("status"),
-                # ExistsQuery("page_time"),
-                              ]
+            # must_not_queries=[
+            #     TermQuery("docid",288599518)
+            #     # ExistsQuery("status"),
+            #     # ExistsQuery("page_time"),
+            #                   ]
         )
 
         rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-                                                                       columns_to_get=ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
+                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
         list_data = getRow_ots(rows)
         print(total_count)
         # print(list_data)
@@ -292,7 +300,7 @@ def turn_document_tmp_status():
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                           columns_to_get=ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
+                                                                           columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
             list_data = getRow_ots(rows)
             _count += len(list_data)
             print("%d/%d"%(_count,total_count))
@@ -342,13 +350,22 @@ def turn_document_tmp_status():
         # json.loads(_extract_json)
         # item.setValue(document_tmp_status,71,True)
         # item.setValue(document_tmp_save,1,True)
-        print(item.getProperties())
+        # if item.exists_row(ots_client):
+        #     item.update_row(ots_client)
+        # print(item.getProperties())
         # item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
-        item.delete_row(ots_client)
+        # item.delete_row(ots_client)
+        from BaseDataMaintenance.model.ots.document import Document
+
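+        # Look up the final status of this docid in the main document table; if it ended up >= 401
+        # (interpreted here, from the "redo" log line, as a failed state), reset the tmp record to
+        # status 66 so it is picked up and processed again.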
+        Doc = Document(item.getProperties())
+        if Doc.fix_columns(ots_client,["status"],True):
+            if Doc.getProperties().get("status",0)>=401:
+                print(Doc.getProperties().get("docid"),"redo")
+                item.setValue("status",66,True)
+                item.update_row(ots_client)
         pass
 
-
     t_producer = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
     t_producer.start()
     t_producer.join()

The diff for this file is not shown because the file is too large
+ 135 - 73
BaseDataMaintenance/model/ots/proposedBuilding_tmp.py


Some files are not shown because too many files were changed in this diff