luojiehua
/
DataMining


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
							#encoding:GBK
import sys
import os
sys.path.append("../")

import pandas as pd
from dataSource.source import *
import json
from utils.multiThread import MultiThreadHandler
import queue
from utils.Utils import *
from dataSource.pool import ConnectorPool
import re
from tablestore import *
import traceback
from utils.hashUtil import aesCipher
from export.exportEnterprise import getDictEnterprise,getOneContact


# Column names already registered through set_dict_item (membership test only).
set_columns = set()
# The same column names, passed through getLegal_str, in first-seen order;
# exportDocument_by_pagetime uses this list as the column order of the Excel file.
list_df_columns = []

def set_dict_item(_dict,name,v):
    """Store ``v`` under ``name`` in ``_dict`` and register ``name`` as an export column.

    The value is passed through ``getLegal_str`` before it is stored. The first
    time a given ``name`` is seen it is recorded in the module-level
    ``set_columns`` and its ``getLegal_str`` form is appended to the
    module-level ``list_df_columns``, so the latter keeps first-seen order.

    :param _dict: row dictionary that receives the value (mutated in place)
    :param name: column name
    :param v: raw cell value
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns:
        # Column already known: nothing more to register.
        return
    set_columns.add(name)
    list_df_columns.append(getLegal_str(name))

def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
    """Same as ``set_dict_item`` but with caller-supplied column registries.

    The value is passed through ``getLegal_str`` and stored in ``_dict``. If
    ``name`` has not been registered in ``set_columns1`` yet, it is added there
    and its ``getLegal_str`` form is appended to ``list_df_columns1``.

    :param set_columns1: set of column names seen so far (mutated in place)
    :param list_df_columns1: ordered list of column names (mutated in place)
    :param _dict: row dictionary that receives the value (mutated in place)
    :param name: column name
    :param v: raw cell value
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns1:
        # Column already known: nothing more to register.
        return
    set_columns1.add(name)
    list_df_columns1.append(getLegal_str(name))

def getTenderee(contacts):
    """Pick the owner ("业主单位") contact out of a JSON-encoded contact list.

    Selection order:

    1. the first owner entry whose ``cellphone`` ends (after the last space)
       in an 11-digit number starting with ``1``; its ``cellphone`` is returned;
    2. otherwise the first owner entry, with its ``phone`` field;
    3. otherwise three empty strings.

    :param contacts: JSON text of a list of contact dicts; ``None`` or an empty
        string is treated as "no contacts"
    :return: tuple ``(company_name, contact_name, phone)``; an element is
        ``None`` when the chosen entry lacks that key
    :raises json.JSONDecodeError: if ``contacts`` is non-empty but not valid JSON
    """
    if not contacts:
        # Missing column / empty value: json.loads would raise on these.
        return "","",""
    list_contacts = json.loads(contacts)
    if not isinstance(list_contacts,list):
        return "","",""

    # Only dict entries can carry a "type"; anything else is ignored.
    list_owner = [_contact for _contact in list_contacts
                  if isinstance(_contact,dict) and _contact.get("type")=="业主单位"]

    # Prefer an owner entry that carries a mobile number.
    for _contact in list_owner:
        _phone = _contact.get("cellphone")
        if _phone and re.search(r"^1\d{10}$",_phone.split(" ")[-1]) is not None:
            return _contact.get("company_name"),_contact.get("contact_name"),_contact.get("cellphone")

    # Fall back to the first owner entry and its "phone" field.
    if list_owner:
        _contact = list_owner[0]
        return _contact.get("company_name"),_contact.get("contact_name"),_contact.get("phone")
    return "","",""

def exportDocument_by_pagetime():
    """Export keyword-matched rows of the ``designed_project`` table to Excel.

    Queries the ``designed_project_index`` search index with a bool query made
    of two ``must`` clauses: a ``page_time`` range from ``2020-01-01`` to
    ``2022-01-01`` and a ``should`` group of phrase matches of the hard-coded
    keywords against ``full_text``. Results are fetched 100 rows per page,
    sorted by ``id`` ascending, following ``next_token`` until it is exhausted.

    Every row is flattened into the export columns (owner contact taken from
    ``getTenderee``) and the result is written to
    ``../data/<timestamp>_数据导出.xlsx`` (path relative to the working
    directory), with columns ordered as in the module-level ``list_df_columns``.
    When the search returns no rows, no file is written.

    Progress (keywords, total count, fetched/total) is printed to stdout.
    """
    ots_client = getConnect_ots()

    # Attribute columns requested from the index.
    columns = ["page_time","contacts","covered_area","floor_space","progress","project_address","project_description","project_investment","project_name"]

    def getData(df_data,rows):
        """Flatten one page of search rows and append them column-wise to df_data."""
        for _dict in getRow_ots(rows):
            item = {}
            tenderee,contact_name,contact_phone = getTenderee(_dict.get("contacts","[]"))
            set_dict_item(item,"业主",tenderee)
            set_dict_item(item,"业主联系人",contact_name)
            set_dict_item(item,"业主联系电话",contact_phone)
            set_dict_item(item,"发布时间",_dict.get("page_time",""))
            set_dict_item(item,"建筑面积",_dict.get("covered_area",""))
            set_dict_item(item,"层数",_dict.get("floor_space",""))
            set_dict_item(item,"阶段",_dict.get("progress",""))
            set_dict_item(item,"项目地址",_dict.get("project_address",""))
            set_dict_item(item,"简介",_dict.get("project_description",""))
            set_dict_item(item,"项目投资",_dict.get("project_investment",""))
            # "%d" cannot format a missing id, so emit an empty link instead
            # of raising TypeError for such rows.
            _id = _dict.get("id","")
            if _id is None or _id=="":
                _url = ""
            else:
                _url = "http://www.bidizhaobiao.com/nzjxm-%d.html"%_id
            set_dict_item(item,"地址",_url)

            for k,v in item.items():
                df_data.setdefault(k,[]).append(v)

    def search(search_query):
        """Run one search request against the project index."""
        return ots_client.search("designed_project", "designed_project_index",
                                 search_query,
                                 ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))

    # Keywords are separated by whitespace or one of 、 ， , /
    str_keywords = '''
        医院	养老院	疗养院	老人院
            '''
    list_keyword = []
    list_should_keyword = []
    for _p in re.split(r"\s|、|，|,|/",str_keywords):
        if _p.strip()=="":
            continue
        list_keyword.append(_p)
        print(_p)
        list_should_keyword.append(MatchPhraseQuery('full_text', '%s'%_p.strip()))

    bool_query = BoolQuery(must_queries=[
        RangeQuery("page_time","2020-01-01","2022-01-01",True,True),
        BoolQuery(should_queries=list_should_keyword),
    ])

    df_data = {}

    # First page carries the sort order; later pages are addressed by next_token only.
    rows, next_token, total_count, _is_all_succeed = search(
        SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True))
    _count = len(rows)
    print(list_keyword)
    print("total_count:%d"%total_count)
    getData(df_data,rows)
    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, _is_all_succeed = search(
            SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True))
        getData(df_data,rows)
        _count += len(rows)

    if not df_data:
        # No row matched: there is nothing to export, so do not create a file.
        print("no rows matched, nothing exported")
        return

    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)


# Script entry point: run the export only when the file is executed directly.
if __name__=="__main__":
    exportDocument_by_pagetime()