12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455 |
- '''
- Created on 2019年5月20日
- @author: User
- '''
- import pymongo
- import re
- from .entityLink import *
- from BiddingKG.dl.common.Utils import *
- from multiprocessing import Process
def getData(parts=2, limit=3000):
    """Load enterprise names from MongoDB and start the pair-matching workers.

    Reads up to ``limit`` documents from the ``enterprise_profile`` collection
    of the ``bxkc`` database, builds a list of ``[name, stripped_name]``
    entries (``stripped_name`` has common organisation suffixes removed), cuts
    the index range into ``parts`` contiguous slices and starts one
    ``Process`` per slice, each running ``process_thread``.

    Args:
        parts: Number of slices / worker processes to start. Defaults to 2.
        limit: Maximum number of documents to read. Defaults to 3000.

    Returns:
        None. The worker processes are started but not joined here.

    Raises:
        ValueError: If ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError("parts must be >= 1")

    # NOTE(review): the credentials are hard-coded in the connection string.
    # They should be moved to configuration or an environment variable.
    client = pymongo.MongoClient("mongodb://bxkc:bidizhaobiaowang2017@121.46.18.113:17017/bxkc")
    list_entitys = []
    try:
        collection_enterprise = client["bxkc"]['enterprise_profile']
        cursor = collection_enterprise.find({}, {"enterprise_name": 1}).limit(limit)
        for row in cursor:
            entity_name = row.get("enterprise_name")
            # The projection does not guarantee the field exists; skip rows
            # that have no usable name instead of failing on them.
            if not isinstance(entity_name, str):
                continue
            entity_name_sub = re.sub("(有限(责任)?公司|子公司|(政府)?采购中心)", "", entity_name)
            list_entitys.append([entity_name, entity_name_sub])
    finally:
        # Release the connection before any worker process is started so the
        # open client is not carried into the children.
        client.close()

    # The slice width follows ``parts``. The previous fixed divisor of 20 gave
    # the first worker only 1/20 of the indices and the last worker the rest.
    total = len(list_entitys)
    nums = total // parts
    list_index = [[i * nums, (i + 1) * nums] for i in range(parts - 1)]
    # The last slice absorbs the remainder of the integer division.
    list_index.append([(parts - 1) * nums, total])

    for index_begin, index_end in list_index:
        proc = Process(target=process_thread, args=(list_entitys, index_begin, index_end))
        proc.start()
-
def process_thread(list_entitys, index_begin, index_end):
    """Find similar entity-name pairs for one slice of the entity list.

    Every entry with index in ``[index_begin, index_end)`` is compared with
    all entries that come after it in ``list_entitys``. A pair is kept when
    ``edit_distance`` of the two names is below 4 and their ``jaccard_score``
    is above 0.6. The kept pairs are printed and then handed to ``save``
    under a file name that encodes the slice bounds.

    Args:
        list_entitys: List of entries; element 0 of each entry is the name
            that gets compared.
        index_begin: First index (inclusive) handled by this worker.
        index_end: End index (exclusive) handled by this worker.
    """
    total = len(list_entitys)
    matched_pairs = []

    for src_idx in range(index_begin, index_end):
        # Progress output: the index currently being processed.
        print(src_idx)
        src_entry = list_entitys[src_idx]

        tgt_idx = src_idx + 1
        while tgt_idx < total:
            tgt_entry = list_entitys[tgt_idx]
            tgt_idx += 1
            # Check edit distance first; the Jaccard score is only evaluated
            # for candidates that pass it.
            if not (edit_distance(src_entry[0], tgt_entry[0]) < 4):
                continue
            if not (jaccard_score(src_entry[0], tgt_entry[0]) > 0.6):
                continue
            matched_pairs.append([src_entry[0], tgt_entry[0]])

    for pair in matched_pairs:
        print(pair)

    out_name = "list_entitys_pair_" + str(index_begin) + "_" + str(index_end) + ".pk"
    save(matched_pairs, out_name)
# Script entry point: start the pipeline only when this file is run directly,
# not when the module is imported.
if __name__ == "__main__":
    getData()
|