mongotest.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
'''
Created on 2019-05-20

@author: User
'''
  5. import pymongo
  6. import re
  7. from .entityLink import *
  8. from BiddingKG.dl.common.Utils import *
  9. from multiprocessing import Process
  10. def getData():
  11. client = pymongo.MongoClient("mongodb://bxkc:bidizhaobiaowang2017@121.46.18.113:17017/bxkc")
  12. db = client["bxkc"]
  13. collection_enterprise = db['enterprise_profile']
  14. cursor = collection_enterprise.find({},{"enterprise_name":1}).limit(3000)
  15. list_entitys = []
  16. list_entitys_pair = []
  17. for row in cursor:
  18. entity_name = row["enterprise_name"]
  19. entity_name_sub = re.sub("(有限(责任)?公司|子公司|(政府)?采购中心)","",entity_name)
  20. list_entitys.append([entity_name,entity_name_sub])
  21. parts = 2
  22. nums = len(list_entitys)//20
  23. list_index = []
  24. for i in range(parts-1):
  25. list_index.append([i*nums,(i+1)*nums])
  26. list_index.append([(parts-1)*nums,len(list_entitys)])
  27. for item in list_index:
  28. proc = Process(target=process_thread,args=(list_entitys,item[0],item[1]))
  29. proc.start()
  30. def process_thread(list_entitys,index_begin,index_end):
  31. list_entitys_pair = []
  32. for i in range(index_begin,index_end):
  33. print(i)
  34. source = list_entitys[i]
  35. for j in range(i+1,len(list_entitys)):
  36. target = list_entitys[j]
  37. if edit_distance(source[0], target[0])<4 and jaccard_score(source[0], target[0])>0.6:
  38. list_entitys_pair.append([source[0],target[0]])
  39. for item in list_entitys_pair:
  40. print(item)
  41. save(list_entitys_pair,"list_entitys_pair_"+str(index_begin)+"_"+str(index_end)+".pk")
  42. if __name__=="__main__":
  43. getData()