# getData.py
  1. import numpy as np
  2. import pandas as pd
  3. from py2neo import Graph, Node, Relationship
  4. import time
  5. import scipy.spatial
  6. from fuzzywuzzy import fuzz
  7. import hashlib
  8. import json
  9. import multiprocessing as mp
  10. import heapq
  11. import os
  12. from functools import partial
  13. from multiprocessing.pool import Pool
  14. dir_project = os.getcwd()+"\\data1\\100000\\zh_en\\"
  15. dir_align_project = os.getcwd()+"\\data1\\Align\\zh_en\\"
  16. dir_Neo4j_File = "C:\\Users\\admin\\Desktop\\Neo4j.csv"
  17. dir_Neo4j_Align_File = "C:\\Users\\admin\\Desktop\\Neo4j_aligned.csv"
  18. dir_ent_ids_1_real = dir_project + "ent_ids_1_real"
  19. dir_ent_ids_2_real = dir_project + "ent_ids_2_real"
  20. dir_triples_1_real = dir_project + "triples_1_real"
  21. dir_triples_2_real = dir_project + "triples_2_real"
  22. dir_ref_ent_ids_real = dir_project + "ref_ent_ids_real"
  23. dir_ref_ent_ids_real_neg = dir_project + "ref_ent_ids_real_neg"
  24. dir_training_attrs_1 = dir_project + "training_attrs_1"
  25. dir_training_attrs_2 = dir_project + "training_attrs_2"
  26. dir_ModelId2RealId = dir_project + "ModelId2RealId"
  27. dir_ent_ids_1 = dir_project + "ent_ids_1"
  28. dir_triples_1 = dir_project + "triples_1"
  29. dir_ref_ent_ids = dir_project + "ref_ent_ids"
  30. dir_ref_ent_ids_neg = dir_project + "ref_ent_ids_neg"
  31. def getDataFromNeo4j3(skip_num, limit_num):
  32. """
  33. 取包括已对齐的公司ID在内的一共20w个公司的ID,每个公司ID取最多10条关联
  34. 输入多进程需要的开始结束下标
  35. :return:
  36. """
  37. graph = Graph('http://118.31.10.60:7474', auth=('bxkc_web', 'bxkc_web'))
  38. lines = file2Data(dir_ref_ent_ids_real)
  39. id_list = []
  40. # for index in range(len(lines)):
  41. # ids = lines[index][0:-1].split("\t")
  42. # id_list.append(int(ids[0]))
  43. # id_list.append(int(ids[1]))
  44. # id_list = list(set(id_list))
  45. print("Match Neo4j, getting ID...")
  46. result = graph.run("match (o:Organization) "
  47. "return id(o) order by id(o) "
  48. " skip " + str(skip_num) +
  49. " limit " + str(limit_num)
  50. ).to_ndarray()
  51. result = result.tolist()
  52. for index in range(len(result)):
  53. id_list.append(int(result[index][0]))
  54. id_list = list(set(id_list))
  55. id_list.sort()
  56. print(len(id_list))
  57. # print(id_list)
  58. print("Match Neo4j...")
  59. start_time = time.time()
  60. results = []
  61. for index in range(len(id_list)):
  62. result = graph.run("MATCH (o:Organization)-[r]->(p:Project) "
  63. "where p.project_name is not null and id(o)=" + str(id_list[index]) +
  64. " RETURN id(o), o.name, o.nicknames, o.area, o.city, o.district, "
  65. "id(r), type(r), r.price, "
  66. "id(p), p.project_name, p.area, p.city, p.district "
  67. "LIMIT 10"
  68. ).to_ndarray()
  69. results.append(result.tolist())
  70. # print(index, len(id_list))
  71. if index % 500 == 0:
  72. end_time = time.time()
  73. print(index, end_time-start_time)
  74. print("write into csv...")
  75. start_time = time.time()
  76. df_list = []
  77. for i in range(len(results)):
  78. for j in range(len(results[i])):
  79. df_list.append(results[i][j])
  80. df = pd.DataFrame(df_list)
  81. df.columns = ["id(o)", "o.name", "o.nicknames", "o.area", "o.city", "o.district",
  82. "id(r)", "type(r)", "r.price",
  83. "id(p)", "p.project_name", "p.area", "p.city", "p.district"]
  84. df.to_csv("C:\\Users\\admin\\Desktop\\Neo4j"+"_"+str(skip_num)+"_"+str(limit_num)+".csv")
  85. end_time = time.time()
  86. print(end_time-start_time)
  87. def getDataFromNeo4j_align():
  88. """
  89. 取已对齐的公司ID的所有关联
  90. :return:
  91. """
  92. graph = Graph('http://118.31.10.60:7474', auth=('bxkc_web', 'bxkc_web'))
  93. print("Match Neo4j, getting ID...")
  94. lines = file2Data(dir_align_project+"ref_ent_ids_real")
  95. id_list = []
  96. for index in range(len(lines)):
  97. ids = lines[index][0:-1].split("\t")
  98. id_list.append(ids[0])
  99. id_list.append(ids[1])
  100. id_list = list(set(id_list))
  101. id_list.sort()
  102. # print(len(id_list))
  103. print("Match Neo4j...")
  104. start_time = time.time()
  105. results = []
  106. for index in range(len(id_list)):
  107. result = graph.run("MATCH (o:Organization)-[r]->(p:Project) "
  108. "where p.project_name is not null and id(o)=" + id_list[index] +
  109. " RETURN id(o), o.name, o.nicknames, o.area, o.city, o.district, "
  110. "id(r), type(r), r.price, "
  111. "id(p), p.project_name, p.area, p.city, p.district "
  112. "LIMIT 10"
  113. ).to_ndarray()
  114. results.append(result.tolist())
  115. # print(index, len(id_list))
  116. end_time = time.time()
  117. print(end_time-start_time)
  118. print("write into csv...")
  119. start_time = time.time()
  120. df_list = []
  121. for i in range(len(results)):
  122. for j in range(len(results[i])):
  123. df_list.append(results[i][j])
  124. df = pd.DataFrame(df_list)
  125. df.columns = ["id(o)", "o.name", "o.nicknames", "o.area", "o.city", "o.district",
  126. "id(r)", "type(r)", "r.price",
  127. "id(p)", "p.project_name", "p.area", "p.city", "p.district"]
  128. df.to_csv("C:\\Users\\admin\\Desktop\\Neo4j_aligned.csv")
  129. end_time = time.time()
  130. print(end_time-start_time)
  131. def getDataFromNeo4j():
  132. """
  133. 随机取前200w条数据
  134. :return:
  135. """
  136. graph = Graph('http://118.31.10.60:7474', auth=('bxkc_web', 'bxkc_web'))
  137. print("Match Neo4j...")
  138. start_time = time.time()
  139. results = []
  140. result = graph.run("MATCH (o:Organization)-[r]->(p:Project) "
  141. "where p.project_name is not null "
  142. "RETURN id(o), o.name, o.nicknames, o.area, o.city, o.district, "
  143. "id(r), type(r), r.price, "
  144. "id(p), p.project_name, p.area, p.city, p.district "
  145. "skip 2000000 LIMIT 2000000"
  146. ).to_ndarray()
  147. results.append(result.tolist())
  148. end_time = time.time()
  149. print(end_time-start_time)
  150. print("write into csv...")
  151. start_time = time.time()
  152. df_list = []
  153. for i in range(len(results)):
  154. for j in range(len(results[i])):
  155. df_list.append(results[i][j])
  156. df = pd.DataFrame(df_list)
  157. df.columns = ["id(o)", "o.name", "o.nicknames", "o.area", "o.city", "o.district",
  158. "id(r)", "type(r)", "r.price",
  159. "id(p)", "p.project_name", "p.area", "p.city", "p.district"]
  160. df.to_csv("C:\\Users\\admin\\Desktop\\Neo4j.csv")
  161. end_time = time.time()
  162. print(end_time-start_time)
  163. def testNeo4j():
  164. graph = Graph('http://118.31.10.60:7474', auth=('bxkc_web', 'bxkc_web'))
  165. print("Match Neo4j...")
  166. start_time = time.time()
  167. a = graph.run("MATCH (o:Organization)-[r]->(p:Project) "
  168. "where p.project_name is not null "
  169. "RETURN id(o), o.name, o.nicknames, o.area, o.city, o.district, "
  170. "id(r), type(r), r.price, "
  171. "id(p), p.project_name, p.area, p.city, p.district "
  172. "skip 2000000 LIMIT 2000000").to_ndarray()
  173. end_time = time.time()
  174. print(end_time-start_time)
  175. print("write into csv...")
  176. start_time = time.time()
  177. df = pd.DataFrame(a)
  178. df.columns = ["id(o)", "o.name", "o.nicknames", "o.area", "o.city", "o.district",
  179. "id(r)", "type(r)", "r.price",
  180. "id(p)", "p.project_name", "p.area", "p.city", "p.district"]
  181. df.to_csv("C:\\Users\\admin\\Desktop\\Neo4j1.csv")
  182. end_time = time.time()
  183. print(end_time-start_time)
  184. def Neo4j2Data(filename):
  185. dir = dir_project
  186. df = pd.read_csv(filename)
  187. # files
  188. triples = []
  189. ent_ids = []
  190. training_attrs = []
  191. # df = df[0:100000]
  192. for column in df.columns:
  193. df[column] = df[column].apply(lambda x: "" if x == "未知" else x)
  194. for index, row in df.iterrows():
  195. triples.append(str(row["id(o)"]) + "\t" + str(row["id(r)"]) + "\t" + str(row["id(p)"]) + "\n")
  196. ent_ids.append(str(row["id(o)"]) + "\t" + "O" + str(row["o.name"]) + "\n")
  197. ent_ids.append(str(row["id(p)"]) + "\t" + "P" + str(row["p.project_name"]) + "\n")
  198. training_attrs.append("O" + str(row["o.name"]) + "\t" + str(row["o.nicknames"]) + "\t" + str(row["o.area"])
  199. + "\t" + str(row["o.city"]) + "\t" + str(row["o.district"]) + "\n")
  200. training_attrs.append("P" + str(row["p.project_name"]) + "\t" + str(row["p.area"])
  201. + "\t" + str(row["p.city"]) + "\t" + str(row["p.district"]) + "\n")
  202. triples1 = triples
  203. ent_ids1 = ent_ids
  204. training_attrs1 = training_attrs
  205. data2File(triples1, dir+"triples_1_real")
  206. # data2File(triples2, dir+"triples_2_real")
  207. data2File(ent_ids1, dir+"ent_ids_1_real")
  208. # data2File(ent_ids2, dir+"ent_ids_2_real")
  209. data2File(training_attrs1, dir+"training_attrs_1")
  210. # data2File(training_attrs1, dir+"training_attrs_2_a")
  211. def data2File(_list, filename):
  212. with open(filename, 'w', encoding='UTF-8') as f:
  213. f.writelines(_list)
  214. def file2Data(filename):
  215. with open(filename, 'r', encoding='UTF-8') as f:
  216. _list = f.readlines()
  217. return _list
  218. def data2FileAppend(_list, filename):
  219. with open(filename, 'a+', encoding='UTF-8') as f:
  220. f.writelines(_list)
  221. def similarity():
  222. df = pd.read_csv(dir_Neo4j_File)
  223. # df1 = df[10000:20000]
  224. # 59000:1000000 1037000:1500000
  225. df = df[:100000]
  226. df = df[["id(o)", "o.name", "o.area", "o.city"]]
  227. # df1 = df1[["id(o)", "o.name", "o.area", "o.city"]]
  228. org_list = []
  229. for index, row in df.iterrows():
  230. org_list.append(str(row["id(o)"]) + " " + str(row["o.name"]) + " " + str(row["o.area"])
  231. + " " + str(row["o.city"]))
  232. # org_list1 = []
  233. # for index, row in df1.iterrows():
  234. # org_list1.append(str(row["id(o)"]) + " " + str(row["o.name"]) + " " + str(row["o.area"])
  235. # + " " + str(row["o.city"]))
  236. # 去重
  237. org_list = list(set(org_list))
  238. print("去重后org_list:", len(org_list))
  239. # org_list1 = list(set(org_list1))
  240. # print("去重后org_list1:", len(org_list1))
  241. # 分离id和其他字段
  242. id_list = []
  243. for i in range(len(org_list)):
  244. ss = org_list[i].split(" ")
  245. id_list.append(ss[0])
  246. org_list[i] = ss[1] + " " + ss[2] + " " + ss[3]
  247. # id_list1 = []
  248. # for i in range(len(org_list1)):
  249. # ss = org_list1[i].split(" ")
  250. # id_list1.append(ss[0])
  251. # org_list1[i] = ss[1] + " " + ss[2] + " " + ss[3]
  252. # 循环计算相似度
  253. # 阈值
  254. alpha = 95
  255. beta = 98
  256. alpha_dict ={}
  257. beta_dict = {}
  258. alpha_list = []
  259. beta_list = []
  260. for i in range(len(org_list)):
  261. if i % 1000 == 0:
  262. print("Loop:", i)
  263. i2 = i+1
  264. for j in range(i2):
  265. org1 = org_list[i]
  266. org2 = org_list[j]
  267. # 初步匹配两个字符串中的字符,避免每条都计算相似度,效率低
  268. org_name1 = org1.split(" ")[0]
  269. org_name2 = org2.split(" ")[0]
  270. cnt = 0
  271. if len(org_name1) <= len(org_name2):
  272. for c in org_name1:
  273. if c in org_name2:
  274. cnt += 1
  275. else:
  276. for c in org_name2:
  277. if c in org_name1:
  278. cnt += 1
  279. if cnt / (len(org_name1) if len(org_name1) <= len(org_name2) else len(org_name2)) <= 0.7:
  280. continue
  281. if org1 == org2:
  282. continue
  283. if org1 in org2:
  284. org1 = org2
  285. if org2 in org1:
  286. org2 = org1
  287. sim = fuzz.ratio(org1, org2)
  288. negative_flag = 1
  289. if alpha <= sim:
  290. alpha_dict[(org_list[i], org_list[j])] = sim
  291. # 判断IDs是否在已对齐中
  292. aligned_list = file2Data(dir_ref_ent_ids_real)
  293. for ids in aligned_list:
  294. if str(id_list[i])+"\t"+str(id_list[j]) == ids \
  295. or str(id_list[j])+"\t"+str(id_list[i]) == ids:
  296. negative_flag = 0
  297. break
  298. if negative_flag:
  299. alpha_list.append(org_list[i]+"#"+org_list[j]+"#"+str(sim)+"\t"
  300. +str(id_list[i])+"\t"+str(id_list[j])+"\n")
  301. print(org_list[i], "#", org_list[j], "#", sim, id_list[i], id_list[j])
  302. # if sim >= beta:
  303. # beta_dict[(org_list[i], org_list[j])] = sim
  304. # beta_list.append([org_list[i], id_list[i], org_list[j], id_list[j], sim])
  305. # print(org_list[i], "#", org_list[j], "#", sim, id_list[i], id_list[j])
  306. # df = pd.DataFrame.from_dict(alpha_dict, orient='index')
  307. # df1 = pd.DataFrame.from_dict(beta_dict, orient='index')
  308. # df = pd.DataFrame(alpha_list)
  309. # df1 = pd.DataFrame(beta_list)
  310. data2File(alpha_list, "C:\\Users\\admin\\Desktop\\ref_ent_ids_real_neg")
  311. # data2File(alpha_list, "C:\\Users\\admin\\Desktop\\Similar_10w")
  312. # df.to_csv("C:\\Users\\admin\\Desktop\\Similar0.9.csv")
  313. # df1.to_csv("C:\\Users\\admin\\Desktop\\Similar0.95.csv")
  314. def resetID4KeyValue(filename, start_index):
  315. _list = file2Data(filename)
  316. map_dict = loadDict(dir_ModelId2RealId)
  317. for i in range(len(_list)):
  318. ss = _list[i].split("\t")
  319. real_id = ss[0]
  320. if len(ss) <= 1:
  321. _list[i].split(" ")
  322. continue
  323. value = ss[1]
  324. if str(real_id) in map_dict.keys():
  325. model_id = map_dict[str(real_id)]
  326. else:
  327. map_dict[str(real_id)] = start_index
  328. model_id = start_index
  329. _list[i] = str(model_id) + "\t" + value
  330. # print(_list[i])
  331. start_index += 1
  332. data2File(_list, dir_ent_ids_1)
  333. saveDict(dir_ModelId2RealId, map_dict)
  334. return
  335. def resetID4Tuple(filename):
  336. _list = file2Data(filename)
  337. map_dict = loadDict(dir_ModelId2RealId)
  338. for i in range(len(_list)):
  339. ss = _list[i][:-1].split("\t")
  340. real_id_1 = ss[0]
  341. real_id_2 = ss[1]
  342. if str(real_id_1) in map_dict.keys():
  343. model_id_1 = map_dict[str(real_id_1)]
  344. else:
  345. print("not exists ID1 mapping:", real_id_1)
  346. if str(real_id_2) in map_dict.keys():
  347. model_id_2 = map_dict[str(real_id_2)]
  348. else:
  349. print("not exists ID2 mapping:", real_id_2)
  350. _list[i] = str(model_id_1) + "\t" + str(model_id_2) + "\n"
  351. data2File(_list, dir_ref_ent_ids_neg)
  352. def resetID4Triple(filename):
  353. _list = file2Data(filename)
  354. map_dict = loadDict(dir_ModelId2RealId)
  355. for i in range(len(_list)):
  356. ss = _list[i][:-1].split("\t")
  357. real_id_1 = ss[0]
  358. ref = ss[1]
  359. real_id_2 = ss[2]
  360. if str(real_id_1) in map_dict.keys():
  361. model_id_1 = map_dict[str(real_id_1)]
  362. else:
  363. print("not exists ID1 mapping:", real_id_1)
  364. if str(real_id_2) in map_dict.keys():
  365. model_id_2 = map_dict[str(real_id_2)]
  366. else:
  367. print("not exists ID2 mapping:", real_id_2)
  368. _list[i] = str(model_id_1) + "\t" + str(ref) + "\t" + str(model_id_2) + "\n"
  369. data2File(_list, dir_triples_1)
  370. return
  371. def loadDict(filename):
  372. with open(filename, "r") as json_file:
  373. dic = json.load(json_file)
  374. return dic
  375. def saveDict(filename, dic):
  376. with open(filename, 'w') as json_file:
  377. json.dump(dic, json_file)
  378. def deleteDuplicateId(filename):
  379. ## 去重
  380. _dir = filename
  381. lines = file2Data(_dir)
  382. print(len(lines))
  383. lines = list(set(lines))
  384. lines.sort(key=lambda x: x.split("\t")[0])
  385. print(len(lines))
  386. data2File(lines, _dir)
  387. def deleteDuplicateId2(filename):
  388. ## 去重
  389. _dir = filename
  390. lines = file2Data(_dir)
  391. print(len(lines))
  392. new_lines = []
  393. for i in range(len(lines)):
  394. ss = lines[i][:-1].split("\t")
  395. if int(ss[0]) > int(ss[1]):
  396. new_lines.append(ss[1] + "\t" + ss[0] + "\n")
  397. else:
  398. new_lines.append(ss[0] + "\t" + ss[1] + "\n")
  399. new_lines = list(set(new_lines))
  400. new_lines.sort(key=lambda x: x.split("\t")[0])
  401. print(len(new_lines))
  402. data2File(new_lines, _dir)
  403. def initMapDict():
  404. d = {"-1": -1}
  405. saveDict(dir_ModelId2RealId, d)
  406. d = loadDict(dir_ModelId2RealId)
  407. # print(d, type(d))
  408. def multiProcess():
  409. process_num = 15
  410. skip_num = 300000
  411. limit_num = 50000
  412. for i in range(process_num):
  413. print(skip_num, limit_num)
  414. p = mp.Process(target=getDataFromNeo4j3, args=(skip_num, limit_num))
  415. skip_num += limit_num
  416. p.start()
  417. def multiThread():
  418. return
  419. def clean_ID_Entity(filename):
  420. lines = file2Data(filename)
  421. print(len(lines))
  422. new_lines = []
  423. for i in range(len(lines)):
  424. ss = lines[i].split("\t")
  425. if len(ss) == 2:
  426. new_lines.append(ss[0] + "\t" + ss[1])
  427. print(len(new_lines))
  428. data2File(new_lines, filename)
  429. def mergeFile(file_name_prefix, target_file, file_num, limit_num):
  430. skip_num = 0
  431. df_all = pd.DataFrame()
  432. for i in range(file_num):
  433. df = pd.read_csv(file_name_prefix + "_" + str(skip_num) + "_" + str(limit_num) + ".csv")
  434. df = df[["id(o)", "o.name", "o.nicknames", "o.area", "o.city", "o.district",
  435. "id(r)", "type(r)", "r.price",
  436. "id(p)", "p.project_name", "p.area", "p.city", "p.district"]]
  437. df_all = df_all.append(df)
  438. # print(df_all)
  439. skip_num += limit_num
  440. df_all.columns = ["id(o)", "o.name", "o.nicknames", "o.area", "o.city", "o.district",
  441. "id(r)", "type(r)", "r.price",
  442. "id(p)", "p.project_name", "p.area", "p.city", "p.district"]
  443. df_all.to_csv(target_file)
  444. def alignAppend2Data(filename=dir_Neo4j_Align_File):
  445. dir = dir_align_project
  446. dir2dir = dir_project
  447. df = pd.read_csv(filename)
  448. # files
  449. triples = []
  450. ent_ids = []
  451. training_attrs = []
  452. for column in df.columns:
  453. df[column] = df[column].apply(lambda x: "" if x == "未知" else x)
  454. for index, row in df.iterrows():
  455. triples.append(str(row["id(o)"]) + "\t" + str(row["id(r)"]) + "\t" + str(row["id(p)"]) + "\n")
  456. ent_ids.append(str(row["id(o)"]) + "\t" + "O" + str(row["o.name"]) + "\n")
  457. ent_ids.append(str(row["id(p)"]) + "\t" + "P" + str(row["p.project_name"]) + "\n")
  458. training_attrs.append("O" + str(row["o.name"]) + "\t" + str(row["o.nicknames"]) + "\t" + str(row["o.area"])
  459. + "\t" + str(row["o.city"]) + "\t" + str(row["o.district"]) + "\n")
  460. training_attrs.append("P" + str(row["p.project_name"]) + "\t" + str(row["p.area"])
  461. + "\t" + str(row["p.city"]) + "\t" + str(row["p.district"]) + "\n")
  462. data2File(triples, dir+"triples_1_real")
  463. data2File(ent_ids, dir+"ent_ids_1_real")
  464. data2File(training_attrs, dir+"training_attrs_1")
  465. data2FileAppend(triples, dir2dir+"triples_1_real")
  466. data2FileAppend(ent_ids, dir2dir+"ent_ids_1_real")
  467. data2FileAppend(training_attrs, dir2dir+"training_attrs_1")
  468. return
  469. def getAlignIDOrg():
  470. list1 = file2Data(dir_align_project+"ref_ent_ids_real")
  471. list2 = file2Data(dir_align_project+"ent_ids_1_real")
  472. id_org_dict = {}
  473. id_align = []
  474. for i in range(len(list2)):
  475. ss = list2[i][:-1].split("\t")
  476. if ss[1][0] == "O":
  477. id_org_dict[str(ss[0])] = ss[1]
  478. for i in range(len(list1)):
  479. ss = list1[i][:-1].split("\t")
  480. if str(ss[0]) in id_org_dict.keys():
  481. id_align.append(str(ss[0])+"\t"+id_org_dict[str(ss[0])]+"\n")
  482. if str(ss[1]) in id_org_dict.keys():
  483. id_align.append(str(ss[1])+"\t"+id_org_dict[str(ss[1])]+"\n")
  484. id_align = list(set(id_align))
  485. data2File(id_align, dir_align_project+"ent_ids_aligned")
  486. def test():
  487. print(os.getcwd())
  488. # id_vec = [[100, 1], [60, 2], [4, 200], [33, 300]]
  489. # id_ndarray = np.array(id_vec)
  490. # print(id_vec)
  491. # sim = scipy.spatial.distance.cdist(id_vec, id_vec[:2], metric='cityblock')
  492. # print(sim)
  493. # print(sim.shape)
  494. # print(heapq.nlargest(2, range(len(id_ndarray)), id_ndarray.take))
  495. if __name__ == '__main__':
  496. # testNeo4j()
  497. # Neo4j2Data()
  498. # similarity()
  499. # initMapDict()
  500. # deleteDuplicateId()
  501. # resetID4KeyValue(dir_ent_ids_1_real, 0)
  502. # resetID4Tuple(dir_ref_ent_ids_real)
  503. # resetID4Triple(dir_triples_1_real)
  504. # getDataFromNeo4j2()
  505. #####################################
  506. # Neo4j2Data(dir_Neo4j_File)
  507. # alignAppend2Data(dir_Neo4j_Align_File)
  508. # deleteDuplicateId(dir_ent_ids_1_real)
  509. # initMapDict()
  510. # clean_ID_Entity(dir_ent_ids_1_real)
  511. # resetID4KeyValue(dir_ent_ids_1_real, 0)
  512. # resetID4Tuple(dir_ref_ent_ids_real_neg)
  513. # resetID4Triple(dir_triples_1_real)
  514. # deleteDuplicateId2(dir_ref_ent_ids_real)
  515. #######################################
  516. # deleteDuplicateId2()
  517. # getDataFromNeo4j3()
  518. # multiProcess()
  519. # dir = "C:\\Users\\admin\\Desktop\\Neo4j"
  520. # mergeFile(dir, dir+"4.csv", 8, 30000)
  521. # ll = [0, 1, 2, 3]
  522. # for i in range(0, len(ll), 2):
  523. # i2 = i+1
  524. # for j in range(0, len(ll), 2):
  525. # print(ll[i], "-", ll[j])
  526. test()
  527. # getAlignIDOrg()
  528. # resetID4KeyValue(dir_project+"ent_ids_aligned", 0)
  529. # for i in range(0, 22, 5):
  530. # print(i)
  531. # _dict = loadDict(dir_ModelId2RealId)
  532. #
  533. # for i in _dict.keys():
  534. # if i == "":
  535. # print(i, _dict[i])
  536. # j = i
  537. # _dict.pop(j)
  538. # saveDict(dir_ModelId2RealId, _dict)
  539. # getDataFromNeo4j_align()
  540. # alignAppend2Data()