testEntityMatching.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. #coding:utf8
  2. import codecs
  3. import json
  4. import fool
  5. import numpy as np
  6. import psycopg2
  7. import re
  8. class EntityLink():
  9. def __init__(self):
  10. companyUnitList = ["有限公司","公司","有限责任公司","有限","责任","集团","分公司","'","(",")","(",")"]
  11. posUnitList = ["ns"]
  12. self.companyUnit = set(companyUnitList)
  13. self.posUnit = set(posUnitList)
  14. def removeUnit(self,list):
  15. result = []
  16. for item in list:
  17. if item[0] not in self.companyUnit:
  18. if item[1] not in self.posUnit:
  19. result.append(item[0])
  20. return "".join(result)
  21. def getCompability(self,list):
  22. list = self.removeUnit(list)
  23. set1 = set("".join(list[0]))
  24. set2 = set("".join(list[1]))
  25. common = set1&set2
  26. compability = []
  27. compability.append(len(common)/len(set1))
  28. compability.append(len(common)/len(set2))
  29. return max(compability)
  30. if __name__=="__main__":
  31. file = "C:\\Users\\User\\Desktop\\bxkc.enterprise_crawl_test.json"
  32. #file = "bxkc.enterprise_crawl_test.json"
  33. el = EntityLink()
  34. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  35. cursor = conn.cursor()
  36. with codecs.open(file, "r", encoding="utf8") as f:
  37. text = "["+f.read().replace("}","},")[:-1]+"]"
  38. index = 0
  39. for item in json.loads(text,encoding="utf8"):
  40. index += 1
  41. print(index)
  42. name = item["enterprise_name"]
  43. name = re.sub("'|\"|(|(|)|)","",name)
  44. reference = el.removeUnit(fool.pos_cut(name)[0])
  45. #print(name,reference)
  46. sql = " insert into entity(id,entity_text,entity_reference) values("+str(index)+",'"+str(name)+"','"+str(reference)+"')"
  47. cursor.execute(sql)
  48. if index%1000==0:
  49. conn.commit()
  50. f.close()
  51. conn.close()