# coding: utf8
- import codecs
- import json
- import fool
- import numpy as np
- import psycopg2
- import re
class EntityLink():
    """Normalise tokenised company names and score their character overlap.

    A "tagged token list" below means a sequence of items where ``item[0]``
    is the word text and ``item[1]`` is its part-of-speech tag.
    """

    def __init__(self):
        # Words dropped from a name before comparison.
        # NOTE(review): "(" and ")" each appear twice; presumably one pair was
        # originally the full-width form and got normalised -- confirm.
        companyUnitList = ["有限公司","公司","有限责任公司","有限","责任","集团","分公司","'","(",")","(",")"]
        # Part-of-speech tags whose words are dropped.
        posUnitList = ["ns"]
        self.companyUnit = set(companyUnitList)
        self.posUnit = set(posUnitList)

    def removeUnit(self, list):
        """Return the words of a tagged token list joined into one string.

        An item is skipped when its word is in ``self.companyUnit`` or its
        tag is in ``self.posUnit``.

        :param list: tagged token list (the name shadows the builtin; it is
            kept so existing keyword callers keep working).
        :returns: the remaining words concatenated with no separator.
        """
        return "".join(
            item[0]
            for item in list
            if item[0] not in self.companyUnit and item[1] not in self.posUnit
        )

    def getCompability(self, list):
        """Return the character-overlap ratio of two tagged token lists.

        Each of ``list[0]`` and ``list[1]`` is stripped with ``removeUnit``
        and reduced to its set of distinct characters.  The result is the
        larger of ``shared / len(first)`` and ``shared / len(second)``.

        :param list: sequence whose first two elements are tagged token
            lists.
        :returns: a float between 0.0 and 1.0; 0.0 when either side has no
            characters left after stripping.
        :raises IndexError: if ``list`` has fewer than two elements.
        """
        # Strip each side separately: removeUnit returns a single string, so
        # running it over the pair itself would lose the two-sided structure.
        first = set(self.removeUnit(list[0]))
        second = set(self.removeUnit(list[1]))
        if not first or not second:
            # Nothing to compare; also avoids dividing by zero.
            return 0.0
        # float() keeps the ratio fractional even under integer division.
        shared = float(len(first & second))
        return max(shared / len(first), shared / len(second))
-
# Characters stripped from an enterprise name before it is stored.
# NOTE(review): the full-width parentheses are an assumption about the
# original intent (the old pattern listed the parentheses twice) -- confirm.
_NAME_NOISE_RE = re.compile(u"['\"()（）]")


def strip_quotes_and_parens(name):
    """Return *name* with quotes and parentheses removed."""
    return _NAME_NOISE_RE.sub("", name)


def iter_json_objects(text):
    """Yield each JSON value in *text*, where values are written back to back.

    The values may be separated by whitespace (for example one object per
    line) but are not wrapped in an enclosing array.

    :raises ValueError: if a value is not valid JSON.
    """
    decoder = json.JSONDecoder()
    pos = 0
    end = len(text)
    while pos < end:
        # raw_decode does not skip leading whitespace itself.
        if text[pos].isspace():
            pos += 1
            continue
        value, pos = decoder.raw_decode(text, pos)
        yield value


if __name__ == "__main__":
    # Load every enterprise name from the export file into the `entity`
    # table, together with its unit-stripped reference form.
    file = "C:\\Users\\User\\Desktop\\bxkc.enterprise_crawl_test.json"
    #file = "bxkc.enterprise_crawl_test.json"

    el = EntityLink()

    conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()

        with codecs.open(file, "r", encoding="utf8") as f:
            text = f.read()

        # Placeholders let the driver quote the values; the names come from
        # an external file and must not be spliced into the SQL text.
        sql = "insert into entity(id,entity_text,entity_reference) values(%s,%s,%s)"

        for index, item in enumerate(iter_json_objects(text), 1):
            print(index)
            name = strip_quotes_and_parens(item["enterprise_name"])
            # presumably pos_cut returns one tagged token list per input
            # text, hence the [0] -- confirm against the fool documentation
            reference = el.removeUnit(fool.pos_cut(name)[0])
            cursor.execute(sql, (index, name, reference))
            # Commit in batches of 1000 rows.
            if index % 1000 == 0:
                conn.commit()

        # Flush the last partial batch; without this the rows after the
        # final multiple of 1000 are discarded when the connection closes.
        conn.commit()
        cursor.close()
    finally:
        conn.close()
-
-
|