#coding:utf8
"""Derive reference keys for enterprise names and load them into PostgreSQL.

Run as a script, this module reads a dump of concatenated JSON objects,
strips quote and parenthesis characters from each ``enterprise_name``,
reduces the name to a "reference" string with :class:`EntityLink`, and
inserts one row per record into the ``entity`` table.
"""
import codecs
import json
import re

import fool
import numpy as np  # noqa: F401 -- unused in the visible code; import retained
import psycopg2

# Characters removed from an enterprise name before segmentation.
# NOTE(review): the original pattern was "'|\"|(|(|)|)"; its unescaped
# parentheses formed empty groups, so no parenthesis was ever removed.  The
# full-width forms are included on the assumption that the duplicated ASCII
# parentheses in the source were originally full-width -- confirm.
_NAME_NOISE = re.compile(r"['\"()（）]")

# Values are bound by the driver instead of being spliced into the SQL text.
_INSERT_SQL = (
    "insert into entity(id,entity_text,entity_reference) values(%s,%s,%s)"
)

# Number of inserted rows between intermediate commits.
_COMMIT_EVERY = 1000


class EntityLink():
    """Reduce segmented company names to their distinguishing part.

    Attributes:
        companyUnit: set of token texts dropped by :meth:`removeUnit`
            (generic company suffixes plus quote/parenthesis characters).
        posUnit: set of part-of-speech tags whose tokens are dropped
            (``"ns"`` -- presumably the tagger's place-name tag; confirm).
    """

    def __init__(self):
        # NOTE(review): the source listed "(" and ")" twice; the second pair
        # is assumed to have been the full-width forms -- confirm.
        companyUnitList = [
            "有限公司", "公司", "有限责任公司", "有限", "责任", "集团",
            "分公司", "'", "(", ")", "（", "）",
        ]
        posUnitList = ["ns"]
        self.companyUnit = set(companyUnitList)
        self.posUnit = set(posUnitList)

    def removeUnit(self, list):
        """Join the token texts that are neither generic units nor filtered tags.

        Args:
            list: iterable of ``(text, pos_tag)`` items, indexed as
                ``item[0]`` / ``item[1]``.  The parameter name shadows the
                builtin but is kept so existing keyword callers still work.

        Returns:
            The surviving token texts concatenated in their original order.
        """
        return "".join(
            item[0]
            for item in list
            if item[0] not in self.companyUnit and item[1] not in self.posUnit
        )

    def getCompability(self, list):
        """Return the character-overlap ratio between two segmented names.

        The original implementation passed the whole argument to
        :meth:`removeUnit` and then indexed the returned *string*, so it only
        ever compared the first two characters (and raised ``TypeError`` when
        given two token lists).  Each of the two token lists is now cleaned
        separately before the comparison.

        Args:
            list: a pair ``(tokens_a, tokens_b)``; each element is an iterable
                of ``(text, pos_tag)`` items as accepted by :meth:`removeUnit`.

        Returns:
            ``max(|A & B| / |A|, |A & B| / |B|)`` where ``A`` and ``B`` are the
            character sets of the two cleaned names, or ``0.0`` when either
            cleaned name is empty.
        """
        chars_a = set(self.removeUnit(list[0]))
        chars_b = set(self.removeUnit(list[1]))
        if not chars_a or not chars_b:
            # Nothing left to compare; also avoids a division by zero.
            return 0.0
        shared = len(chars_a & chars_b)
        return max(shared / len(chars_a), shared / len(chars_b))


def _iter_json_objects(text):
    """Yield each top-level JSON value from whitespace-separated JSON text.

    Replaces the former ``"[" + text.replace("}", "},")[:-1] + "]"`` rewrite,
    which produced invalid JSON for a trailing newline, nested objects, or a
    ``}`` inside a string value.

    Raises:
        json.JSONDecodeError: if the text contains malformed JSON.
    """
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while True:
        # raw_decode does not skip leading whitespace itself.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            return
        value, pos = decoder.raw_decode(text, pos)
        yield value


def main():
    """Load the enterprise dump and insert one ``entity`` row per record."""
    path = "C:\\Users\\User\\Desktop\\bxkc.enterprise_crawl_test.json"
    #path = "bxkc.enterprise_crawl_test.json"
    el = EntityLink()

    # Parse the whole dump before touching the database, as the original did,
    # so malformed input fails before any row is written.
    with codecs.open(path, "r", encoding="utf8") as f:
        records = [record for record in _iter_json_objects(f.read())]

    # NOTE(review): connection settings, including the password, are
    # hard-coded here exactly as in the original script.
    conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        for index, item in enumerate(records, start=1):
            print(index)
            name = _NAME_NOISE.sub("", item["enterprise_name"])
            reference = el.removeUnit(fool.pos_cut(name)[0])
            cursor.execute(_INSERT_SQL, (index, name, reference))
            if index % _COMMIT_EVERY == 0:
                conn.commit()
        # The original never committed the final partial batch, so the rows
        # after the last multiple of 1000 were lost when the connection closed.
        conn.commit()
    finally:
        conn.close()


if __name__=="__main__":
    main()