'''
Created on 2019-06-04

@author: User
'''
import fool
#import BiddingKG.dl.interface.Preprocessing as Preprocessing
from bs4 import BeautifulSoup
import re
import codecs
from BiddingKG.dl.common.Utils import save, load, findAllIndex
import glob
import threading
import numpy as np
import time
from zipfile import ZipFile
import json
import psycopg2
import pandas as pd
import math
from BiddingKG.dl.foolnltk.bi_lstm_crf import BiLSTM
import copy
from BiddingKG.dl.interface.Entitys import *
from BiddingKG.dl.foolnltk.Entity2DB import *
import tensorflow as tf
import requests


def getNers(sentences, MAXAREA=100000, userselffool=False):
    '''
    @param sentences: the list of sentences to process
    @return: the segmentation and NER results, executed batch by batch (throttled by MAXAREA)
    '''
    def getData(ners, process_data):
        process_sentences = [item[1] for item in process_data]
        if userselffool:
            ner_ = Preprocessing.selffool.ner(process_sentences)
        else:
            ner_ = fool.ner(process_sentences)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    sents = []
    for i in range(len(sentences)):
        sents.append([i, sentences[i]])
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    index_ = 0
    ners = [[] for i in range(len(sentences))]
    while True:
        width = len(sents[index_][1])
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(ners, process_data)
        index_ += height
        if index_ >= len(sents):
            break
    return ners


def preprocess(list_articles):
    '''
    @summary: preprocess the texts and store the foolnltk recognition results
              in the database so they can be reviewed and corrected
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    list_filename_text = []
    cursor = conn.cursor()
    for article in list_articles:
        doc_id = article[0]
        text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1], "lxml")))
        list_filename_text.append([doc_id, text, article[2]])
        list_sent = []
        for x in re.split("[。]", text):
            if len(x) > 0:
                list_sent.append(x + "。")
        for n in getNers(list_sent):
            for _entity in n:
                print(_entity)
                sql = " insert into fool_ner_train(filename,begin_index,end_index,type,text) values('" + str(doc_id) + "'," + str(_entity[0]) + "," + str(_entity[1]) + ",'" + str(_entity[2]) + "','" + str(_entity[3]) + "')"
                cursor.execute(sql)
        conn.commit()
    conn.close()
    return list_filename_text


def hasNotBeenLabeled(items, code_begin, code):
    for i in range(code_begin, code_begin + len(code)):
        if items[i][1] != "O":
            return False
    return True


def findAllIndex(substr, wholestr):
    copystr = wholestr
    result = []
    indexappend = 0
    while True:
        index = copystr.find(substr)
        if index < 0:
            break
        else:
            result.append(indexappend + index)
            indexappend += index + len(substr)
            copystr = copystr[index + len(substr):]
    return result


def labelEntity():
    '''
    @summary: build labeled data: query the entity annotations from the database
              and generate label sequences for the texts
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    list_filename_text = load("list_filename_text_wrongEntity.pk")
    list_sent_label = []
    list_text_label = []
    sql = " select distinct filename from fool_ner t where not exists(select 1 from fool_ner a where t.filename=a.filename and type_0 in('org','company') and new_type is NULL) "
    cursor.execute(sql)
    set_filename = set()
    for row in cursor.fetchall():
        set_filename.add(row[0])
    for filename_text in list_filename_text:
        filename = filename_text[0]
        text = filename_text[1]
        if filename not in set_filename:
            continue
        sql = " select text,type_0,new_text,new_type from fool_ner where filename='" + filename + "' group by text,type_0,new_text,new_type"
        print(sql)
        cursor.execute(sql)
        rows = 
cursor.fetchall() rows.sort(key=lambda x:len(x[0])) list_entity = [] for row in rows: entity = row[0] type = row[1] new_entity = row[2] new_type = row[3] _entitys = [] if new_type is None or new_type=="" or new_type=="nan": _type = type else: _type = new_type if new_entity=="1": continue elif new_entity is None or new_entity =="" or new_entity=="nan": list_entity.append([entity,_type]) _entitys.append([entity,_type]) else: for _entity in new_entity.split("##"): list_entity.append([_entity,_type]) _entitys.append([_entity,_type]) if len(_entitys)>=2: data_item = [] for i in range(len(str(entity))): _item = [] _item.append(entity[i]) _item.append("O") data_item.append(_item) for _entity_type in _entitys: _entity = _entity_type[0] _type = _entity_type[1] if _type not in ["person","company","org","job","time","location"]: continue for _index in findAllIndex(_entity, entity): _find_flag = True if len(_entity)==1: if hasNotBeenLabeled(data_item, _index, _entity): data_item[_index][1] = "S_"+_type else: if hasNotBeenLabeled(data_item, _index, _entity): for j in range(_index,_index+len(_entity)): if j==_index: data_item[j][1] = "B_"+_type elif j==_index+len(_entity)-1: data_item[j][1] = "E_"+_type else: data_item[j][1] = "M_"+_type if _find_flag: list_text_label.append(data_item) list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"] for insert_item in list_insert: if np.random.random()>0.7: copy_data_item = copy.copy(data_item) list_index = [] for i in range(len(copy_data_item)): _split = copy_data_item[i][1].split("_") if len(_split)==2: if _split[0]=="B": list_index.append(i) if _split[0]=="E": list_index.append(i+1) list_index.sort(key=lambda x:x,reverse=True) for _index in list_index: if np.random.random()>0.5: for j in range(len(insert_item)): copy_data_item.insert(_index+j,[insert_item[j],"O"]) if np.random.random()>0.5: break list_text_label.append(copy_data_item) '''''' list_entity.sort(key=lambda x:len(x[0]),reverse=True) for _sent in text.split("。"): _sent+= "。" _find_flag = False data_item = [] for i in range(len(str(_sent))): _item = [] _item.append(_sent[i]) _item.append("O") data_item.append(_item) for _entity_type in list_entity: _entity = _entity_type[0] _type = _entity_type[1] if _type not in ["person","company","org","job","time","location"]: continue for _index in findAllIndex(_entity, _sent): _find_flag = True if len(_entity)==1: if hasNotBeenLabeled(data_item, _index, _entity): data_item[_index][1] = "S_"+_type else: if hasNotBeenLabeled(data_item, _index, _entity): for j in range(_index,_index+len(_entity)): if j==_index: data_item[j][1] = "B_"+_type elif j==_index+len(_entity)-1: data_item[j][1] = "E_"+_type else: data_item[j][1] = "M_"+_type #根据句子中是否包含实体来判断是否加入训练数据 if _find_flag: list_sent_label.append(data_item) else: if np.random.random()>0.9: list_sent_label.append(data_item) '''''' with codecs.open("ner_label.txt","w",encoding="utf8") as f: for _sent_label in list_sent_label: for _word,_label in _sent_label: f.write(_word+" "+_label+"\n") f.write("\n") f.flush() with codecs.open("ner_label_split.txt","w",encoding="utf8") as f: for _sent_label in list_text_label: for _word,_label in _sent_label: f.write(_word+" "+_label+"\n") f.write("\n") f.flush() return list_sent_label class MyThread(threading.Thread): def __init__(self,func,args=()): super(MyThread,self).__init__() self.func = func self.args = args def 
run(self): self.result = self.func(*self.args) def get_result(self): try: return self.result # 如果子线程不使用join方法,此处可能会报没有self.result的错误 except Exception: return None def deal(): list_articles = [] path = "C:\\Users\\User\\Desktop\\fool语料\\*.html" set_doc_id = set() for file in glob.glob(path): filename = file.split("\\")[-1] doc_id = filename.split("_")[-1][:-5] text = codecs.open(file,"r",encoding="utf8").read() wrong_entity = "".join(filename.split("_")[:-1]) if doc_id in set_doc_id: for item in list_articles: if doc_id==item[0]: item[2].append(wrong_entity) else: set_doc_id.add(doc_id) list_articles.append([doc_id,text,[wrong_entity]]) save(list_articles,"list_filename_html_wrongEntity.pk") def dataSplit(data,parts=2): _index = 0 part_len = len(data)//parts while(True): if _index+part_len0: list_sent.append(x+"。") for n in getNers(list_sent,userselffool=True): for _entity in n: sql = " insert into fool_ner_train_1(filename,begin_index,end_index,type_0,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')" cursor.execute(sql) conn.commit() conn.close() save(list_filename_text,"list_filename_text_train_1.pk") return list_filename_text def makeLabel(sent,list_entity_type): _find_flag = False data_item = [] list_entity_type.sort(key=lambda x:len(x[0]),reverse=True) for i in range(len(str(sent))): _item = [] _item.append(sent[i]) _item.append("O") data_item.append(_item) for _entity_type in list_entity_type: _entity = _entity_type[0] _type = _entity_type[1] if _type not in ["person","company","org","job","time","location"]: continue for _index in findAllIndex(_entity, sent): _find_flag = True if len(_entity)==1: if hasNotBeenLabeled(data_item, _index, _entity): data_item[_index][1] = "S_"+_type else: if hasNotBeenLabeled(data_item, _index, _entity): for j in range(_index,_index+len(_entity)): if j==_index: data_item[j][1] = "B_"+_type elif j==_index+len(_entity)-1: data_item[j][1] = "E_"+_type else: data_item[j][1] = "M_"+_type return data_item,_find_flag def makeTrainTxt(): ''' @summary: 生成训练数据文本 ''' import psycopg2 conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() list_filename_text = load("list_filename_text_train.pk") list_sent_label = [] list_text_label = [] for filename_text in list_filename_text: filename = filename_text[0] text = filename_text[1] sql = " select text,type_0,new_text,new_type from fool_ner_train where filename='"+filename+"' group by text,type_0,new_text,new_type" print(sql) cursor.execute(sql) rows = cursor.fetchall() rows.sort(key=lambda x:len(x[0])) list_entity = [] for row in rows: entity = row[0] type = row[1] new_entity = row[2] new_type = row[3] _entitys = [] if new_type is None or new_type=="" or new_type=="nan": _type = type else: _type = new_type if new_entity=="1": continue elif new_entity is None or new_entity =="" or new_entity=="nan": list_entity.append([entity,_type]) _entitys.append([entity,_type]) else: for _entity in new_entity.split("##"): list_entity.append([_entity,_type]) _entitys.append([_entity,_type]) if len(_entitys)>=2: data_item = [] for i in range(len(str(entity))): _item = [] _item.append(entity[i]) _item.append("O") data_item.append(_item) for _entity_type in _entitys: _entity = _entity_type[0] _type = _entity_type[1] if _type not in ["person","company","org","job","time","location"]: continue for _index in findAllIndex(_entity, entity): _find_flag = True if len(_entity)==1: if 
hasNotBeenLabeled(data_item, _index, _entity): data_item[_index][1] = "S_"+_type else: if hasNotBeenLabeled(data_item, _index, _entity): for j in range(_index,_index+len(_entity)): if j==_index: data_item[j][1] = "B_"+_type elif j==_index+len(_entity)-1: data_item[j][1] = "E_"+_type else: data_item[j][1] = "M_"+_type if _find_flag: list_text_label.append(data_item) list_insert = ["根据","就",",",",","。",":",":"] for insert_item in list_insert: if np.random.random()>0.5: copy_data_item = copy.copy(data_item) list_index = [] for i in range(len(copy_data_item)): _split = copy_data_item[i][1].split("_") if len(_split)==2: if _split[0]=="B": list_index.append(i) if _split[0]=="E": list_index.append(i+1) list_index.sort(key=lambda x:x,reverse=True) for _index in list_index: if np.random.random()>0.5: for j in range(len(insert_item)): copy_data_item.insert(_index+j,[insert_item[j],"O"]) list_text_label.append(copy_data_item) list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"] for insert_item in list_insert: if np.random.random()>0.7: copy_data_item = copy.copy(data_item) list_index = [] for i in range(len(copy_data_item)): _split = copy_data_item[i][1].split("_") if len(_split)==2: if _split[0]=="B": list_index.append(i) if _split[0]=="E": list_index.append(i+1) list_index.sort(key=lambda x:x,reverse=True) for _index in list_index: if np.random.random()>0.5: for j in range(len(insert_item)): copy_data_item.insert(_index+j,[insert_item[j],"O"]) if np.random.random()>0.5: break list_text_label.append(copy_data_item) '''''' list_entity.sort(key=lambda x:len(x[0]),reverse=True) for _sent in text.split("。"): _sent+= "。" _find_flag = False data_item = [] for i in range(len(str(_sent))): _item = [] _item.append(_sent[i]) _item.append("O") data_item.append(_item) for _entity_type in list_entity: _entity = _entity_type[0] _type = _entity_type[1] if _type not in ["person","company","org","job","time","location"]: continue for _index in findAllIndex(_entity, _sent): _find_flag = True if len(_entity)==1: if hasNotBeenLabeled(data_item, _index, _entity): data_item[_index][1] = "S_"+_type else: if hasNotBeenLabeled(data_item, _index, _entity): for j in range(_index,_index+len(_entity)): if j==_index: data_item[j][1] = "B_"+_type elif j==_index+len(_entity)-1: data_item[j][1] = "E_"+_type else: data_item[j][1] = "M_"+_type #根据句子中是否包含实体来判断是否加入训练数据 if _find_flag: list_sent_label.append(data_item) else: if np.random.random()>0.9: list_sent_label.append(data_item) ''' ''' with codecs.open("ner_train.txt","w",encoding="utf8") as f: for _sent_label in list_sent_label: for _word,_label in _sent_label: f.write(_word+" "+_label+"\n") f.write("\n") f.flush() with codecs.open("ner_train_split.txt","w",encoding="utf8") as f: for _sent_label in list_text_label: for _word,_label in _sent_label: f.write(_word+" "+_label+"\n") f.write("\n") f.flush() def _load_map_file(path, char_map_name, id_map_name): with ZipFile(path) as myzip: with myzip.open('all_map.json') as myfile: content = myfile.readline() content = content.decode() data = json.loads(content) return data.get(char_map_name), data.get(id_map_name) def getContext(file): char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map") id_to_tag = {int(k):v for k,v in id_to_seg.items()} tag_to_id = {v:int(k) for k,v in id_to_seg.items()} list_sent_label = [] with codecs.open(file,"r",encoding="utf8") 
as f:
        sentence = []
        while True:
            line = f.readline()
            if not line:
                break
            if len(line) == 1:
                if len(sentence) > 0:
                    list_sent_label.append(sentence)
                    sentence = []
            else:
                _word_id = char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>")
                _tag_id = tag_to_id.get(line.split()[-1].strip())
                sentence.append([_word_id, _tag_id])
    return list_sent_label


def readlabeldata(file, list_context, MAX_LEN=300, keep_prob=1):
    '''
    @summary: read the labeled data from the file
    '''
    def addContext(_sentence, entity_sent, entity_label, id_B_company, id_E_company):
        _sent = []
        _label = []
        _flag = 0
        _find_flag = False
        for item in _sentence:
            if _flag == 0:
                if item[1] == id_B_company:
                    for word_id, tag_id in zip(entity_sent, entity_label):
                        _sent.append(word_id)
                        _label.append(tag_id)
                    _flag = 1
                    _find_flag = True
                else:
                    _sent.append(item[0])
                    _label.append(item[1])
            elif _flag == 1:
                if item[1] == id_E_company:
                    _flag = 2
                else:
                    continue
            else:
                _sent.append(item[0])
                _label.append(item[1])
        return _sent, _label, _find_flag

    def spreadContext(_sent, _label, id_to_char, id_to_tag):
        list_sent_label = []
        for _word, _l in zip(_sent, _label):
            list_sent_label.append([id_to_char.get(_word), id_to_tag.get(_l)])
        print(list_sent_label)

    list_sent_label_lengths = []
    char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
    id_to_char = {int(v): k for k, v in char_to_id.items()}
    id_to_tag = {int(k): v for k, v in id_to_seg.items()}
    tag_to_id = {v: int(k) for k, v in id_to_seg.items()}
    id_B_company = tag_to_id.get("B_company")
    id_E_company = tag_to_id.get("E_company")
    with codecs.open(file, "r", encoding="utf8") as f:
        _sent = []
        _label = []
        while True:
            line = f.readline()
            if not line:
                break
            if len(line) == 1:
                if np.random.rand() < keep_prob:
                    if len(_label) > 0 and _label[0] == id_B_company and _label[-1] == id_E_company:
                        if np.random.rand() < 0.8:
                            _int_random = np.random.randint(0, len(list_context))
                            _sentence = list_context[_int_random]
                            _sent_context, _label_context, _find_flag = addContext(_sentence, _sent, _label, id_B_company, id_E_company)
                            if _find_flag:
                                if len(_sent_context) < MAX_LEN:
                                    list_sent_label_lengths.append([_sent_context, _label_context, len(_sent_context)])
                                else:
                                    if len(_sent) > 0:
                                        list_sent_label_lengths.append([_sent, _label, len(_sent)])
                                '''
                                print("====")
                                spreadContext(_sent, _label, id_to_char, id_to_tag)
                                spreadContext(_sent_context, _label_context, id_to_char, id_to_tag)
                                print("====")
                                '''
                            else:
                                if len(_sent) < MAX_LEN and len(_sent) > 0:
                                    list_sent_label_lengths.append([_sent, _label, len(_sent)])
                _sent = []
                _label = []
            else:
                _sent.append(char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>"))
                tag = line.split()[-1].strip()
                _label.append(tag_to_id.get(tag))
    return list_sent_label_lengths


def gt3():
    '''
    @summary: pick out the articles whose wrong entity labels are longer than 3 characters or more than one
    '''
    list_articles = []
    list_filename_html_wrongEntity = load("list_filename_html_wrongEntity.pk")
    for row in list_filename_html_wrongEntity:
        if len(row[2]) > 1 or len(row[2][0]) > 3:
            list_articles.append(row)
    print(len(list_articles))
    save(list_articles, "list_filename_html_wrongEntity_gt3.pk")


def selectByRule():
    '''
    @summary: query records matching certain rules from the database so they can be corrected
    '''
    conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    tables = ["fool_ner", "fool_ner_train"]
    #tables = ["fool_ner"]
    for table in tables:
        sql = " select filename,type_0,text,new_type,new_text from "+table+" where ctid in (select max(ctid) from "+table+" where type_0 in ('org','company','location') group by filename,text) order by text "
        cursor.execute(sql)
        rows = cursor.fetchall()
        list_filename = []
        list_type_0 = []
        list_text = []
        list_new_type = []
        list_new_text = []
        pattern = "室"
        list_keyword = 
["厂","所","出","院","部","行","局","社","采招办","酒店","办事处","分理处","管理处","集团","组织","支队","部队","支行","银行","支局","分行","分公司","公司","中心","医院","卫生院","小学","中学","大学","学校","政府","委员会","委会","财政局"] list_second_keyword = ["处","厅","园","委","队","室","站","会","办","馆","共和国","科技"] for row in rows: filename = row[0] type_0 = row[1] entity = row[2] new_type = row[3] new_entity = row[4] list_entity = [] if new_type is None or new_type=="" or new_type=="nan": _type = type else: _type = new_type if new_entity=="1": continue elif new_entity is None or new_entity =="" or new_entity=="nan": list_entity.append([entity,_type,new_entity]) else: for _entity in new_entity.split("##"): list_entity.append([_entity,_type,entity]) _flag = False _index = 0 for _entity in list_entity: ''' if re.search('监狱.{,4}$',entity) is not None: _flag = True ''' if (len(entity)>2 and entity[-1]==entity[-2]) or (len(entity)>4 and entity[-4:-2]==entity[-2:]): _flag = True ''' pattern = "|".join(list_keyword) for _iter in re.finditer(pattern,text): if _iter.span()[1]>_index: _index = _iter.span()[1] new_text = text[:_index] if _index == 0: for _iter in re.finditer("|".join(list_second_keyword),text): if _iter.span()[1]>_index: _index = _iter.span()[1] new_text = text[:_index] ''' ''' for keyword in list_keyword: if _flag: break allindex = findAllIndex(keyword, text) if len(allindex)>0: _flag = True _index = allindex[-1]+len(keyword) new_text = text[:_index] ''' if _flag: list_filename.append(filename) list_type_0.append(type_0) list_text.append(entity) list_new_type.append(new_type) list_new_text.append(new_entity) data = {"list_filename":list_filename,"list_type_0":list_type_0,"list_text":list_text,"list_new_type":list_new_type,"list_new_text":list_new_text} df = pd.DataFrame(data,columns=["list_filename","list_type_0","list_text","list_new_type","list_new_text"]) df.to_excel(table+".xls") def makeDict_filename_content(): dict_filename_content = {} path = "C:\\Users\\User\\Desktop\\fool语料\\*.html" set_doc_id = set() for file in glob.glob(path): filename = file.split("\\")[-1] doc_id = filename.split("_")[-1][:-5] text = codecs.open(file,"r",encoding="utf8").read() dict_filename_content[doc_id] = text list_path = ["C:\\Users\\User\\Desktop\\20190416要素\\*.html","C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html","C:\\Users\\User\\Desktop\\data_20190703\\*.html","C:\\Users\\User\\Desktop\\20190715\\*.html"] for path in list_path: for file in glob.glob(path): filename = file.split("\\")[-1] text = codecs.open(file,"r",encoding="utf8").read() dict_filename_content[filename] = text save(dict_filename_content,"dict_filename_content.pk") def importLabelData(): conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101") cursor = conn.cursor() for file in glob.glob("label/*.xls"): if len(file.split("_"))>1: table = "fool_ner" else: table = "fool_ner_train" print(file,table) df = pd.read_excel(file) for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]): sql = " insert into "+table+" (filename,type_0,text,new_type,new_text) values('"+str(filename).replace(".0","")+"','"+str(type_0)+"','"+str(text)+"','"+str(new_type)+"','"+str(new_text)+"')" #sql = " update "+table+" set new_text='"+str(new_text)+"',new_type='"+str(new_type)+"' where filename='"+str(filename)+"' and text='"+str(text)+"' " cursor.execute(sql) conn.commit() conn.close() def checklabel(): ''' @summary: 检查label是否标注正确 ''' with 
codecs.open("ner_train.txt","r",encoding="utf8") as f: a = "" b = "" c = "" _index = 0 while(True): _index += 1 line = f.readline() if not line: break c = line.split(" ")[0].strip() if a=="新" and b=="乡" and c=="华": print(_index) a = b b = c def updateLabel(): ''' @summary: 更新标注数据 ''' conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101") cursor = conn.cursor() tables = ["fool_ner","fool_ner_train"] for table in tables: file = table+".xls" df = pd.read_excel(file) for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]): sql = " update "+table+" set new_type='"+str(new_type)+"',new_text='"+str(new_text)+"' where filename='"+str(filename).replace(".0","")+"' and text='"+str(text)+"'" print(sql) cursor.execute(sql) conn.commit() conn.close() def makeCertainEntity(): fileList = ["C:\\Users\\User\\Desktop\\cleanedEntity.tsv","C:\\Users\\User\\Desktop\\company_found.tsv"] for file in fileList: outfile = file.split(".")[0]+".txt" with codecs.open(outfile,"w",encoding="utf8") as f_w: with codecs.open(file,"r",encoding="utf8") as f: while(True): line = f.readline().strip() if not line: break for i in range(len(line)): if i==0: f_w.write(line[i]+" B_company\n") elif i==len(line)-1: f_w.write(line[i]+" E_company\n") f_w.write("\n") else: f_w.write(line[i]+" M_company\n") def addContextToTheEntity(entity_file): def getContext(file): list_sent_label = [] with codecs.open(file,"r",encoding="utf8") as f: sentence = [] while(True): line = f.readline() if not line: break if len(line)==1: list_sent_label.append(sentence) sentence = [] else: sentence.append([line[0],line.split()[-1].strip()]) return list_sent_label list_sent_label = getContext("ner_label.txt") print("getContent done",len(list_sent_label)) context_len = len(list_sent_label) outputfile = entity_file.split(".")[0]+"_addContext.txt" with codecs.open(outputfile,"w",encoding="utf8") as f_w: with codecs.open(entity_file,"r",encoding="utf8") as f_r: while(True): entity = f_r.readline().strip() random_int = np.random.randint(0,context_len) _sentence = list_sent_label[random_int] _flag = 0 for item in _sentence: if _flag==0: if item[1]=="B_company": for word_index in range(len(entity)): if word_index==0: f_w.write(entity[word_index]+" B_company\n") elif word_index==len(entity)-1: f_w.write(entity[word_index]+" E_company\n") else: f_w.write(entity[word_index]+" M_company\n") _flag = 1 else: f_w.write(item[0]+" "+item[1]+"\n") elif _flag==1: if item[1]=="E_company": _flag = 2 else: continue else: f_w.write(item[0]+" "+item[1]+"\n") f_w.write("\n") def makeContext_by_fool_selffool(): ''' @summary: 通过fool和selffool的识别结果来判断一个句子的识别是否正确,若fool和selffool的识别一样,则为正确,否则待定 ''' import psycopg2 conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101") list_filename_text = [] cursor = conn.cursor() file_index = 0 for file in glob.glob("C:\\Users\\User\\Desktop\\测试数据20190812\\*.html"): try: filename = file.split("\\")[-1] sql = " select count(1) from articles_processed_selffool where id='"+filename+"'" cursor.execute(sql) rows = cursor.fetchall() if rows[0][0]>0: continue content = codecs.open(file,"r",encoding="utf8").read() print(file_index,filename) text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(content,"lxml"))) _article = Article(id=filename, content=text, sourceContent="", doc_id="", title="") persistArticle(conn,[_article],"articles_processed_selffool") 
list_sentences = [] _sent_index = 0 set_sentences = set() for x in re.split("[。]", text): if len(x)>0: if x in set_sentences: continue set_sentences.add(x) _sentence = Sentences(doc_id=filename,sentence_index=_sent_index,sentence_text=x+"。",tokens=[],pos_tags=[],ner_tags=[]) list_sentences.append(_sentence) _ner_fool = fool.ner(_sentence.sentence_text) _ner_selffool = Preprocessing.selffool.ner(_sentence.sentence_text) if len(set(_ner_fool[0])&set(_ner_selffool[0]))==len(_ner_fool[0]): table_entity = "entity_mention_selffool" else: table_entity = "entity_mention_selffool_notsame" list_entitys = [] for item in _ner_selffool[0]: _entity_id = filename+"_"+str(_sent_index)+"_"+str(item[0])+"_"+str(item[1]) _entity = Entity(doc_id=filename,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=_sent_index,begin_index=item[0],end_index=item[1]) list_entitys.append(_entity) persistEntity(conn,list_entitys,table_entity) _sent_index += 1 persistSentence(conn,list_sentences,"sentences_selffool") conn.commit() except Exception as e: print(e) conn.close() conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() finally: file_index += 1 conn.close() def makeCompare(): ''' @summary: 通过比较fool的多个版本的selffool来判断置信度 ''' bilstm_new = BiLSTM() path_add = "new_model/" path = 'model/'+path_add+'model.ckpt' bilstm_new.restore(path) conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() sql = " select doc_id,sentence_index,sentence_text from sentences_selffool A where exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='selffool') and not exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='fool') " cursor.execute(sql) rows = cursor.fetchall() table_entity = "entity_mention_selffool_notsame" _index = 0 try: for row in rows: _index += 1 print(_index,len(rows)) doc_id = row[0] sentence_index = row[1] ''' sql = " select count(1) from "+table_entity+" where doc_id='"+doc_id+"' and sentence_index="+str(sentence_index)+" and fool_version='fool' " cursor.execute(sql) count_rows = cursor.fetchall() if count_rows[0][0]>0: continue ''' text = row[2] _ner_entity_fool = set() _ner_entity_selffool = set() _ner_fool = fool.ner(text)[0] _ner_selffool = bilstm_new.ner(text)[0] list_entitys = [] for item in _ner_fool: _entity_id = doc_id+"_"+str(sentence_index)+"_"+str(item[0])+"_"+str(item[1]) _entity = Entity(doc_id=doc_id,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=sentence_index,begin_index=item[0],end_index=item[1]) list_entitys.append(_entity) persistEntity(conn,list_entitys,table_entity) conn.commit() ''' for item in _ner_fool: if item[2] in ["org","company"]: _ner_entity_fool.add(item) for item in _ner_selffool: if item[2] in ["org","company"]: _ner_entity_selffool.add(item) if len(_ner_entity_fool&_ner_entity_selffool)==len(_ner_entity_fool) and len(_ner_entity_fool)==len(_ner_entity_selffool): print(text) print(_ner_selffool) ''' except Exception as e: print(e) conn.close() conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() conn.close() def cluster_difference(): ''' @summary: 对截断的尾部进行聚类 ''' conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101") cursor = 
conn.cursor() sql = " select entity_id,doc_id,sentence_index,begin_index,end_index,entity_type,entity_text,fool_version from entity_mention_selffool_notsame where entity_type in ('org','company') order by entity_id " cursor.execute(sql) rows = cursor.fetchall() row_begin = 0 DIFF_LEN = 2 dict_diff_list = dict() while(row_beginmax(begin_index_des,begin_index_src): if begin_index_src==begin_index_des: _key_begin = "SAME" else: _key_begin = entity_text_src[0:min(DIFF_LEN,len(entity_text_src))]+"#"+entity_text_des[0:min(DIFF_LEN,len(entity_text_des))] if end_index_src==end_index_des: _key_end = "SAME" else: _key_end = entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]+"#"+entity_text_des[-min(DIFF_LEN,len(entity_text_des)):] _key = _key_begin+"|"+_key_end list_key_entityid.append([_key,[entity_id_src,entity_id_des]]) #查看是否独有 for _entity_fool in list_entitys_fool: entity_id_src = _entity_fool[0] begin_index_src = _entity_fool[1] end_index_src = _entity_fool[2] entity_text_src = _entity_fool[3] entity_type_src = _entity_fool[4] find_flag = False for item in list_key_entityid: if entity_id_src in item[1]: find_flag = True if not find_flag: _key = "fool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):] list_key_entityid.append([_key,[entity_id_src]]) for _entity_fool in list_entitys_selffool: entity_id_src = _entity_fool[0] begin_index_src = _entity_fool[1] end_index_src = _entity_fool[2] entity_text_src = _entity_fool[3] entity_type_src = _entity_fool[4] find_flag = False for item in list_key_entityid: if entity_id_src in item[1]: find_flag = True if not find_flag: _key = "selffool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):] list_key_entityid.append([_key,[entity_id_src]]) #聚类 for item in list_key_entityid: find_flag = False if item[0]=="SAME|SAME": continue for _key in dict_diff_list.keys(): if item[0]==_key: dict_diff_list[_key].append(item[1]) find_flag = True if not find_flag: dict_diff_list[item[0]] = [item[1]] print(len(dict_diff_list.keys())) list_key_count = [] for _key in dict_diff_list.keys(): list_key_count.append([_key,len(dict_diff_list[_key])]) list_key_count.sort(key=lambda x:x[1],reverse=True) with codecs.open("diff_key_count.txt","w",encoding="utf8") as f: for item in list_key_count: f.write(item[0]+"\t"+str(item[1])+"\n") save(dict_diff_list,"dict_diff_list.pk") dict_sentence = None def get_sentence(doc_id,sentence_index): global dict_sentence file_dict_sentence = "dict_sentence.pk" if dict_sentence is None: if os.path.exists(file_dict_sentence): dict_sentence = load(file_dict_sentence) else: conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() sql = " select doc_id,sentence_index,sentence_text from sentences_selffool " cursor.execute(sql) dict_sentence = dict() rows = cursor.fetchall() for row in rows: _doc_id = row[0] _sentence_index = row[1] _sentence_text = row[2] _key = _doc_id+str(_sentence_index) dict_sentence[_key] = _sentence_text save(dict_sentence,file_dict_sentence) _key = doc_id+str(sentence_index) if _key in dict_sentence.keys(): return dict_sentence[_key] return None dict_diff_list = None def viewEntityByKey(): global dict_diff_list if dict_diff_list is None: dict_diff_list = load("dict_diff_list.pk") CONTEXT_LEN = 15 for key in dict_diff_list.keys(): diff_list = dict_diff_list[key] file = "cluster_view/"+re.sub("[\*\|\/\r\n:]","",key.strip())+".xls" if os.path.exists(file): continue list_entityid = [] list_before = [] list_center = [] list_after = [] list_change = [] 
list_type = [] list_version = [] if len(diff_list[0])==2: for item in diff_list: for i in range(len(item)): if i==0: list_version.append("fool") else: list_version.append("selffool") entityid = item[i] split_entityid = entityid.split("html")[1].split("_") doc_id = entityid.split("html")[0]+"html" sentence_index = split_entityid[1] sentence_text = get_sentence(doc_id, sentence_index) begin_index = int(split_entityid[2]) end_index = int(split_entityid[3])-1 list_entityid.append(entityid) before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index] center = sentence_text[begin_index:end_index] after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))] list_before.append(before) list_center.append(center) list_after.append(after) list_change.append(center) list_type.append("") else: version = key.split("|")[0] for item in diff_list: list_version.append(version) entityid = item[0] split_entityid = entityid.split("html")[1].split("_") doc_id = entityid.split("html")[0]+"html" sentence_index = split_entityid[1] sentence_text = get_sentence(doc_id, sentence_index) begin_index = int(split_entityid[2]) end_index = int(split_entityid[3])-1 list_entityid.append(entityid) before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index] center = sentence_text[begin_index:end_index] after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))] list_before.append(before) list_center.append(center) list_after.append(after) list_change.append(center) list_type.append("") data = {"list_entityid":list_entityid, "list_before":list_before, "list_center":list_center, "list_after":list_after, "list_change":list_change, "list_type":list_type, "list_version":list_version} df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"]) df.to_excel(file) def alterFileByRule(file): df = pd.read_excel(file) _location = "location" for _index in range(len(df["list_entityid"])): version = df["list_version"][_index] if version=="selffool": '''''' df["list_change"][_index] = df["list_change"][_index-1] df.to_excel(file,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"]) def getCompanyByTianyan(): token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56" headers = {"Authorization":token} url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word=" _inTianYan = "inTianYan" _inSource = "inSource" _dict = load("dict_company.pk") count = 0 for entity in load("set_company.pk"): try: count += 1 print(count,len(_dict.keys())) if entity in _dict: _dict[entity][_inSource] = True if _dict[entity][_inTianYan]: continue else: _dict[entity] = {_inTianYan:False,_inSource:True} r = requests.get(url+entity,headers=headers) r_json = r.json() if r_json["error_code"]==0: for item in r_json["result"]["items"]: companyName = re.sub("","",item["name"]).replace("(","(").replace(")",")") if companyName in _dict: _dict[companyName][_inTianYan] = True else: _dict[companyName] = {_inTianYan:True,_inSource:False} elif r_json["error_code"]==300007: print("剩余次数不足") break except Exception as e: print(str(e)) save(_dict,"dict_company.pk") def labelByTianyan(): ''' @summary: 通过天眼查的数据接口来获取标注 ''' list_entityid = [] list_before = [] list_center = [] list_after = [] list_change = [] list_type = [] list_version = [] list_entityid_notmatch = [] list_before_notmatch = [] list_center_notmatch = [] list_after_notmatch = [] list_change_notmatch = [] list_type_notmatch = [] list_version_notmatch = [] 
_inTianYan = "inTianYan" _inSource = "inSource" _dict_company = load("dict_company.pk") is_compare = False for file in glob.glob("cluster_view/add/*.xls"): df = pd.read_excel(file) for _index in range(len(df["list_change"])): version = df["list_version"][_index] if version in ["selffool","fool"]: _match_count = 0 true_entity = None if df["list_change"][_index] in _dict_company: if _dict_company[df["list_change"][_index]][_inTianYan]: _match_count += 1 true_entity = df["list_change"][_index] if is_compare: if df["list_change"][_index-1] in _dict_company: if _dict_company[df["list_change"][_index-1]][_inTianYan]: _match_count += 1 true_entity = df["list_change"][_index-1] if _match_count==1: if is_compare: list_entityid.append(df["list_entityid"][_index-1]) list_before.append(df["list_before"][_index-1]) list_center.append(df["list_center"][_index-1]) list_after.append(df["list_after"][_index-1]) list_change.append(df["list_change"][_index-1]) list_type.append(df["list_type"][_index-1]) list_version.append(df["list_version"][_index-1]) list_entityid.append(df["list_entityid"][_index]) list_before.append(df["list_before"][_index]) list_center.append(df["list_center"][_index]) list_after.append(df["list_after"][_index]) list_change.append(true_entity) list_type.append(df["list_type"][_index]) list_version.append(df["list_version"][_index]) else: if is_compare: list_entityid_notmatch.append(df["list_entityid"][_index-1]) list_before_notmatch.append(df["list_before"][_index-1]) list_center_notmatch.append(df["list_center"][_index-1]) list_after_notmatch.append(df["list_after"][_index-1]) list_change_notmatch.append(df["list_change"][_index-1]) list_type_notmatch.append(df["list_type"][_index-1]) list_version_notmatch.append(df["list_version"][_index-1]) list_entityid_notmatch.append(df["list_entityid"][_index]) list_before_notmatch.append(df["list_before"][_index]) list_center_notmatch.append(df["list_center"][_index]) list_after_notmatch.append(df["list_after"][_index]) list_change_notmatch.append(df["list_change"][_index]) list_type_notmatch.append(df["list_type"][_index]) list_version_notmatch.append(df["list_version"][_index]) data = {"list_entityid":list_entityid, "list_before":list_before, "list_center":list_center, "list_after":list_after, "list_change":list_change, "list_type":list_type, "list_version":list_version} df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"]) df.to_excel("cluster_view/add_match.xls") nums = 50000 _begin = 0 while(_begin2000: continue if _key in dict_sent_entitys: data_item,_find_flag = makeLabel(sentence, dict_sent_entitys[_key]) for _word,_label in data_item: f.write(_word+" "+_label+"\n") else: if np.random.random()>0.8: data_item,_find_flag = makeLabel(sentence, []) for _word,_label in data_item: f.write(_word+" "+_label+"\n") f.write("\n") f.flush() if __name__=="__main__": #makeFoolTrainData() #makeTrainTxt() #labelEntity() #readlabeldata("cleanedEntity.txt",getContext("ner_train.txt")) #makeDict_filename_content() #selectByRule() #updateLabel() #importLabelData() #makeCertainEntity() #addContextToTheEntity("company_found.tsv") #makeContext_by_fool_selffool() #makeCompare() #cluster_difference() #viewEntityByKey() #alterFileByRule("cluster_view/change/SAME版社#大学.xls") #getCompanyByTianyan() ''' data = load("dict_company.pk") for item in data.keys(): print(item,data[item]) ''' #labelByTianyan() ''' token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56" headers = {"Authorization":token} url 
= "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
    r = requests.get(url+"安阳鑫龙煤业(集团)龙山煤业有限责任公司",headers=headers)
    r_json = r.json()
    print(r_json)
    '''
    #tofix()
    cluster_entitys()
    makeLabelText()
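
# ---------------------------------------------------------------------------
# Illustrative sketch (not called anywhere in this module): shows the
# character-level B_/M_/E_/S_ tagging that labelEntity()/makeLabel() produce.
# The sentence and entity below are made-up examples, not data from this project.
# ---------------------------------------------------------------------------
def _demo_bmes_labeling():
    '''
    Every character starts as "O"; each matched entity span is rewritten to
    S_<type> (single character) or B_/M_/E_<type> (multi character), but only
    if the span has not already been labeled (hasNotBeenLabeled).
    '''
    sent = "项目由北京测试公司负责。"            # hypothetical sentence
    entitys = [["北京测试公司", "company"]]      # hypothetical entity list
    data_item = [[ch, "O"] for ch in sent]
    for _entity, _type in entitys:
        for _index in findAllIndex(_entity, sent):
            if hasNotBeenLabeled(data_item, _index, _entity):
                if len(_entity) == 1:
                    data_item[_index][1] = "S_" + _type
                else:
                    for j in range(_index, _index + len(_entity)):
                        if j == _index:
                            data_item[j][1] = "B_" + _type
                        elif j == _index + len(_entity) - 1:
                            data_item[j][1] = "E_" + _type
                        else:
                            data_item[j][1] = "M_" + _type
    return data_item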
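
# ---------------------------------------------------------------------------
# Sketch of an alternative way to write the inserts above, using psycopg2
# parameter binding instead of string concatenation, which avoids quoting
# problems when an entity text contains a single quote. The table and columns
# follow fool_ner_train above; the helper itself is hypothetical and unused.
# ---------------------------------------------------------------------------
def _demo_parameterized_insert(conn, doc_id, entity):
    '''
    entity is a fool.ner item: (begin_index, end_index, type, text).
    '''
    cursor = conn.cursor()
    sql = ("insert into fool_ner_train(filename,begin_index,end_index,type,text) "
           "values(%s,%s,%s,%s,%s)")
    cursor.execute(sql, (str(doc_id), entity[0], entity[1], entity[2], entity[3]))
    conn.commit()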
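
# ---------------------------------------------------------------------------
# Minimal sketch, assuming dataSplit() above is meant to yield a list in
# `parts` roughly equal chunks; the exact slicing of the original is an
# assumption, not a restoration. Unused elsewhere in this module.
# ---------------------------------------------------------------------------
def _demo_data_split(data, parts=2):
    part_len = max(1, len(data) // parts)
    _index = 0
    while _index < len(data):
        yield data[_index:_index + part_len]
        _index += part_len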
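
# ---------------------------------------------------------------------------
# Sketch of the confidence rule used in makeContext_by_fool_selffool():
# a sentence counts as reliably recognized when every entity returned by fool
# is also returned by selffool (compared as (begin, end, type, text) tuples);
# otherwise its entities go to the "_notsame" table for later review.
# The helper takes the per-sentence result lists and is not called anywhere.
# ---------------------------------------------------------------------------
def _demo_agreement_check(ner_fool, ner_selffool):
    return len(set(ner_fool) & set(ner_selffool)) == len(ner_fool)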