- '''
- Created on 2019-06-04
- @author: User
- '''
- import fool
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- from bs4 import BeautifulSoup
- import re
- import codecs
- from BiddingKG.dl.common.Utils import save,load, findAllIndex
- import glob
- import os
- import threading
- import numpy as np
- import time
- from zipfile import ZipFile
- import json
- import psycopg2
- import pandas as pd
- import math
- from BiddingKG.dl.foolnltk.bi_lstm_crf import BiLSTM
- import copy
- from BiddingKG.dl.interface.Entitys import *
- from BiddingKG.dl.foolnltk.Entity2DB import *
- import tensorflow as tf
- import requests
- def getNers(sentences,MAXAREA = 100000,userselffool=False):
- '''
- @param: sentences: list of sentences to process
- @return: list of word-segmentation and entity-recognition results per sentence, computed in throttled batches
- '''
-
- def getData(ners,process_data):
- process_sentences = [item[1] for item in process_data]
- if userselffool:
- ner_ = Preprocessing.selffool.ner(process_sentences)
- else:
- ner_ = fool.ner(process_sentences)
- for i in range(len(ner_)):
- the_index = process_data[i][0]
- ners[the_index] = ner_[i]
- sents = []
- for i in range(len(sentences)):
- sents.append([i,sentences[i]])
- sents.sort(key=lambda x:len(x[1]),reverse=True)
- index_ = 0
- ners = [[]for i in range(len(sentences))]
-
- while(True):
- width = len(sents[index_][1])
- height = MAXAREA//width+1
- if height>len(sents)-index_:
- height = len(sents)-index_
- process_data = sents[index_:index_+height]
- getData(ners, process_data)
- index_ += height
- if index_>=len(sents):
- break
- return ners
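- # Usage sketch (illustrative, not part of the original code): sentences are sorted by
- # length and processed in batches whose (longest sentence length) * (batch size) stays
- # roughly under MAXAREA, e.g.
- #   ners = getNers(["招标人:某公司。", "开标时间:2019年6月4日。"])
- #   # ners[i] is the list of (begin_index, end_index, type, text) tuples for sentences[i]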
- def preprocess(list_articles):
- '''
- @summary: preprocess the text and store the foolnltk recognition results in the database for easy review and correction
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- list_filename_text = []
- cursor = conn.cursor()
- for article in list_articles:
- doc_id = article[0]
- text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1],"lxml")))
- list_filename_text.append([doc_id,text,article[2]])
- list_sent = []
- for x in re.split("[。]", text):
- if len(x)>0:
- list_sent.append(x+"。")
- for n in getNers(list_sent):
- for _entity in n:
- print(_entity)
- sql = " insert into fool_ner_train(filename,begin_index,end_index,type,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')"
- cursor.execute(sql)
- conn.commit()
- conn.close()
- return list_filename_text
-
- def hasNotBeenLabeled(items,code_begin,code):
- for i in range(code_begin,code_begin+len(code)):
- if items[i][1]!="O":
- return False
- return True
-
-
- def findAllIndex(substr,wholestr):
- copystr = wholestr
- result = []
- indexappend = 0
- while(True):
- index = copystr.find(substr)
- if index<0:
- break
- else:
- result.append(indexappend+index)
- indexappend += index+len(substr)
- copystr = copystr[index+len(substr):]
- return result
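- # Example (illustrative): findAllIndex returns the start offsets of the non-overlapping
- # occurrences of substr in wholestr, e.g.
- #   findAllIndex("公司", "A公司与B公司") -> [1, 5]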
- def labelEntity():
- '''
- @summary: label the data; query entity information from the database and generate label data for the texts
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- list_filename_text = load("list_filename_text_wrongEntity.pk")
- list_sent_label = []
-
- list_text_label = []
- sql = " select distinct filename from fool_ner t where not exists(select 1 from fool_ner a where t.filename=a.filename and type_0 in('org','company') and new_type is NULL) "
- cursor.execute(sql)
- set_filename = set()
- for row in cursor.fetchall():
- set_filename.add(row[0])
-
- for filename_text in list_filename_text:
- filename = filename_text[0]
- text = filename_text[1]
- if filename not in set_filename:
- continue
-
-
- sql = " select text,type_0,new_text,new_type from fool_ner where filename='"+filename+"' group by text,type_0,new_text,new_type"
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- rows.sort(key=lambda x:len(x[0]))
-
- list_entity = []
- for row in rows:
- entity = row[0]
- type = row[1]
- new_entity = row[2]
- new_type = row[3]
- _entitys = []
- if new_type is None or new_type=="" or new_type=="nan":
- _type = type
- else:
- _type = new_type
- if new_entity=="1":
- continue
- elif new_entity is None or new_entity =="" or new_entity=="nan":
- list_entity.append([entity,_type])
- _entitys.append([entity,_type])
- else:
- for _entity in new_entity.split("##"):
- list_entity.append([_entity,_type])
- _entitys.append([_entity,_type])
- if len(_entitys)>=2:
- data_item = []
- _find_flag = False
- for i in range(len(str(entity))):
- _item = []
- _item.append(entity[i])
- _item.append("O")
- data_item.append(_item)
- for _entity_type in _entitys:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, entity):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- if _find_flag:
- list_text_label.append(data_item)
-
- list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"]
- for insert_item in list_insert:
- if np.random.random()>0.7:
- copy_data_item = copy.copy(data_item)
- list_index = []
- for i in range(len(copy_data_item)):
- _split = copy_data_item[i][1].split("_")
- if len(_split)==2:
- if _split[0]=="B":
- list_index.append(i)
- if _split[0]=="E":
- list_index.append(i+1)
- list_index.sort(key=lambda x:x,reverse=True)
- for _index in list_index:
- if np.random.random()>0.5:
- for j in range(len(insert_item)):
- copy_data_item.insert(_index+j,[insert_item[j],"O"])
- if np.random.random()>0.5:
- break
- list_text_label.append(copy_data_item)
-
- ''''''
- list_entity.sort(key=lambda x:len(x[0]),reverse=True)
-
- for _sent in text.split("。"):
- _sent+= "。"
-
- _find_flag = False
- data_item = []
- for i in range(len(str(_sent))):
- _item = []
- _item.append(_sent[i])
- _item.append("O")
- data_item.append(_item)
-
- for _entity_type in list_entity:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, _sent):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- # decide whether to add this sentence to the training data based on whether it contains any entity
- if _find_flag:
- list_sent_label.append(data_item)
- else:
- if np.random.random()>0.9:
- list_sent_label.append(data_item)
-
- ''''''
- with codecs.open("ner_label.txt","w",encoding="utf8") as f:
- for _sent_label in list_sent_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
- with codecs.open("ner_label_split.txt","w",encoding="utf8") as f:
- for _sent_label in list_text_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
- return list_sent_label
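- # Output format note (descriptive of the writing loops above): ner_label.txt and
- # ner_label_split.txt contain one "character label" pair per line and an empty line
- # between samples, e.g.
- #   某 B_company
- #   公 M_company
- #   司 E_company
- #   。 O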
- class MyThread(threading.Thread):
- def __init__(self,func,args=()):
- super(MyThread,self).__init__()
- self.func = func
- self.args = args
- def run(self):
- self.result = self.func(*self.args)
- def get_result(self):
- try:
- return self.result # if the child thread is not joined first, self.result may not exist yet and an error is raised
- except Exception:
- return None
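- # Usage sketch (illustrative): run a function in a worker thread and collect its return value.
- #   t = MyThread(getNers, args=([list_sent],))
- #   t.start()
- #   t.join()
- #   ners = t.get_result()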
- def deal():
- list_articles = []
- path = "C:\\Users\\User\\Desktop\\fool语料\\*.html"
- set_doc_id = set()
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- doc_id = filename.split("_")[-1][:-5]
- text = codecs.open(file,"r",encoding="utf8").read()
- wrong_entity = "".join(filename.split("_")[:-1])
- if doc_id in set_doc_id:
- for item in list_articles:
- if doc_id==item[0]:
- item[2].append(wrong_entity)
- else:
-
- set_doc_id.add(doc_id)
- list_articles.append([doc_id,text,[wrong_entity]])
- save(list_articles,"list_filename_html_wrongEntity.pk")
-
- def dataSplit(data,parts=2):
- _index = 0
- part_len = len(data)//parts
- while(True):
- if _index+part_len<len(data):
- yield data[_index:_index+part_len]
- _index += part_len
- else:
- yield data[_index:]
- break
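- # Example (illustrative): dataSplit yields len(data)//parts items per chunk and puts the
- # remainder into the last chunk, e.g.
- #   list(dataSplit([1, 2, 3, 4, 5], parts=2)) -> [[1, 2], [3, 4], [5]]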
-
- def makeFoolTrainData():
- '''
- @summary: generate fool training data
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- list_articles = []
- #list_path = ["C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html"]
- list_path = ["C:\\Users\\User\\Desktop\\data_20190703\\*.html"]
- set_doc_id = set()
- for path in list_path:
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- text = codecs.open(file,"r",encoding="utf8").read()
- if filename in set_doc_id:
- continue
- else:
- set_doc_id.add(filename)
- list_articles.append([filename,text])
- list_filename_text = []
- cursor = conn.cursor()
- _count = 0
- for article in list_articles:
- _count += 1
- print(str(_count)+"/"+str(len(list_articles)))
- doc_id = article[0]
- text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1],"lxml")))
- list_filename_text.append([doc_id,text])
- list_sent = []
- for x in re.split("[。]", text):
- if len(x)>0:
- list_sent.append(x+"。")
- for n in getNers(list_sent,userselffool=True):
- for _entity in n:
- sql = " insert into fool_ner_train_1(filename,begin_index,end_index,type_0,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')"
- cursor.execute(sql)
- conn.commit()
- conn.close()
- save(list_filename_text,"list_filename_text_train_1.pk")
- return list_filename_text
- def makeLabel(sent,list_entity_type):
- _find_flag = False
- data_item = []
- list_entity_type.sort(key=lambda x:len(x[0]),reverse=True)
- for i in range(len(str(sent))):
- _item = []
- _item.append(sent[i])
- _item.append("O")
- data_item.append(_item)
-
- for _entity_type in list_entity_type:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, sent):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- return data_item,_find_flag
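- # Example (illustrative, hypothetical input): makeLabel tags each character with the
- # S_/B_/M_/E_ scheme and leaves unmatched characters as "O", e.g.
- #   makeLabel("招标人:某公司。", [["某公司", "company"]])
- #   -> ([['招','O'],['标','O'],['人','O'],[':','O'],
- #        ['某','B_company'],['公','M_company'],['司','E_company'],['。','O']], True)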
- def makeTrainTxt():
- '''
- @summary: generate the training data text files
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- list_filename_text = load("list_filename_text_train.pk")
- list_sent_label = []
-
-
- list_text_label = []
- for filename_text in list_filename_text:
- filename = filename_text[0]
- text = filename_text[1]
-
-
- sql = " select text,type_0,new_text,new_type from fool_ner_train where filename='"+filename+"' group by text,type_0,new_text,new_type"
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- rows.sort(key=lambda x:len(x[0]))
- list_entity = []
- for row in rows:
- entity = row[0]
- type = row[1]
- new_entity = row[2]
- new_type = row[3]
- _entitys = []
- if new_type is None or new_type=="" or new_type=="nan":
- _type = type
- else:
- _type = new_type
- if new_entity=="1":
- continue
- elif new_entity is None or new_entity =="" or new_entity=="nan":
- list_entity.append([entity,_type])
- _entitys.append([entity,_type])
- else:
- for _entity in new_entity.split("##"):
- list_entity.append([_entity,_type])
- _entitys.append([_entity,_type])
-
- if len(_entitys)>=2:
- data_item = []
- _find_flag = False
- for i in range(len(str(entity))):
- _item = []
- _item.append(entity[i])
- _item.append("O")
- data_item.append(_item)
- for _entity_type in _entitys:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, entity):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- if _find_flag:
- list_text_label.append(data_item)
-
- list_insert = ["根据","就",",",",","。",":",":"]
- for insert_item in list_insert:
- if np.random.random()>0.5:
- copy_data_item = copy.copy(data_item)
- list_index = []
- for i in range(len(copy_data_item)):
- _split = copy_data_item[i][1].split("_")
- if len(_split)==2:
- if _split[0]=="B":
- list_index.append(i)
- if _split[0]=="E":
- list_index.append(i+1)
- list_index.sort(key=lambda x:x,reverse=True)
- for _index in list_index:
- if np.random.random()>0.5:
- for j in range(len(insert_item)):
- copy_data_item.insert(_index+j,[insert_item[j],"O"])
- list_text_label.append(copy_data_item)
- list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"]
- for insert_item in list_insert:
- if np.random.random()>0.7:
- copy_data_item = copy.copy(data_item)
- list_index = []
- for i in range(len(copy_data_item)):
- _split = copy_data_item[i][1].split("_")
- if len(_split)==2:
- if _split[0]=="B":
- list_index.append(i)
- if _split[0]=="E":
- list_index.append(i+1)
- list_index.sort(key=lambda x:x,reverse=True)
- for _index in list_index:
- if np.random.random()>0.5:
- for j in range(len(insert_item)):
- copy_data_item.insert(_index+j,[insert_item[j],"O"])
- if np.random.random()>0.5:
- break
- list_text_label.append(copy_data_item)
- ''''''
- list_entity.sort(key=lambda x:len(x[0]),reverse=True)
-
-
- for _sent in text.split("。"):
- _sent+= "。"
-
- _find_flag = False
- data_item = []
- for i in range(len(str(_sent))):
- _item = []
- _item.append(_sent[i])
- _item.append("O")
- data_item.append(_item)
-
- for _entity_type in list_entity:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, _sent):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- # decide whether to add this sentence to the training data based on whether it contains any entity
- if _find_flag:
- list_sent_label.append(data_item)
- else:
- if np.random.random()>0.9:
- list_sent_label.append(data_item)
-
-
- ''' '''
- with codecs.open("ner_train.txt","w",encoding="utf8") as f:
- for _sent_label in list_sent_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
- with codecs.open("ner_train_split.txt","w",encoding="utf8") as f:
- for _sent_label in list_text_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
- def _load_map_file(path, char_map_name, id_map_name):
- with ZipFile(path) as myzip:
- with myzip.open('all_map.json') as myfile:
- content = myfile.readline()
- content = content.decode()
- data = json.loads(content)
- return data.get(char_map_name), data.get(id_map_name)
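- # Assumed layout of data/map.zip (an assumption inferred from the keys used below, not
- # verified against the actual file): all_map.json holds one JSON object whose "char_map"
- # maps characters to ids (including an "<OOV>" entry) and whose "ner_map" maps tag ids
- # (as strings) to tag names such as "O", "B_company", "E_company".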
-
- def getContext(file):
- char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
- id_to_tag = {int(k):v for k,v in id_to_seg.items()}
- tag_to_id = {v:int(k) for k,v in id_to_seg.items()}
- list_sent_label = []
- with codecs.open(file,"r",encoding="utf8") as f:
- sentence = []
- while(True):
- line = f.readline()
- if not line:
- break
- if len(line)==1:
- if len(sentence)>0:
- list_sent_label.append(sentence)
- sentence = []
- else:
- _word_id = char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>")
- _tag_id = tag_to_id.get(line.split()[-1].strip())
- sentence.append([_word_id,_tag_id])
- return list_sent_label
- def readlabeldata(file,list_context,MAX_LEN=300,keep_prob=1):
- '''
- @summary: read the labeled data from the file
- '''
-
- def addContext(_sentence,entity_sent,entity_label,id_B_company,id_E_company):
- _sent = []
- _label = []
- _flag = 0
- _find_flag = False
- for item in _sentence:
- if _flag==0:
- if item[1]==id_B_company:
- for word_id,tag_id in zip(entity_sent,entity_label):
- _sent.append(word_id)
- _label.append(tag_id)
- _flag = 1
- _find_flag = True
- else:
- _sent.append(item[0])
- _label.append(item[1])
- elif _flag==1:
- if item[1]==id_E_company:
- _flag = 2
- else:
- continue
- else:
- _sent.append(item[0])
- _label.append(item[1])
- return _sent,_label,_find_flag
-
- def spreadContext(_sent,_label,id_to_char,id_to_tag):
- list_sent_label = []
- for _word,_l in zip(_sent,_label):
- list_sent_label.append([id_to_char.get(_word),id_to_tag.get(_l)])
- print(list_sent_label)
-
- list_sent_label_lengths = []
- char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
- id_to_char = {int(v):k for k,v in char_to_id.items()}
- id_to_tag = {int(k):v for k,v in id_to_seg.items()}
- tag_to_id = {v:int(k) for k,v in id_to_seg.items()}
- id_B_company = tag_to_id.get("B_company")
- id_E_company = tag_to_id.get("E_company")
- with codecs.open(file,"r",encoding="utf8") as f:
- _sent = []
- _label = []
- while(True):
- line = f.readline()
- if not line:
- break
- if len(line)==1:
- if np.random.rand()<keep_prob:
- if len(_label)>0 and _label[0]==id_B_company and _label[-1]==id_E_company:
- if np.random.rand()<0.8:
- _int_random = np.random.randint(0,len(list_context))
- _sentence = list_context[_int_random]
- _sent_context,_label_context,_find_flag = addContext(_sentence, _sent, _label, id_B_company, id_E_company)
-
- if _find_flag:
- if len(_sent_context)<MAX_LEN:
- list_sent_label_lengths.append([_sent_context,_label_context,len(_sent_context)])
- else:
- if len(_sent)<MAX_LEN and len(_sent)>0:
- list_sent_label_lengths.append([_sent,_label,len(_sent)])
- '''
- print("====")
- spreadContext(_sent, _label, id_to_char, id_to_tag)
- spreadContext(_sent_context, _label_context, id_to_char, id_to_tag)
- print("====")
- '''
- else:
- if len(_sent)<MAX_LEN and len(_sent)>0:
- list_sent_label_lengths.append([_sent,_label,len(_sent)])
- _sent = []
- _label = []
- else:
- _sent.append(char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>"))
- tag = line.split()[-1].strip()
- _label.append(tag_to_id.get(tag))
- return list_sent_label_lengths
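- # Usage sketch (mirrors the commented-out call in __main__ below): augment bare entity
- # samples with sentence context drawn from previously labeled data.
- #   list_context = getContext("ner_train.txt")
- #   samples = readlabeldata("cleanedEntity.txt", list_context, MAX_LEN=300, keep_prob=1)
- #   # each sample is [char_ids, tag_ids, length]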
- def gt3():
- '''
- @summary: get the records whose wrong-entity label is longer than 3 characters
- '''
- list_articles = []
- list_filename_html_wrongEntity = load("list_filename_html_wrongEntity.pk")
- for row in list_filename_html_wrongEntity:
- if len(row[2])>1 or len(row[2][0])>3:
- list_articles.append(row)
- print(len(list_articles))
- save(list_articles,"list_filename_html_wrongEntity_gt3.pk")
-
- def selectByRule():
- '''
- @summary: query records matching the rules from the database, to make corrections easier
- '''
- conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- tables = ["fool_ner","fool_ner_train"]
- #tables = ["fool_ner"]
-
- for table in tables:
- sql = " select filename,type_0,text,new_type,new_text from "+table+" where ctid in (select max(ctid) from "+table+" where type_0 in ('org','company','location') group by filename,text) order by text "
- cursor.execute(sql)
-
- rows = cursor.fetchall()
- list_filename = []
- list_type_0 = []
- list_text = []
- list_new_type = []
- list_new_text = []
- pattern = "室"
- list_keyword = ["厂","所","出","院","部","行","局","社","采招办","酒店","办事处","分理处","管理处","集团","组织","支队","部队","支行","银行","支局","分行","分公司","公司","中心","医院","卫生院","小学","中学","大学","学校","政府","委员会","委会","财政局"]
- list_second_keyword = ["处","厅","园","委","队","室","站","会","办","馆","共和国","科技"]
- for row in rows:
- filename = row[0]
- type_0 = row[1]
- entity = row[2]
- new_type = row[3]
- new_entity = row[4]
-
- list_entity = []
- if new_type is None or new_type=="" or new_type=="nan":
- _type = type_0
- else:
- _type = new_type
- if new_entity=="1":
- continue
- elif new_entity is None or new_entity =="" or new_entity=="nan":
- list_entity.append([entity,_type,new_entity])
- else:
- for _entity in new_entity.split("##"):
- list_entity.append([_entity,_type,entity])
- _flag = False
- _index = 0
-
-
- for _entity in list_entity:
- '''
- if re.search('监狱.{,4}$',entity) is not None:
- _flag = True
- '''
- if (len(entity)>2 and entity[-1]==entity[-2]) or (len(entity)>4 and entity[-4:-2]==entity[-2:]):
- _flag = True
-
- '''
- pattern = "|".join(list_keyword)
- for _iter in re.finditer(pattern,text):
- if _iter.span()[1]>_index:
- _index = _iter.span()[1]
- new_text = text[:_index]
- if _index == 0:
- for _iter in re.finditer("|".join(list_second_keyword),text):
- if _iter.span()[1]>_index:
- _index = _iter.span()[1]
- new_text = text[:_index]
- '''
- '''
- for keyword in list_keyword:
- if _flag:
- break
- allindex = findAllIndex(keyword, text)
- if len(allindex)>0:
- _flag = True
- _index = allindex[-1]+len(keyword)
- new_text = text[:_index]
- '''
- if _flag:
- list_filename.append(filename)
- list_type_0.append(type_0)
- list_text.append(entity)
- list_new_type.append(new_type)
- list_new_text.append(new_entity)
- data = {"list_filename":list_filename,"list_type_0":list_type_0,"list_text":list_text,"list_new_type":list_new_type,"list_new_text":list_new_text}
- df = pd.DataFrame(data,columns=["list_filename","list_type_0","list_text","list_new_type","list_new_text"])
- df.to_excel(table+".xls")
-
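- # Note on the duplicate-tail rule in selectByRule above (examples are illustrative):
- # (len(entity)>2 and entity[-1]==entity[-2]) flags entities like "某某公司司", and
- # (len(entity)>4 and entity[-4:-2]==entity[-2:]) flags entities like "某某公司公司",
- # both of which usually indicate truncation/duplication errors in the recognizer output.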
- def makeDict_filename_content():
-
- dict_filename_content = {}
- path = "C:\\Users\\User\\Desktop\\fool语料\\*.html"
- set_doc_id = set()
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- doc_id = filename.split("_")[-1][:-5]
- text = codecs.open(file,"r",encoding="utf8").read()
- dict_filename_content[doc_id] = text
- list_path = ["C:\\Users\\User\\Desktop\\20190416要素\\*.html","C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html","C:\\Users\\User\\Desktop\\data_20190703\\*.html","C:\\Users\\User\\Desktop\\20190715\\*.html"]
- for path in list_path:
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- text = codecs.open(file,"r",encoding="utf8").read()
- dict_filename_content[filename] = text
- save(dict_filename_content,"dict_filename_content.pk")
-
- def importLabelData():
- conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- for file in glob.glob("label/*.xls"):
-
- if len(file.split("_"))>1:
- table = "fool_ner"
- else:
- table = "fool_ner_train"
- print(file,table)
- df = pd.read_excel(file)
- for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]):
- sql = " insert into "+table+" (filename,type_0,text,new_type,new_text) values('"+str(filename).replace(".0","")+"','"+str(type_0)+"','"+str(text)+"','"+str(new_type)+"','"+str(new_text)+"')"
- #sql = " update "+table+" set new_text='"+str(new_text)+"',new_type='"+str(new_type)+"' where filename='"+str(filename)+"' and text='"+str(text)+"' "
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def checklabel():
- '''
- @summary: check whether the labels are annotated correctly
- '''
- with codecs.open("ner_train.txt","r",encoding="utf8") as f:
- a = ""
- b = ""
- c = ""
- _index = 0
- while(True):
- _index += 1
- line = f.readline()
- if not line:
- break
- c = line.split(" ")[0].strip()
- if a=="新" and b=="乡" and c=="华":
- print(_index)
- a = b
- b = c
- def updateLabel():
- '''
- @summary: update the labeled data
- '''
- conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- tables = ["fool_ner","fool_ner_train"]
- for table in tables:
- file = table+".xls"
- df = pd.read_excel(file)
- for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]):
- sql = " update "+table+" set new_type='"+str(new_type)+"',new_text='"+str(new_text)+"' where filename='"+str(filename).replace(".0","")+"' and text='"+str(text)+"'"
- print(sql)
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def makeCertainEntity():
- fileList = ["C:\\Users\\User\\Desktop\\cleanedEntity.tsv","C:\\Users\\User\\Desktop\\company_found.tsv"]
- for file in fileList:
- outfile = file.split(".")[0]+".txt"
- with codecs.open(outfile,"w",encoding="utf8") as f_w:
- with codecs.open(file,"r",encoding="utf8") as f:
- while(True):
- line = f.readline().strip()
- if not line:
- break
- for i in range(len(line)):
- if i==0:
- f_w.write(line[i]+" B_company\n")
- elif i==len(line)-1:
- f_w.write(line[i]+" E_company\n")
- f_w.write("\n")
- else:
- f_w.write(line[i]+" M_company\n")
-
-
- def addContextToTheEntity(entity_file):
-
- def getContext(file):
- list_sent_label = []
- with codecs.open(file,"r",encoding="utf8") as f:
- sentence = []
- while(True):
- line = f.readline()
- if not line:
- break
- if len(line)==1:
- list_sent_label.append(sentence)
- sentence = []
- else:
- sentence.append([line[0],line.split()[-1].strip()])
- return list_sent_label
- list_sent_label = getContext("ner_label.txt")
- print("getContent done",len(list_sent_label))
- context_len = len(list_sent_label)
- outputfile = entity_file.split(".")[0]+"_addContext.txt"
- with codecs.open(outputfile,"w",encoding="utf8") as f_w:
- with codecs.open(entity_file,"r",encoding="utf8") as f_r:
- while(True):
- entity = f_r.readline().strip()
- if not entity:
- break
- random_int = np.random.randint(0,context_len)
- _sentence = list_sent_label[random_int]
- _flag = 0
- for item in _sentence:
- if _flag==0:
- if item[1]=="B_company":
- for word_index in range(len(entity)):
- if word_index==0:
- f_w.write(entity[word_index]+" B_company\n")
- elif word_index==len(entity)-1:
- f_w.write(entity[word_index]+" E_company\n")
- else:
- f_w.write(entity[word_index]+" M_company\n")
- _flag = 1
- else:
- f_w.write(item[0]+" "+item[1]+"\n")
- elif _flag==1:
- if item[1]=="E_company":
- _flag = 2
- else:
- continue
- else:
- f_w.write(item[0]+" "+item[1]+"\n")
- f_w.write("\n")
- def makeContext_by_fool_selffool():
- '''
- @summary: use the fool and selffool results to judge whether a sentence was recognized correctly; if fool and selffool agree it is treated as correct, otherwise it is left undecided
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- list_filename_text = []
- cursor = conn.cursor()
- file_index = 0
- for file in glob.glob("C:\\Users\\User\\Desktop\\测试数据20190812\\*.html"):
- try:
- filename = file.split("\\")[-1]
-
- sql = " select count(1) from articles_processed_selffool where id='"+filename+"'"
- cursor.execute(sql)
- rows = cursor.fetchall()
- if rows[0][0]>0:
- continue
-
- content = codecs.open(file,"r",encoding="utf8").read()
- print(file_index,filename)
- text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(content,"lxml")))
- _article = Article(id=filename, content=text, sourceContent="", doc_id="", title="")
- persistArticle(conn,[_article],"articles_processed_selffool")
- list_sentences = []
- _sent_index = 0
- set_sentences = set()
- for x in re.split("[。]", text):
- if len(x)>0:
- if x in set_sentences:
- continue
- set_sentences.add(x)
- _sentence = Sentences(doc_id=filename,sentence_index=_sent_index,sentence_text=x+"。",tokens=[],pos_tags=[],ner_tags=[])
- list_sentences.append(_sentence)
-
-
- _ner_fool = fool.ner(_sentence.sentence_text)
- _ner_selffool = Preprocessing.selffool.ner(_sentence.sentence_text)
-
- if len(set(_ner_fool[0])&set(_ner_selffool[0]))==len(_ner_fool[0]):
- table_entity = "entity_mention_selffool"
- else:
- table_entity = "entity_mention_selffool_notsame"
-
- list_entitys = []
- for item in _ner_selffool[0]:
- _entity_id = filename+"_"+str(_sent_index)+"_"+str(item[0])+"_"+str(item[1])
- _entity = Entity(doc_id=filename,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=_sent_index,begin_index=item[0],end_index=item[1])
- list_entitys.append(_entity)
- persistEntity(conn,list_entitys,table_entity)
- _sent_index += 1
- persistSentence(conn,list_sentences,"sentences_selffool")
- conn.commit()
- except Exception as e:
- print(e)
- conn.close()
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- finally:
- file_index += 1
-
- conn.close()
- def makeCompare():
- '''
- @summary: judge confidence by comparing fool with multiple selffool versions
- '''
-
-
- bilstm_new = BiLSTM()
- path_add = "new_model/"
- path = 'model/'+path_add+'model.ckpt'
- bilstm_new.restore(path)
-
-
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select doc_id,sentence_index,sentence_text from sentences_selffool A where exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='selffool') and not exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='fool') "
-
- cursor.execute(sql)
- rows = cursor.fetchall()
- table_entity = "entity_mention_selffool_notsame"
- _index = 0
- try:
- for row in rows:
- _index += 1
- print(_index,len(rows))
- doc_id = row[0]
- sentence_index = row[1]
- '''
- sql = " select count(1) from "+table_entity+" where doc_id='"+doc_id+"' and sentence_index="+str(sentence_index)+" and fool_version='fool' "
- cursor.execute(sql)
- count_rows = cursor.fetchall()
- if count_rows[0][0]>0:
- continue
- '''
- text = row[2]
- _ner_entity_fool = set()
- _ner_entity_selffool = set()
- _ner_fool = fool.ner(text)[0]
- _ner_selffool = bilstm_new.ner(text)[0]
-
- list_entitys = []
- for item in _ner_fool:
- _entity_id = doc_id+"_"+str(sentence_index)+"_"+str(item[0])+"_"+str(item[1])
- _entity = Entity(doc_id=doc_id,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=sentence_index,begin_index=item[0],end_index=item[1])
- list_entitys.append(_entity)
- persistEntity(conn,list_entitys,table_entity)
-
- conn.commit()
- '''
- for item in _ner_fool:
- if item[2] in ["org","company"]:
- _ner_entity_fool.add(item)
- for item in _ner_selffool:
- if item[2] in ["org","company"]:
- _ner_entity_selffool.add(item)
- if len(_ner_entity_fool&_ner_entity_selffool)==len(_ner_entity_fool) and len(_ner_entity_fool)==len(_ner_entity_selffool):
- print(text)
- print(_ner_selffool)
- '''
- except Exception as e:
- print(e)
- conn.close()
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- conn.close()
- def cluster_difference():
- '''
- @summary: cluster the truncated entity tails
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select entity_id,doc_id,sentence_index,begin_index,end_index,entity_type,entity_text,fool_version from entity_mention_selffool_notsame where entity_type in ('org','company') order by entity_id "
- cursor.execute(sql)
- rows = cursor.fetchall()
-
- row_begin = 0
- DIFF_LEN = 2
- dict_diff_list = dict()
- while(row_begin<len(rows)-1):
- print(row_begin,len(rows))
- doc_id = rows[row_begin][1]
- sentence_index = rows[row_begin][2]
- row_end = row_begin
- for _i in range(row_begin+1,len(rows)):
- row_end = _i
- if rows[_i][1]==doc_id and rows[_i][2]==sentence_index:
- continue
- else:
- break
-
- list_entitys_fool = []
- list_entitys_selffool = []
- # group the entities belonging to the same sentence of the same article
- for _row in rows[row_begin:row_end]:
- entity_id = _row[0]
- begin_index = _row[3]
- end_index = _row[4]-1
- entity_type = _row[5]
- entity_text = _row[6]
- fool_version = _row[7]
- if entity_type in ["org","company"]:
- _entity = [entity_id,begin_index,end_index,entity_text,entity_type]
- if fool_version=="fool":
- list_entitys_fool.append(_entity)
- else:
- list_entitys_selffool.append(_entity)
- row_begin = row_end
- # iterate over the fool and selffool results separately and cluster the differences
- list_key_entityid = []
- for _entity_fool in list_entitys_fool:
- entity_id_src = _entity_fool[0]
- begin_index_src = _entity_fool[1]
- end_index_src = _entity_fool[2]
- entity_text_src = _entity_fool[3]
- entity_type_src = _entity_fool[4]
-
- for _entity_selffool in list_entitys_selffool:
- entity_id_des = _entity_selffool[0]
- begin_index_des = _entity_selffool[1]
- end_index_des = _entity_selffool[2]
- entity_text_des = _entity_selffool[3]
- entity_type_des = _entity_selffool[4]
-
- if min(end_index_src,end_index_des)>max(begin_index_des,begin_index_src):
- if begin_index_src==begin_index_des:
- _key_begin = "SAME"
- else:
- _key_begin = entity_text_src[0:min(DIFF_LEN,len(entity_text_src))]+"#"+entity_text_des[0:min(DIFF_LEN,len(entity_text_des))]
- if end_index_src==end_index_des:
- _key_end = "SAME"
- else:
- _key_end = entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]+"#"+entity_text_des[-min(DIFF_LEN,len(entity_text_des)):]
- _key = _key_begin+"|"+_key_end
- list_key_entityid.append([_key,[entity_id_src,entity_id_des]])
- # check whether the entity appears in only one version
- for _entity_fool in list_entitys_fool:
- entity_id_src = _entity_fool[0]
- begin_index_src = _entity_fool[1]
- end_index_src = _entity_fool[2]
- entity_text_src = _entity_fool[3]
- entity_type_src = _entity_fool[4]
-
- find_flag = False
- for item in list_key_entityid:
- if entity_id_src in item[1]:
- find_flag = True
- if not find_flag:
- _key = "fool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]
- list_key_entityid.append([_key,[entity_id_src]])
- for _entity_fool in list_entitys_selffool:
- entity_id_src = _entity_fool[0]
- begin_index_src = _entity_fool[1]
- end_index_src = _entity_fool[2]
- entity_text_src = _entity_fool[3]
- entity_type_src = _entity_fool[4]
-
- find_flag = False
- for item in list_key_entityid:
- if entity_id_src in item[1]:
- find_flag = True
- if not find_flag:
- _key = "selffool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]
- list_key_entityid.append([_key,[entity_id_src]])
- # clustering
- for item in list_key_entityid:
- find_flag = False
- if item[0]=="SAME|SAME":
- continue
- for _key in dict_diff_list.keys():
- if item[0]==_key:
- dict_diff_list[_key].append(item[1])
- find_flag = True
- if not find_flag:
- dict_diff_list[item[0]] = [item[1]]
-
- print(len(dict_diff_list.keys()))
- list_key_count = []
- for _key in dict_diff_list.keys():
- list_key_count.append([_key,len(dict_diff_list[_key])])
- list_key_count.sort(key=lambda x:x[1],reverse=True)
- with codecs.open("diff_key_count.txt","w",encoding="utf8") as f:
- for item in list_key_count:
- f.write(item[0]+"\t"+str(item[1])+"\n")
-
- save(dict_diff_list,"dict_diff_list.pk")
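- # Cluster key format (derived from the code above): each key is "<begin-diff>|<end-diff>",
- # where a side is "SAME" when fool and selffool agree on that boundary and otherwise the
- # first/last DIFF_LEN characters of both entity texts joined by "#" (e.g. "市人#某某|SAME");
- # keys of the form "fool|..." or "selffool|..." mark entities found by only one version.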
-
- dict_sentence = None
- def get_sentence(doc_id,sentence_index):
- global dict_sentence
- file_dict_sentence = "dict_sentence.pk"
- if dict_sentence is None:
- if os.path.exists(file_dict_sentence):
- dict_sentence = load(file_dict_sentence)
- else:
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select doc_id,sentence_index,sentence_text from sentences_selffool "
- cursor.execute(sql)
-
- dict_sentence = dict()
- rows = cursor.fetchall()
- for row in rows:
- _doc_id = row[0]
- _sentence_index = row[1]
- _sentence_text = row[2]
- _key = _doc_id+str(_sentence_index)
- dict_sentence[_key] = _sentence_text
- save(dict_sentence,file_dict_sentence)
- _key = doc_id+str(sentence_index)
- if _key in dict_sentence.keys():
- return dict_sentence[_key]
- return None
- dict_diff_list = None
- def viewEntityByKey():
- global dict_diff_list
- if dict_diff_list is None:
- dict_diff_list = load("dict_diff_list.pk")
- CONTEXT_LEN = 15
- for key in dict_diff_list.keys():
- diff_list = dict_diff_list[key]
- file = "cluster_view/"+re.sub("[\*\|\/\r\n:]","",key.strip())+".xls"
- if os.path.exists(file):
- continue
- list_entityid = []
- list_before = []
- list_center = []
- list_after = []
- list_change = []
- list_type = []
- list_version = []
- if len(diff_list[0])==2:
- for item in diff_list:
- for i in range(len(item)):
- if i==0:
- list_version.append("fool")
- else:
- list_version.append("selffool")
- entityid = item[i]
- split_entityid = entityid.split("html")[1].split("_")
- doc_id = entityid.split("html")[0]+"html"
- sentence_index = split_entityid[1]
- sentence_text = get_sentence(doc_id, sentence_index)
- begin_index = int(split_entityid[2])
- end_index = int(split_entityid[3])-1
- list_entityid.append(entityid)
- before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index]
- center = sentence_text[begin_index:end_index]
- after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))]
- list_before.append(before)
- list_center.append(center)
- list_after.append(after)
- list_change.append(center)
- list_type.append("")
- else:
- version = key.split("|")[0]
- for item in diff_list:
- list_version.append(version)
- entityid = item[0]
- split_entityid = entityid.split("html")[1].split("_")
- doc_id = entityid.split("html")[0]+"html"
- sentence_index = split_entityid[1]
- sentence_text = get_sentence(doc_id, sentence_index)
- begin_index = int(split_entityid[2])
- end_index = int(split_entityid[3])-1
- list_entityid.append(entityid)
- before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index]
- center = sentence_text[begin_index:end_index]
- after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))]
- list_before.append(before)
- list_center.append(center)
- list_after.append(after)
- list_change.append(center)
- list_type.append("")
- data = {"list_entityid":list_entityid,
- "list_before":list_before,
- "list_center":list_center,
- "list_after":list_after,
- "list_change":list_change,
- "list_type":list_type,
- "list_version":list_version}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel(file)
-
- def alterFileByRule(file):
- df = pd.read_excel(file)
- _location = "location"
- for _index in range(len(df["list_entityid"])):
- version = df["list_version"][_index]
- if version=="selffool":
- ''''''
-
- df["list_change"][_index] = df["list_change"][_index-1]
-
- df.to_excel(file,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- def getCompanyByTianyan():
- token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56"
- headers = {"Authorization":token}
- url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
- _inTianYan = "inTianYan"
- _inSource = "inSource"
- _dict = load("dict_company.pk")
-
-
- count = 0
- for entity in load("set_company.pk"):
- try:
- count += 1
- print(count,len(_dict.keys()))
- if entity in _dict:
- _dict[entity][_inSource] = True
- if _dict[entity][_inTianYan]:
- continue
- else:
- _dict[entity] = {_inTianYan:False,_inSource:True}
- r = requests.get(url+entity,headers=headers)
- r_json = r.json()
- if r_json["error_code"]==0:
- for item in r_json["result"]["items"]:
- companyName = re.sub("</?em>","",item["name"]).replace("(","(").replace(")",")")
- if companyName in _dict:
- _dict[companyName][_inTianYan] = True
- else:
- _dict[companyName] = {_inTianYan:True,_inSource:False}
- elif r_json["error_code"]==300007:
- print("剩余次数不足")
- break
- except Exception as e:
- print(str(e))
- save(_dict,"dict_company.pk")
-
- def labelByTianyan():
- '''
- @summary: obtain labels via the Tianyancha data API
- '''
- list_entityid = []
- list_before = []
- list_center = []
- list_after = []
- list_change = []
- list_type = []
- list_version = []
-
- list_entityid_notmatch = []
- list_before_notmatch = []
- list_center_notmatch = []
- list_after_notmatch = []
- list_change_notmatch = []
- list_type_notmatch = []
- list_version_notmatch = []
- _inTianYan = "inTianYan"
- _inSource = "inSource"
- _dict_company = load("dict_company.pk")
- is_compare = False
- for file in glob.glob("cluster_view/add/*.xls"):
- df = pd.read_excel(file)
- for _index in range(len(df["list_change"])):
- version = df["list_version"][_index]
- if version in ["selffool","fool"]:
- _match_count = 0
- true_entity = None
- if df["list_change"][_index] in _dict_company:
- if _dict_company[df["list_change"][_index]][_inTianYan]:
- _match_count += 1
- true_entity = df["list_change"][_index]
- if is_compare:
- if df["list_change"][_index-1] in _dict_company:
- if _dict_company[df["list_change"][_index-1]][_inTianYan]:
- _match_count += 1
- true_entity = df["list_change"][_index-1]
- if _match_count==1:
- if is_compare:
- list_entityid.append(df["list_entityid"][_index-1])
- list_before.append(df["list_before"][_index-1])
- list_center.append(df["list_center"][_index-1])
- list_after.append(df["list_after"][_index-1])
- list_change.append(df["list_change"][_index-1])
- list_type.append(df["list_type"][_index-1])
- list_version.append(df["list_version"][_index-1])
- list_entityid.append(df["list_entityid"][_index])
- list_before.append(df["list_before"][_index])
- list_center.append(df["list_center"][_index])
- list_after.append(df["list_after"][_index])
- list_change.append(true_entity)
- list_type.append(df["list_type"][_index])
- list_version.append(df["list_version"][_index])
- else:
- if is_compare:
- list_entityid_notmatch.append(df["list_entityid"][_index-1])
- list_before_notmatch.append(df["list_before"][_index-1])
- list_center_notmatch.append(df["list_center"][_index-1])
- list_after_notmatch.append(df["list_after"][_index-1])
- list_change_notmatch.append(df["list_change"][_index-1])
- list_type_notmatch.append(df["list_type"][_index-1])
- list_version_notmatch.append(df["list_version"][_index-1])
- list_entityid_notmatch.append(df["list_entityid"][_index])
- list_before_notmatch.append(df["list_before"][_index])
- list_center_notmatch.append(df["list_center"][_index])
- list_after_notmatch.append(df["list_after"][_index])
- list_change_notmatch.append(df["list_change"][_index])
- list_type_notmatch.append(df["list_type"][_index])
- list_version_notmatch.append(df["list_version"][_index])
- data = {"list_entityid":list_entityid,
- "list_before":list_before,
- "list_center":list_center,
- "list_after":list_after,
- "list_change":list_change,
- "list_type":list_type,
- "list_version":list_version}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/add_match.xls")
-
- nums = 50000
- _begin = 0
- while(_begin<len(list_entityid_notmatch)):
- data = {"list_entityid":list_entityid_notmatch[_begin:_begin+nums],
- "list_before":list_before_notmatch[_begin:_begin+nums],
- "list_center":list_center_notmatch[_begin:_begin+nums],
- "list_after":list_after_notmatch[_begin:_begin+nums],
- "list_change":list_change_notmatch[_begin:_begin+nums],
- "list_type":list_type_notmatch[_begin:_begin+nums],
- "list_version":list_version_notmatch[_begin:_begin+nums]}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/add_notmatch_"+str(_begin)+".xls")
- _begin += nums
-
- def cluster_entitys():
- '''
- @summary: cluster the entities and apply consistent truncation
- '''
- tail_pattern = re.compile("学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
-
- dict_tail_entitys = {}
- listfile = ["cluster_view/*_match.xls","cluster_view/done/*.xls","cluster_view/tofix/done/*.xls"]
- count = 0
- for globfile in listfile:
- for file in glob.glob(globfile):
- isadd = re.search("fool|add",file) is not None
- count += 1
- print(count,file)
- df = pd.read_excel(file)
- list_entityid = df["list_entityid"]
- list_before = df["list_before"]
- list_center = df["list_center"]
- list_after = df["list_after"]
- list_change = df["list_change"]
- list_type = df["list_type"]
- list_version = df["list_version"]
- for _index in range(len(list_entityid)):
- '''
- #skip entries labeled as 1
- if str(list_change[_index])=="1":
- continue
- '''
- # skip the fool rows that are only used for comparison labeling
- if not isadd and list_version[_index]=="fool":
- continue
- if str(list_change[_index])=="1":
- _key = "-1-"
- else:
- _find = re.findall(tail_pattern,str(list_change[_index]))
- if len(_find)==0:
- _key = "other"
- else:
- _key = "-".join(_find)
- if _key in dict_tail_entitys:
- dict_tail_entitys[_key]["list_entityid"].append(list_entityid[_index])
- dict_tail_entitys[_key]["list_before"].append(list_before[_index])
- dict_tail_entitys[_key]["list_center"].append(list_center[_index])
- dict_tail_entitys[_key]["list_after"].append(list_after[_index])
- dict_tail_entitys[_key]["list_change"].append(list_change[_index])
- dict_tail_entitys[_key]["list_type"].append(list_type[_index])
- dict_tail_entitys[_key]["list_version"].append(list_version[_index])
- else:
- dict_tail_entitys[_key] = {"list_entityid":[list_entityid[_index]],
- "list_before":[list_before[_index]],
- "list_center":[list_center[_index]],
- "list_after":[list_after[_index]],
- "list_change":[list_change[_index]],
- "list_type":[list_type[_index]],
- "list_version":[list_version[_index]]}
- print(len(dict_tail_entitys.keys()))
- for _key in dict_tail_entitys.keys():
-
- filename = "cluster_view/cluster/"+_key+".xls"
- nums = 50000
- _begin = 0
- if os.path.exists(filename):
- continue
- while(_begin*nums<len(dict_tail_entitys[_key]["list_entityid"])):
- data = {"list_entityid":dict_tail_entitys[_key]["list_entityid"][_begin*nums:(_begin+1)*nums],
- "list_before":dict_tail_entitys[_key]["list_before"][_begin*nums:(_begin+1)*nums],
- "list_center":dict_tail_entitys[_key]["list_center"][_begin*nums:(_begin+1)*nums],
- "list_after":dict_tail_entitys[_key]["list_after"][_begin*nums:(_begin+1)*nums],
- "list_change":dict_tail_entitys[_key]["list_change"][_begin*nums:(_begin+1)*nums],
- "list_type":dict_tail_entitys[_key]["list_type"][_begin*nums:(_begin+1)*nums],
- "list_version":dict_tail_entitys[_key]["list_version"][_begin*nums:(_begin+1)*nums]}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/cluster/"+_key+"-"+str(_begin)+".xls")
- _begin += 1
-
- def tofix():
- '''
- @summary: fetch all data to be labeled, filter some out with rules, then split it evenly
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- sql = " select entity_id,fool_version,entity_text from entity_mention_selffool_notsame"
- cursor.execute(sql)
-
- dict_idversion_entity = {}
- rows = cursor.fetchall()
- for row in rows:
- entity_id = row[0]
- fool_version = row[1]
- entity_text = row[2]
- _key = entity_id+"-"+fool_version
- dict_idversion_entity[_key] = entity_text
-
- list_entityid_tofix = []
- list_before_tofix = []
- list_center_tofix = []
- list_after_tofix = []
- list_change_tofix = []
- list_type_tofix = []
- list_version_tofix = []
-
- list_entityid_nottofix = []
- list_before_nottofix = []
- list_center_nottofix = []
- list_after_nottofix = []
- list_change_nottofix = []
- list_type_nottofix = []
- list_version_nottofix = []
-
- nottofix_pattern = "^[A-Za-z0-9\-]*[省市区县州镇]|^[A-Za-z0-9\-]+$"
- listfile = ["cluster_view/*notmatch*.xls"]
- count = 0
- data = []
- for globfile in listfile:
- for file in glob.glob(globfile):
- isadd = re.search("fool|add",file) is not None
- count += 1
- print(count)
- df = pd.read_excel(file)
- list_entityid = df["list_entityid"]
- list_before = df["list_before"]
- list_center = df["list_center"]
- list_after = df["list_after"]
- list_change = df["list_change"]
- list_type = df["list_type"]
- list_version = df["list_version"]
- for _index in range(len(list_entityid)):
- if not isadd and list_version[_index]=="fool":
- continue
- _key = str(list_entityid[_index])+"-"+str(list_version[_index])
- if _key in dict_idversion_entity and dict_idversion_entity[_key]!=list_center[_index]:
- list_center[_index] = dict_idversion_entity[_key]
- list_change[_index] = dict_idversion_entity[_key]
- data.append([str(list_entityid[_index]),str(list_before[_index]),str(list_center[_index]),str(list_after[_index]),str(list_change[_index]),str(list_type[_index]),str(list_version[_index])])
- data.sort(key=lambda x:x[4])
-
- for item in data:
- entityid = item[0]
- before = item[1]
- center = item[2]
- after = item[3]
- change = item[4]
- type = item[5]
- version = item[6]
- if re.search(nottofix_pattern,change) is not None:
- list_entityid_nottofix.append(entityid)
- list_before_nottofix.append(before)
- list_center_nottofix.append(center)
- list_after_nottofix.append(after)
- list_change_nottofix.append(change)
- list_type_nottofix.append(type)
- list_version_nottofix.append(version)
- else:
- list_entityid_tofix.append(entityid)
- list_before_tofix.append(before)
- list_center_tofix.append(center)
- list_after_tofix.append(after)
- list_change_tofix.append(change)
- list_type_tofix.append(type)
- list_version_tofix.append(version)
-
- parts = 16
- nums = len(list_entityid_tofix)//parts
- _begin = 0
- while(_begin*nums<len(list_entityid_tofix)):
- data = {"list_entityid":list_entityid_tofix[_begin*nums:(_begin+1)*nums],
- "list_before":list_before_tofix[_begin*nums:(_begin+1)*nums],
- "list_center":list_center_tofix[_begin*nums:(_begin+1)*nums],
- "list_after":list_after_tofix[_begin*nums:(_begin+1)*nums],
- "list_change":list_change_tofix[_begin*nums:(_begin+1)*nums],
- "list_type":list_type_tofix[_begin*nums:(_begin+1)*nums],
- "list_version":list_version_tofix[_begin*nums:(_begin+1)*nums]}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/tofix/"+str(_begin)+".xls")
- _begin += 1
- data = {"list_entityid":list_entityid_nottofix,
- "list_before":list_before_nottofix,
- "list_center":list_center_nottofix,
- "list_after":list_after_nottofix,
- "list_change":list_change_nottofix,
- "list_type":list_type_nottofix,
- "list_version":list_version_nottofix}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/tofix/nottofix.xls")
-
- def updateEntityview():
- '''
- @summary: write the data from the view back to the database, taking the selffool rows as the basis for updates; for rows added by fool, perform an insert and change fool_version to selffool_add
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- listfile = ["cluster_view/cluster/*.xls"]
- count = 0
- for globfile in listfile:
- for file in glob.glob(globfile):
- count += 1
- print(count,file)
- df = pd.read_excel(file)
- for _index in range(len(df["list_entityid"])):
- entity_id = df["list_entityid"][_index]
- doc_id = entity_id.split("html")[0]+"html"
- list_index = entity_id.split("html")[1].split("_")
- sentence_index = list_index[1]
- begin_index = list_index[2]
- end_index = list_index[3]
- change = str(df["list_change"][_index])
- type = str(df["list_type"][_index])
- version = str(df["list_version"][_index])
- if version=="fool":
- sql = " update entity_mention_selffool_notsame set new_text='"+str(change)+"',new_type='"+type+"',fool_version='fool_add' where entity_id='"+entity_id+"' and fool_version='fool' "
- cursor.execute(sql)
- else:
- sql = " update entity_mention_selffool_notsame set new_text='"+str(change)+"',new_type='"+type+"' where entity_id='"+entity_id+"' and fool_version='selffool' "
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def makeLabelText():
- '''
- @summary: updating the database is too slow, so query the data directly from the database, apply the replacements, and then generate the training data
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- # build the replacement dict from the clustering results
- dict_replace = dict()
- listfile = ["cluster_view/cluster/*.xls"]
- count = 0
- for globfile in listfile:
- for file in glob.glob(globfile):
- count += 1
- print(count,file)
- df = pd.read_excel(file)
- for _index in range(len(df["list_entityid"])):
- entity_id = df["list_entityid"][_index]
- doc_id = entity_id.split("html")[0]+"html"
- list_index = entity_id.split("html")[1].split("_")
- sentence_index = list_index[1]
- begin_index = list_index[2]
- end_index = list_index[3]
- change = str(df["list_change"][_index])
- type = str(df["list_type"][_index])
- version = str(df["list_version"][_index])
- _key = entity_id+version
- dict_replace[_key] = [change,type]
- print("get dict_replace done")
- # query the entities from the database, ordered by entity_id
- data_entity = []
- sql = " select entity_id,doc_id,sentence_index,entity_text,entity_type,'selffool' from entity_mention_selffool order by entity_id "
- cursor.execute(sql)
- rows = cursor.fetchall()
- data_entity = data_entity+rows
- sql = " select entity_id,doc_id,sentence_index,entity_text,entity_type,fool_version from entity_mention_selffool_notsame order by entity_id "
- cursor.execute(sql)
- rows = cursor.fetchall()
- data_entity = data_entity+rows
- # build a dict mapping doc_id-sentence_index to a list of [entity, type]
- dict_sent_entitys = dict()
- _begin = 0
- while(_begin<len(data_entity)-1):
- _begin_doc_id = data_entity[_begin][1]
- _begin_sentence_index = data_entity[_begin][2]
- _end = _begin
- print(_begin)
- for end in range(_begin+1,len(data_entity)):
- _end = end
- _end_doc_id = data_entity[end][1]
- _end_sentence_index = data_entity[end][2]
- if _begin_doc_id==_end_doc_id and _begin_sentence_index==_end_sentence_index:
- continue
- else:
- break
- for item in data_entity[_begin:_end]:
- entity_id = item[0]
- doc_id = item[1]
- sentence_index = item[2]
- entity_text = item[3]
- entity_type = item[4]
- version = item[5]
- _key = doc_id+"-"+str(sentence_index)
- if _key not in dict_sent_entitys:
- dict_sent_entitys[_key] = []
- # apply the replacement
- if entity_type in ["org","company"]:
- _key1 = entity_id+version
- if _key1 in dict_replace:
- if str(dict_replace[_key1][0])=="1":
- continue
- if dict_replace[_key1][1] in ["org","company","person","location"]:
- for item in re.split("##",dict_replace[_key1][0]):
- dict_sent_entitys[_key].append([item,dict_replace[_key1][1]])
- else:
- for item in re.split("##",dict_replace[_key1][0]):
- dict_sent_entitys[_key].append([item,entity_type])
- else:
- if version=="selffool":
- dict_sent_entitys[_key].append([entity_text,entity_type])
- else:
- dict_sent_entitys[_key].append([entity_text,entity_type])
- _begin = _end
- print("get dict_sent_entitys done")
- # query the sentences
- sql = " select doc_id,sentence_index,sentence_text from sentences_selffool order by doc_id "
- cursor.execute(sql)
-
- list_sentence = cursor.fetchall()
- count = 0
- with codecs.open("selffool_train.txt","w",encoding="utf8") as f:
-
-
- for sent in list_sentence:
-
- count += 1
- print(count)
- _key = sent[0]+"-"+str(sent[1])
- sentence = sent[2]
- if len(sentence)>2000:
- continue
- if _key in dict_sent_entitys:
- data_item,_find_flag = makeLabel(sentence, dict_sent_entitys[_key])
- for _word,_label in data_item:
- f.write(_word+" "+_label+"\n")
- else:
- if np.random.random()>0.8:
- data_item,_find_flag = makeLabel(sentence, [])
- for _word,_label in data_item:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
-
-
-
- if __name__=="__main__":
- #makeFoolTrainData()
- #makeTrainTxt()
- #labelEntity()
- #readlabeldata("cleanedEntity.txt",getContext("ner_train.txt"))
- #makeDict_filename_content()
- #selectByRule()
- #updateLabel()
- #importLabelData()
- #makeCertainEntity()
- #addContextToTheEntity("company_found.tsv")
- #makeContext_by_fool_selffool()
- #makeCompare()
- #cluster_difference()
- #viewEntityByKey()
- #alterFileByRule("cluster_view/change/SAME版社#大学.xls")
- #getCompanyByTianyan()
- '''
- data = load("dict_company.pk")
- for item in data.keys():
- print(item,data[item])
- '''
- #labelByTianyan()
- '''
- token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56"
- headers = {"Authorization":token}
- url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
- r = requests.get(url+"安阳鑫龙煤业(集团)龙山煤业有限责任公司",headers=headers)
- r_json = r.json()
- print(r_json)
- '''
- #tofix()
- cluster_entitys()
- makeLabelText()