'''
Created on 2019-06-13

@author: User

Utilities for comparing NER output of the `fool` package against a
self-trained model, and for sorting/cleaning lists of company names.
'''
import fool
from bi_lstm_crf import *
import pandas as pd
import codecs
import re

# Full-width parentheses (U+FF08 / U+FF09) mapped to their ASCII forms.
_PAREN_TABLE = str.maketrans({"\uff08": "(", "\uff09": ")"})

# Patterns hoisted out of the per-line loops.
_COMPANY_SUFFIX = re.compile("公司$")
_QUOTE_OR_SPACE = re.compile(r'["\s]')
_BR_OR_SLASH = re.compile("br|/")


def _iter_nonblank_lines(path):
    '''
    Yield every non-blank line of the UTF-8 file `path`, stripped of
    surrounding whitespace.

    Blank lines are skipped instead of being treated as end-of-file, so a
    stray empty line in the middle of the input no longer truncates it.
    '''
    with codecs.open(path, "r", encoding="utf8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                yield stripped


def compare(text):
    '''
    Print the entities found in `text` by `fool` and by the module-level
    `bilstm` model; when every `fool` entity is also reported by `bilstm`
    (intersection size equals the number of `fool` entities), print the
    intersection as well.

    NOTE(review): `bilstm` is not defined by the code visible in this file
    (its restore call in the main block is commented out) -- it has to be
    provided at module scope, e.g. by `bi_lstm_crf`, before calling this.
    '''
    fool_result = fool.ner(text)
    print(fool_result)
    # Previously disabled setup, kept for reference:
    # bilstm.initVariables()
    # init_op = tf.global_variables_initializer()
    # sess.run(init_op)
    # summaryWriter = tf.summary.FileWriter('log/', tf.get_default_graph())
    self_result = bilstm.ner(text)
    print(self_result)
    # Each model is run once and the result reused for the comparison.
    shared = set(fool_result[0]) & set(self_result[0])
    if len(shared) == len(fool_result[0]):
        print(shared)


def dealNotFoundEntity():
    '''
    @summary: process names whose entities could not be separated.

    Reads the "name" column of the input workbook, runs both NER models on
    every name and writes a new workbook that adds two columns holding the
    recognised entity texts joined with "##" (empty for NaN names).
    '''
    df = pd.read_excel("C:\\Users\\User\\Desktop\\无法分离实体名称.xlsx")
    names = df["name"]
    total = len(names)
    list_newname_fool = []
    list_newname_selffool = []
    for position, name in enumerate(names, 1):
        print(name)
        # Missing cells come back from pandas as float NaN.
        if str(name) == "nan":
            list_newname_fool.append("")
            list_newname_selffool.append("")
            continue
        print(position, total)
        # Index 3 of each entity tuple is the entity text.
        list_newname_fool.append(
            "##".join(ent[3] for ent in fool.ner(name)[0]))
        list_newname_selffool.append(
            "##".join(ent[3] for ent in bilstm.ner(name)[0]))
    columns = ["id", "area", "province", "city", "district", "name",
               "newname_fool", "newname_selffool"]
    data = {
        "id": df["id"],
        "area": df["area"],
        "province": df["province"],
        "city": df["city"],
        "district": df["district"],
        "name": df["name"],
        "newname_fool": list_newname_fool,
        "newname_selffool": list_newname_selffool,
    }
    result = pd.DataFrame(data, columns=columns)
    result.to_excel("C:\\Users\\User\\Desktop\\无法分离实体名称_deal.xls")


def nerEntity():
    '''
    Split company names into "found" and "not found" files.

    For every non-blank input line that ends with "公司", the name is run
    through the module-level `bilstm` model. If the model returns exactly
    one entity whose text equals the whole name it is written to the
    "found" file, otherwise to the "not found" file. Lines that do not end
    with "公司" are ignored.

    NOTE(review): `bilstm` must exist at module scope; see `compare`.
    '''
    source_path = "C:\\Users\\User\\Desktop\\select_company_name_from_bxkc_C_CONTACT_.tsv"
    found_path = "C:\\Users\\User\\Desktop\\company_found.tsv"
    notfound_path = "C:\\Users\\User\\Desktop\\company_notfound.tsv"
    with codecs.open(found_path, "w", encoding="utf8") as f_found, \
            codecs.open(notfound_path, "w", encoding="utf8") as f_notfound:
        for line in _iter_nonblank_lines(source_path):
            # Normalise full-width parentheses to ASCII. The previous
            # re.sub("(", ...) / re.sub(")", ...) calls used unescaped regex
            # metacharacters and raised re.error.
            entity = line.translate(_PAREN_TABLE)
            if not _COMPANY_SUFFIX.search(entity):
                continue
            entities = bilstm.ner(entity)[0]
            if len(entities) == 1 and entities[0][3] == entity:
                f_found.write(entity + "\n")
            else:
                f_notfound.write(entity + "\n")


def cleanEntity():
    '''
    Move entities containing "br" or "/" out of the source file.

    Reads the source file, removes double quotes and whitespace from each
    line and de-duplicates the results. Entities matching "br|/" are
    written to the temp file; all others are written back to the source
    file. Both outputs are ordered by length, then lexicographically.

    The source file is fully read before it is reopened for writing.
    '''
    source_file = "C:\\Users\\User\\Desktop\\notcleanedEntity.tsv"
    temp_file = "C:\\Users\\User\\Desktop\\temp.tsv"
    cleaned = set()
    not_cleaned = set()
    for line in _iter_nonblank_lines(source_file):
        entity = _QUOTE_OR_SPACE.sub("", line)
        if not entity:
            # Line held only quotes; writing it back would add a blank line.
            continue
        if _BR_OR_SLASH.search(entity) is not None:
            cleaned.add(entity)
        else:
            not_cleaned.add(entity)

    # Tie-break on the text so the output order does not depend on set
    # iteration order.
    def by_length(item):
        return (len(item), item)

    with codecs.open(temp_file, "w", encoding="utf8") as f_ce, \
            codecs.open(source_file, "w", encoding="utf8") as f_nce:
        for item in sorted(cleaned, key=by_length):
            f_ce.write(item + "\n")
        for item in sorted(not_cleaned, key=by_length):
            f_nce.write(item + "\n")


if __name__ == "__main__":
    # Alternative model setup, currently disabled:
    # path_add = "0-12/"
    # path = 'model/'+path_add+'model.ckpt'
    # bilstm = BiLSTM().restore(path)
    bertCrf = BertCRF().restore()
    text = '小册子一批采购计划一、采购人:广州市比地数据科技有限公司,二、采购项目编号:'
    print(bertCrf.ner(text))
    # dealNotFoundEntity()
    # cleanEntity()