'''
Created on 2019-06-13
@author: User
'''
- import fool
- from bi_lstm_crf import *
- import pandas as pd
- import codecs
- import re
- ''''''
def compare(text):
    """Print the NER output of ``fool`` and of ``bilstm`` for *text* and compare them.

    Both results are printed first. If every entity in the first element of
    the ``fool`` result also appears in the first element of the ``bilstm``
    result, the set of shared entities is printed as well.

    Each model is run exactly once and its result reused (the previous
    version ran each model three times on the same text).

    NOTE(review): this relies on a module-level ``bilstm`` object; the
    visible ``__main__`` block has its creation commented out, so calling
    this function as-is raises ``NameError`` unless ``bilstm`` is provided
    elsewhere (e.g. by ``from bi_lstm_crf import *``) -- confirm.

    :param text: the text handed to both ``ner`` calls.
    """
    ner_fool = fool.ner(text)
    print(ner_fool)

    ner_selffool = bilstm.ner(text)
    print(ner_selffool)

    # Entities must be hashable (the original code also placed them in sets).
    entities_fool = ner_fool[0]
    shared = set(entities_fool) & set(ner_selffool[0])

    # Same condition as before: the overlap is as large as fool's entity list.
    if len(shared) == len(entities_fool):
        print(shared)
def dealNotFoundEntity():
    """Re-run both NER models over names that could not be split into entities.

    Reads the ``name`` column of the input workbook, runs ``fool.ner`` and
    ``bilstm.ner`` on every non-missing name, joins the fourth field of each
    returned entity with ``"##"`` and writes the original columns plus the
    two new ones (``newname_fool``, ``newname_selffool``) to the output
    workbook. Missing names (rendered as ``"nan"``) yield empty strings in
    both new columns.
    """
    df = pd.read_excel("C:\\Users\\User\\Desktop\\无法分离实体名称.xlsx")

    fool_column = []
    selffool_column = []
    for position, raw_name in enumerate(df["name"], start=1):
        print(raw_name)
        if str(raw_name) == "nan":
            # Missing cell: keep the row aligned with empty results.
            fool_column.append("")
            selffool_column.append("")
            continue

        # Progress indicator: current row / total rows.
        print(position, len(df["name"]))

        joined_fool = "##".join(entity[3] for entity in fool.ner(raw_name)[0])
        joined_selffool = "##".join(entity[3] for entity in bilstm.ner(raw_name)[0])
        fool_column.append(joined_fool)
        selffool_column.append(joined_selffool)

    passthrough_columns = ["id", "area", "province", "city", "district", "name"]
    data = {column: df[column] for column in passthrough_columns}
    data["newname_fool"] = fool_column
    data["newname_selffool"] = selffool_column

    _df = pd.DataFrame(
        data,
        columns=passthrough_columns + ["newname_fool", "newname_selffool"],
    )
    _df.to_excel("C:\\Users\\User\\Desktop\\无法分离实体名称_deal.xls")
def nerEntity():
    """Split company names into those ``bilstm`` recognises as one entity and the rest.

    Every non-blank line of the input file is treated as one candidate name.
    Parentheses are normalised, and only names ending in ``公司`` are
    examined: when ``bilstm.ner`` returns exactly one entity whose fourth
    field equals the whole name, the name goes to the "found" file,
    otherwise to the "notfound" file. Names that do not end in ``公司`` are
    written to neither file.

    Fixes relative to the previous version:

    * ``re.sub("(", ...)`` / ``re.sub(")", ...)`` are invalid regular
      expressions and raise ``re.error``; plain ``str.replace`` is used.
    * Reading no longer stops at the first blank line; blank lines are
      skipped and the file is read to the end.

    NOTE(review): relies on a module-level ``bilstm`` object that the visible
    part of the file does not define -- confirm where it comes from.
    """
    source_path = "C:\\Users\\User\\Desktop\\select_company_name_from_bxkc_C_CONTACT_.tsv"
    found_path = "C:\\Users\\User\\Desktop\\company_found.tsv"
    notfound_path = "C:\\Users\\User\\Desktop\\company_notfound.tsv"

    with codecs.open(source_path, "r", encoding="utf8") as f, \
            codecs.open(found_path, "w", encoding="utf8") as f_found, \
            codecs.open(notfound_path, "w", encoding="utf8") as f_notfound:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # Replacing a parenthesis with itself would be a no-op, so the
            # intended mapping is presumably full-width -> ASCII parentheses
            # (U+FF08 / U+FF09) -- TODO confirm against the original file.
            entity = line.replace("\uFF08", "(").replace("\uFF09", ")")

            if not entity.endswith("公司"):
                continue

            _ner = bilstm.ner(entity)[0]
            # NOTE(review): the original indentation was lost; this assumes the
            # "notfound" branch pairs with the single-entity check below.
            if len(_ner) == 1 and _ner[0][3] == entity:
                f_found.write(entity + "\n")
            else:
                f_notfound.write(entity + "\n")
def cleanEntity(source_file="C:\\Users\\User\\Desktop\\notcleanedEntity.tsv",
                temp_file="C:\\Users\\User\\Desktop\\temp.tsv"):
    """Move entities containing ``br`` or ``/`` out of *source_file* into *temp_file*.

    Each line of *source_file* is stripped of double quotes and all
    whitespace. The resulting entities are de-duplicated and split in two
    groups: those matching ``br|/`` are written to *temp_file*, all others
    are written back to *source_file* (which is overwritten). Both files are
    sorted by entity length, ties broken alphabetically so the output is
    deterministic.

    Fixes relative to the previous version:

    * Reading stopped at the first blank line, so every entity after it was
      silently lost when *source_file* was rewritten; the whole file is now
      read and blank lines are skipped.
    * A line consisting only of quotes produced an empty entity that was
      written back as a blank first line (shortest entity), which would have
      truncated the file on the next run; empty entities are now dropped.
    * The whitespace pattern is a raw string (``\\s`` in a normal string is
      an invalid escape sequence).
    * Two unused ``re.finditer`` results were removed.

    :param source_file: path of the entity list to clean; rewritten in place.
    :param temp_file: path receiving the entities that matched ``br|/``.
    """
    set_cleanedEntity = set()
    set_notcleanedEntity = set()

    with codecs.open(source_file, "r", encoding="utf8") as f_nce:
        for raw_line in f_nce:
            # Removing every whitespace character also covers the strip().
            entity = re.sub(r'["\s]', "", raw_line)
            if not entity:
                continue
            if re.search("br|/", entity) is not None:
                set_cleanedEntity.add(entity)
            else:
                set_notcleanedEntity.add(entity)

    def _by_length(entity):
        # Shortest first; alphabetical among equal lengths for stable output.
        return (len(entity), entity)

    list_cleanedEntity = sorted(set_cleanedEntity, key=_by_length)
    list_notcleanedEntity = sorted(set_notcleanedEntity, key=_by_length)

    # The source file is only reopened for writing after it was fully read.
    with codecs.open(temp_file, "w", encoding="utf8") as f_ce, \
            codecs.open(source_file, "w", encoding="utf8") as f_nce:
        for item in list_cleanedEntity:
            f_ce.write(item + "\n")
        for item in list_notcleanedEntity:
            f_nce.write(item + "\n")
if __name__ == "__main__":
    # Script entry point: restore the BERT-CRF model and print the entities it
    # finds in one sample sentence.
    #
    # Disabled alternative (restore a BiLSTM checkpoint instead):
    #     path_add = "0-12/"
    #     path = 'model/'+path_add+'model.ckpt'
    #     bilstm = BiLSTM().restore(path)
    bertCrf = BertCRF().restore()
    text = '小册子一批采购计划一、采购人:广州市比地数据科技有限公司,二、采购项目编号:'
    print(bertCrf.ner(text))

    # Disabled batch jobs:
    #     dealNotFoundEntity()
    #     cleanEntity()
|