# Source repository: luojiehua/BIDI_ML_INFO_EXTRACTION
'''
Created on 2019-06-13

@author: User
'''

import fool
from bi_lstm_crf import *
import pandas as pd
import codecs
import re

''''''


def compare(text):
    '''
    @summary: Print the NER output of fool and of the module-level ``bilstm``
              model for the same text, plus their overlap when fool's
              result is fully covered.

    Each model is run exactly once and its result reused. The overlap is the
    set intersection of the first element of each result; it is printed only
    when its size equals the length of fool's first result list.

    @param text: input handed unchanged to ``fool.ner`` and ``bilstm.ner``.
    '''
    _ner_fool = fool.ner(text)
    print(_ner_fool)

    # NOTE(review): ``bilstm`` is read from module scope; nothing visible in
    # this file assigns it outside a disabled snippet - confirm it is set
    # before this function is called.
    _ner_selffool = bilstm.ner(text)
    print(_ner_selffool)

    _shared = set(_ner_fool[0]) & set(_ner_selffool[0])
    if len(_shared) == len(_ner_fool[0]):
        print(_shared)


def dealNotFoundEntity():
    '''
    @summary: Process names whose entities could not be separated.

    Reads the input workbook, runs ``fool.ner`` and the module-level
    ``bilstm.ner`` on every value of the "name" column, and writes a new
    workbook containing the original id/area/province/city/district/name
    columns plus two columns holding, for each model, element 3 of every
    recognised entity joined with "##" (presumably the entity text - confirm
    against the ``ner`` return format). Rows whose name is missing get empty
    strings in both new columns.
    '''
    df = pd.read_excel("C:\\Users\\User\\Desktop\\无法分离实体名称.xlsx")
    names = df["name"]
    total = len(names)
    list_newname_fool = []
    list_newname_selffool = []
    for count, _name in enumerate(names, start=1):
        print(_name)
        # pd.isna detects missing cells directly instead of comparing the
        # stringified value against "nan".
        if pd.isna(_name):
            list_newname_fool.append("")
            list_newname_selffool.append("")
            continue
        print(count, total)
        list_newname_fool.append(
            "##".join(_ner[3] for _ner in fool.ner(_name)[0]))
        list_newname_selffool.append(
            "##".join(_ner[3] for _ner in bilstm.ner(_name)[0]))
    data = {"id": df["id"],
            "area": df["area"],
            "province": df["province"],
            "city": df["city"],
            "district": df["district"],
            "name": df["name"],
            "newname_fool": list_newname_fool,
            "newname_selffool": list_newname_selffool}
    _df = pd.DataFrame(data, columns=["id", "area", "province", "city", "district", "name", "newname_fool",
                                      "newname_selffool"])
    # NOTE(review): writing the legacy .xls format depends on the installed
    # pandas/Excel engine supporting it - confirm for the target environment.
    _df.to_excel("C:\\Users\\User\\Desktop\\无法分离实体名称_deal.xls")


def nerEntity():
    '''
    @summary: Split company names by whether ``bilstm`` recognises the whole
              name as one entity.

    Every non-blank line of the input file is stripped and its full-width
    parentheses are replaced by ASCII ones. Names that do not end with "公司"
    are ignored. A name is written to the "found" file when the first result
    list of ``bilstm.ner`` holds exactly one entity whose element 3 equals
    the name; otherwise it is written to the "notfound" file.

    Blank lines are skipped rather than treated as end of input, so an empty
    line in the middle of the file no longer truncates processing.
    '''
    source_path = "C:\\Users\\User\\Desktop\\select_company_name_from_bxkc_C_CONTACT_.tsv"
    found_path = "C:\\Users\\User\\Desktop\\company_found.tsv"
    notfound_path = "C:\\Users\\User\\Desktop\\company_notfound.tsv"
    with codecs.open(source_path, "r", encoding="utf8") as f, \
            codecs.open(found_path, "w", encoding="utf8") as f_found, \
            codecs.open(notfound_path, "w", encoding="utf8") as f_notfound:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            entity = line.replace("（", "(").replace("）", ")")
            if not entity.endswith("公司"):
                continue
            _ner = bilstm.ner(entity)[0]
            if len(_ner) == 1 and _ner[0][3] == entity:
                f_found.write(entity + "\n")
            else:
                f_notfound.write(entity + "\n")


def cleanEntity():
    '''
    @summary: Move entities containing "br" or "/" out of the source file.

    Reads the source file, removes double quotes and all whitespace from each
    line, and de-duplicates the results. Entities matching "br" or "/" are
    written to the temp file; all others are written back to the source
    file, overwriting it. Both outputs are ordered by entity length.

    Lines that are empty after cleaning are skipped instead of ending the
    read: because the source file is rewritten from what was read, stopping
    at the first blank line would discard every entity after it.
    '''
    source_file = "C:\\Users\\User\\Desktop\\notcleanedEntity.tsv"
    temp_file = "C:\\Users\\User\\Desktop\\temp.tsv"

    set_cleanedEntity = set()
    set_notcleanedEntity = set()

    with codecs.open(source_file, "r", encoding="utf8") as f_nce:
        for raw_line in f_nce:
            # Stripping quotes and every whitespace character also removes
            # the line terminator, so no separate strip() is needed.
            entity = re.sub(r'["\s]', "", raw_line)
            if not entity:
                continue
            if re.search("br|/", entity) is not None:
                set_cleanedEntity.add(entity)
            else:
                set_notcleanedEntity.add(entity)

    list_cleanedEntity = sorted(set_cleanedEntity, key=len)
    list_notcleanedEntity = sorted(set_notcleanedEntity, key=len)

    with codecs.open(temp_file, "w", encoding="utf8") as f_ce, \
            codecs.open(source_file, "w", encoding="utf8") as f_nce:
        for item in list_cleanedEntity:
            f_ce.write(item + "\n")
        for item in list_notcleanedEntity:
            f_nce.write(item + "\n")


if __name__ == "__main__":
    # Script entry point: restore the BERT-CRF model and print its NER
    # output for one sample sentence.
    #
    # Disabled BiLSTM checkpoint loading, kept for reference:
    #     path_add = "0-12/"
    #     path = 'model/'+path_add+'model.ckpt'
    #     bilstm = BiLSTM().restore(path)
    bertCrf = BertCRF().restore()

    text = '小册子一批采购计划一、采购人：广州市比地数据科技有限公司，二、采购项目编号：'

    print(bertCrf.ner(text))

    # Disabled batch jobs:
    #     dealNotFoundEntity()
    #     cleanEntity()