luojiehua
/
iepy-develop


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
							'''
Created on 2019年5月21日

@author: User
'''

import re
import os

import time
_time = time.time()
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
import json

def edit_distance(source,target):
    dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
    for i in range(len(dp)):
        for j in range(len(dp[i])):
            if i==0:
                dp[i][j] = j
            elif j==0:
                dp[i][j] = i
            else:
                if source[j-1]==target[i-1]:
                    cost = 0
                else:
                    cost = 2
                dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
    return dp[-1][-1]
    
def jaccard_score(source,target):
    source_set = set([s for s in source])
    target_set = set([s for s in target])
    if len(source_set)==0 or len(target_set)==0:
        return 0
    return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))

def link_entitys(list_entitys,on_value=0.8):
    for list_entity in list_entitys:
        range_entity = []
        for _entity in list_entity:
            if _entity.entity_type in ["org","company"]:
                range_entity.append(_entity)
        range_entity = range_entity[:1000]
        for first_i in range(len(range_entity)):
            _entity = range_entity[first_i]
            for second_i in range(first_i+1,len(range_entity)):
                _ent = range_entity[second_i]
                _score = jaccard_score(_entity.entity_text, _ent.entity_text)
                if _entity.entity_text!=_ent.entity_text and _score>=on_value:
                    _entity.linked_entitys.append(_ent)
                    _ent.linked_entitys.append(_entity)
        #替换公司名称
        for _entity in range_entity:
            if re.search("公司",_entity.entity_text) is None:
                for _ent in _entity.linked_entitys:
                    if re.search("公司$",_ent.entity_text) is not None:
                        if len(_ent.entity_text)>len(_entity.entity_text):
                            _entity.entity_text = _ent.entity_text
 
DICT_ENTERPRISE = {}
DICT_ENTERPRISE_DONE = False
def getDict_enterprise():
    global DICT_ENTERPRISE,DICT_ENTERPRISE_DONE
    filename = os.path.dirname(__file__)+"/../LEGAL_ENTERPRISE.txt"
    filepath = os.path.dirname(__file__)+"/../"
    real_path = filename
    if os.path.exists(os.path.join(filepath,filename)):
        real_path = os.path.join(filepath,filename)
    with open(real_path,"r",encoding="UTF8") as f:
        for _e in f:
            if not _e:
                continue
            _e = _e.strip()
            if len(_e)>=4:
                key_enter = _e[:4]
                if key_enter not in DICT_ENTERPRISE:
                    DICT_ENTERPRISE[key_enter] = set()
                DICT_ENTERPRISE[key_enter].add(_e[4:])

    # for _e in ["河南省柘源","建筑工程有限公司"]:
    #     if not _e:
    #         continue
    #     _e = _e.strip()
    #     if len(_e)>=4:
    #         key_enter = _e[:4]
    #         if key_enter not in DICT_ENTERPRISE:
    #             DICT_ENTERPRISE[key_enter] = set()
    #         DICT_ENTERPRISE[key_enter].add(_e[4:])
    DICT_ENTERPRISE_DONE = True
    return DICT_ENTERPRISE

import threading
import time

load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()


MAX_ENTERPRISE_LEN = 30

def match_enterprise_max_first(sentence):
    while True:
        if not DICT_ENTERPRISE_DONE:
            time.sleep(1)
        else:
            break
    list_match = []
    begin_index = 0
    if len(sentence)>4:
        while True:
            if begin_index+4<len(sentence):
                key_enter = sentence[begin_index:begin_index+4]
                if key_enter in DICT_ENTERPRISE:
                    for _i in range(MAX_ENTERPRISE_LEN-4+1):
                        enter_name = sentence[begin_index+4:begin_index+MAX_ENTERPRISE_LEN-_i]
                        if enter_name in DICT_ENTERPRISE[key_enter]:
                            match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
                            list_match.append(match_item)
                            begin_index += (len(key_enter)+len(enter_name))-1
                            break
                begin_index += 1
            else:
                break
    return list_match

def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
            if len(range_entity)>1000:
                break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            list_match = match_enterprise_max_first(sentence)

            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens


            list_match.sort(key=lambda x:x["begin_index"])


            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue

                    if p_entity=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                    #有重叠
                    #match部分被包含则不处理
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        find_flag = True
                        #判断是否是多个公司
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j>_match_index:

                            match_replace = True
                            match_add = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index

                            for _match_h in range(_match_index+1,_match_j+1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"

                                begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
                                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
                                list_entity.append(add_entity)

                                range_entity.append(add_entity)

                                list_calibrate.append({"type":"add","from":"","to":entity_text})
                            _match_index = _match_j
                            break

                        continue
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                                _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
                                if re.search("分",_diff_text) is not None:
                                    pass
                                else:
                                    match_replace = True
                                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                                    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                    p_entity.entity_text = _match["entity_text"]
                                    p_entity.wordOffset_begin = _match["begin_index"]
                                    p_entity.wordOffset_end = _match["end_index"]
                                    p_entity.begin_index = begin_index
                                    p_entity.end_index = end_index
                            elif _match["end_index"]>=p_entity.wordOffset_end:
                                match_replace = True
                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                p_entity.entity_text = _match["entity_text"]
                                p_entity.wordOffset_begin = _match["begin_index"]
                                p_entity.wordOffset_end = _match["end_index"]
                                p_entity.begin_index = begin_index
                                p_entity.end_index = end_index
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                if not find_flag:
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"


                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)

                    list_calibrate.append({"type":"add","from":"","to":entity_text})

        #去重
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from","")
            _to = _calibrate.get("to","")
            _key = _from+_to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
            set_calibrate.add(_key)
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type


if __name__=="__main__":
    # edit_distance("GUMBO","GAMBOL")
    # print(jaccard_score("GUMBO","GAMBOL"))

    sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
    print(match_enterprise_max_first(sentences))

    print("takes %d s"%(time.time()-_time))