# luojiehua / BIDI_ML_INFO_EXTRACTION
							#coding:UTF8
'''
Created on 2019年5月21日

@author: User
'''
import re
import os
import time
import pandas as pd
_time = time.time()
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
import json
from BiddingKG.dl.common.constDict import ConstDict

# Module-level cache: entity text -> (have_bus, dic) as returned by
# get_business_data(), so the same name is not looked up again.
# NOTE(review): nothing in the visible code ever evicts entries - confirm the
# growth is acceptable for long-running processes.
business_dic = {}

def edit_distance(source,target):
    """Return the edit distance between ``source`` and ``target``.

    Insertions and deletions cost 1 and a substitution costs 2, so the result
    is the number of single-character insert/delete steps needed.

    :param source: first sequence (columns of the table)
    :param target: second sequence (rows of the table)
    :return: integer distance
    """
    width = len(source) + 1
    height = len(target) + 1
    # Row 0: turning an empty target prefix into source[:col] takes col steps.
    table = [list(range(width))]
    for row in range(1, height):
        above = table[-1]
        current = [row]
        for col in range(1, width):
            substitution = 0 if source[col - 1] == target[row - 1] else 2
            current.append(min(above[col] + 1,
                               current[col - 1] + 1,
                               above[col - 1] + substitution))
        table.append(current)
    return table[-1][-1]
    
def jaccard_score(source,target):
    """Return the overlap ratio between the element sets of two sequences.

    The number of shared elements is divided by the size of each set and the
    larger of the two ratios is returned; 0 is returned when either input has
    no elements.
    """
    left = set(source)
    right = set(target)
    if not left or not right:
        return 0
    shared = len(left & right)
    return max(shared / len(left), shared / len(right))


def get_place_list():
    """Load the place names used when comparing company names.

    Reads ``place_info.csv`` from the parent directory of this module, takes
    the values of its second column and adds 台湾, 澳门 and 香港.

    :return: list of unique place names; the order is not defined because the
        names are de-duplicated through a set
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, '..', 'place_info.csv')
    place_df = pd.read_csv(path)

    # Select the second column by position. The former ``row[1]`` inside
    # ``iterrows()`` relied on integer keys being treated as positions on a
    # label-indexed Series, which pandas has deprecated.
    place_names = set(place_df.iloc[:, 1].tolist())
    place_names.update(('台湾', '澳门', '香港'))
    # Other candidates (東莞, 廣州, 韩国, 德国, 英国, 日本, 意大利, 新加坡, 加拿大,
    # 西班牙, 澳大利亚, 美国) were listed in the source but are disabled.
    return list(place_names)


# Loaded once at import time; get_place_list() reads place_info.csv from disk.
place_list = get_place_list()
# Regex alternation of every place name. In the visible part of this file it is
# only referenced from commented-out code.
# NOTE(review): the names are joined without re.escape - confirm none of them
# contains regex metacharacters before re-enabling its use.
place_pattern = "|".join(place_list)

def is_short(shorter_cut, longer):
    '''
    Check whether ``shorter_cut`` is an abbreviation of ``longer``.

    Every item of ``shorter_cut`` (each character when it is a string) must
    occur in ``longer`` in the same order; the search for the next item
    resumes right after the previous hit.

    :param shorter_cut: abbreviation candidate
    :param longer: full name
    :return: 1 when all items are found in order, otherwise 0
    '''
    remainder = longer
    for piece in shorter_cut:
        hit = remainder.find(piece)
        if hit < 0:
            return 0
        remainder = remainder[hit + len(piece):]
    return 1

def get_business_data(enterprise_name):
    '''
    Look up whether a company name has business-registration data.

    When ENTERPRISE_HUGE is set, the name is fetched from Redis and the stored
    value is parsed as JSON if it contains the text 'have_business'; otherwise
    membership in SET_ENTERPRISE is used.

    :param enterprise_name: company name used as the lookup key
    :return: ``(True, record)`` when the stored record has
        ``have_business == 1``; ``(False, record)`` when the record exists but
        the flag is not 1; ``(False, {})`` when nothing usable is found or the
        lookup fails. In the non-Redis mode the record is always ``{}``.
    '''
    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
    if not ENTERPRISE_HUGE:
        return enterprise_name in SET_ENTERPRISE, {}

    if POOL_REDIS is None:
        init_redis_pool()
    _db = POOL_REDIS.getConnector()
    try:
        _v = _db.get(enterprise_name)
        if _v is None:
            return False, {}
        _v = str(_v, 'utf-8')
        if 'have_business' not in _v:
            return False, {}
        d = json.loads(_v)
        return d.get('have_business', '') == 1, d
    except Exception:
        # Best-effort lookup: report the failure and treat the name as unknown.
        traceback.print_exc()
        return False, {}
    finally:
        # Always hand the connector back; previously it was lost from the pool
        # whenever the Redis call itself raised.
        POOL_REDIS.putConnector(_db)

def get_role(dic):
    '''
    Pick the dominant announcement role of a company from its counters.

    Reads 'zhao_biao_number', 'dai_li_number' and 'zhong_biao_number' and, when
    their sum exceeds 100, returns the index of the largest counter (0, 1 or 2
    in that order; on ties the earlier one wins) with its share of the sum.

    :param dic: business-data dictionary of the entity
    :return: ``(role, ratio)``; ``(5, 0)`` when 'zhao_biao_number' is missing
        or the sum is not greater than 100
    '''
    if 'zhao_biao_number' not in dic:
        return 5, 0
    counters = [dic.get(key, 0) for key in
                ('zhao_biao_number', 'dai_li_number', 'zhong_biao_number')]
    total = sum(counters)
    if total <= 100:  # only totals above 100 are considered
        return 5, 0
    largest = max(counters)
    # list.index returns the first position holding the maximum, which gives
    # the earlier role precedence on ties.
    return counters.index(largest), largest / total

def link_entitys(list_entitys,on_value=1):#on_value=0.81
    '''
    Normalise and relabel org/company entities in place, using business data.

    For every document (one entity list per document) the function:
      1. strips a trailing tendering/purchasing office suffix from names that
         end in 医院/大学/公司 + such a suffix;
      2. splits org/company entities into those with business data
         (``long_entity``) and those without (``short_entity``), caching the
         lookups in the module-level ``business_dic``;
      3. adjusts ``label`` / ``values`` when the role statistics from
         get_role() are confident and the in-text probability is low;
      4. promotes a single high-confidence statistical tenderee to label 0
         when no entity in the document has label 0;
      5. replaces tenderee short names by a matching name with business data;
      6. replaces names by the longest dictionary-matched linked entity that
         mentions the same places.

    :param list_entitys: list of per-document entity lists; entities are
        modified in place
    :param on_value: only referenced by the commented-out linking code below
    :return: None
    '''
    for list_entity in list_entitys:
        range_entity = []
        short_entity = []  # entities without business data
        long_entity = []  # entities with business data
        n = 0
        bus_dic = {} # entity text -> (role, probability) from get_role() for entities already found to have business data
        find_tenderee = False
        bus_tenderee = []
        for _entity in list_entity:
            if _entity.entity_type in ["org","company"]:
                ser = re.search('(?P<name>.{2,}(医院|大学|公司))(招[投议]?标|采购)(中心|办公室)$', _entity.entity_text) # 2024-06-07 normalise the organisation name by dropping the unnecessary office suffix
                if ser:
                    _entity.entity_text = ser.group('name')
                range_entity.append(_entity)
                if _entity.entity_text in bus_dic:
                    have_bus = True
                else:
                    if _entity.entity_text not in business_dic:
                        have_bus, dic = get_business_data(_entity.entity_text)
                        business_dic[_entity.entity_text] = (have_bus, dic)
                    else:
                        have_bus, dic = business_dic.get(_entity.entity_text)  # 20240708 reuse the cached business data to avoid querying redis again
                    # Generic branch/short names are treated as having no business data.
                    if re.search('^\w{,5}[分支](行|公司)$|^\w{1,3}公司$|^\w{2,5}段$', _entity.entity_text):
                        have_bus = False
                    if have_bus:
                        lb, prob = get_role(dic)
                        bus_dic[_entity.entity_text] = (lb, prob)
                        # Remember institutions whose statistics say "tenderee" (role 0) with probability above 0.9.
                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
                            bus_tenderee.append(_entity)
                    elif re.search('^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text): # 2024/05/22 some bank sub-branches have no collected business data; accept them anyway
                        have_bus = True
                        bus_dic[_entity.entity_text] = (0, 0.5)
                if have_bus: # 20231115 only check whether business data exists; entities without it are candidates for replacement
                    long_entity.append(_entity)
                    # Very short names (except universities/hospitals) stay candidates for replacement too.
                    if len(_entity.entity_text)< 6 and re.search('(大学|医院)', _entity.entity_text) == None:
                        short_entity.append(_entity)
                    lb, prob = bus_dic[_entity.entity_text]
                    if lb in [0,1] and prob>0.9 and _entity.label in [0, 1] and _entity.values[_entity.label]<0.55: # statistics are confident but the in-text probability is low: use the statistical role; mainly for titles/publishers where tenderee vs. agency is unclear
                        if _entity.label != lb:
                            _entity.label = lb
                            _entity.values[_entity.label] = 0.55
                        else:
                            _entity.values[_entity.label] += 0.05
                else:
                    short_entity.append(_entity)
                if _entity.label == 0:  # a tenderee was found
                    find_tenderee = True
                n += 1
                # Stop after 1001 org/company entities have been processed.
                if n > 1000:
                    break
        if find_tenderee == False and len(bus_tenderee)==1 and bus_tenderee[0].label==5:  # no tenderee in the whole document and exactly one high-probability statistical tenderee: use it as the tenderee
            bus_tenderee[0].label = 0
            bus_tenderee[0].values[0] = 0.55

        range_entity = range_entity[:1000]
        # The company-replacement logic below is faulty, so it is disabled for now
        # for first_i in range(len(range_entity)):
        #     _entity = range_entity[first_i]
        #     for second_i in range(first_i+1,len(range_entity)):
        #         _ent = range_entity[second_i]
        #         # 2021/5/21 update: skip when the two labels are mutually exclusive (one tenderee, one agency) and the texts differ
        #         if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
        #             continue
        #         _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
        #         if _entity.entity_text!=_ent.entity_text and _score>=on_value:
        #             _entity.linked_entitys.append(_ent)
        #             _ent.linked_entitys.append(_entity)
        #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
        # # replace the company name
        # for _entity in range_entity:
        #     if re.search("公司",_entity.entity_text) is None:
        #         for _ent in _entity.linked_entitys:
        #             if re.search("公司$",_ent.entity_text) is not None:
        #                 if len(_ent.entity_text)>len(_entity.entity_text):
        #                     _entity.entity_text = _ent.entity_text

        # Replace tenderee (label 0) short names by a name that has business data.
        if short_entity and long_entity:  #
            for first_i in range(len(short_entity)):
                _entity = short_entity[first_i]
                if _entity.label == 0:
                    for second_i in range(len(long_entity)):
                        _ent = long_entity[second_i]
                        if _ent.label in [0,1,5]:
                            if len(_entity.entity_text)<len(_ent.entity_text) and is_short(_entity.entity_text, _ent.entity_text):  # the short name's characters appear in order inside the registered name: replace
                                _entity.entity_text = _ent.entity_text
                                lb, prob = bus_dic[_entity.entity_text]
                                if lb in [0, 1] and prob > 0.9 and _entity.values[
                                    _entity.label] < 0.55:  # statistics are confident but the in-text probability is low: use the statistical role; mainly for titles/publishers where tenderee vs. agency is unclear
                                    if _entity.label != lb:
                                        _entity.label = lb
                                        _entity.values[_entity.label] = 0.55
                                    else:
                                        _entity.values[_entity.label] += 0.05
                                break
                            elif len(_entity.entity_text)>len(_ent.entity_text) and _ent.entity_text in _entity.entity_text \
                                    and re.search('(医院|大学)$', _ent.entity_text) and re.search('[部处室科]$', _entity.entity_text):  # the unregistered name fully contains the registered name: replace; 20240520 restricted to names ending in a department suffix to avoid mistakes
                                _entity.entity_text = _ent.entity_text
                                lb, prob = bus_dic[_entity.entity_text]
                                if lb in [0, 1] and prob > 0.9 and _entity.values[
                                    _entity.label] < 0.55:  # statistics are confident but the in-text probability is low: use the statistical role; mainly for titles/publishers where tenderee vs. agency is unclear
                                    if _entity.label != lb:
                                        _entity.label = lb
                                        _entity.values[_entity.label] = 0.55
                                    else:
                                        _entity.values[_entity.label] += 0.05
                                break

        # 2021/12/21 replace with the longest similar entity that was recognised via the dictionary
        for _entity in range_entity:
            used_linked_entitys = []
            if not _entity.linked_entitys:
                continue
            # Longest linked name first.
            _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
            for _ent in _entity.linked_entitys:
                if _ent in used_linked_entitys:
                    break
                # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
                if _ent.if_dict_match == 1:
                    if len(_ent.entity_text) > len(_entity.entity_text):
                        # check that both company names mention the same places
                        match_list_1, match_list_2 = [], []
                        for place in place_list:
                            if place in _entity.entity_text:
                                match_list_1.append(place)
                            if place in _ent.entity_text:
                                match_list_2.append(place)

                        if str(match_list_1) == str(match_list_2):
                            # print("字典替换", _entity.entity_text, "->", _ent.entity_text)
                            # Keep the original text before overwriting it.
                            _entity.origin_entity_text = _entity.entity_text
                            _entity.entity_text = _ent.entity_text
                            used_linked_entitys.append(_ent)
                            # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
# Title used for de-duplication
def doctitle_refine(doctitle):
    """Return ``doctitle`` with common announcement boilerplate words removed."""
    boilerplate = (r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
                   r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同')
    return re.sub(boilerplate, '', doctitle)
# First 100 company entities
def get_nlp_enterprise(list_entity):
    """Collect the distinct org/company names of a document.

    Entities are visited in (sentence_index, begin_index) order. Business data
    is looked up through the module-level ``business_dic`` cache, falling back
    to get_business_data().

    :param list_entity: entities of one document
    :return: ``(names_in_text, names_in_attachment, dict_enterprise)`` where
        the two lists hold distinct names in first-seen order, truncated to
        100 items each, and ``dict_enterprise`` maps a name to
        ``{'in_text': 1|0|2}`` (1 = body only, 0 = attachment only, 2 = both)
        plus ``'credit_code'`` when the business data provides a non-empty one
    """
    max_num = 100
    nlp_enterprise = []
    nlp_enterprise_attachment = []
    # Companion sets give O(1) duplicate checks; the lists keep first-seen order.
    seen_in_text = set()
    seen_in_attachment = set()
    dict_enterprise = {}
    for entity in sorted(list_entity, key=lambda x: (x.sentence_index, x.begin_index)):
        if entity.entity_type not in ['org','company']:
            continue
        name = entity.entity_text
        in_text = 0 if entity.in_attachment else 1
        if name not in dict_enterprise:
            if name not in business_dic:
                have_bus, dic = get_business_data(name)
                business_dic[name] = (have_bus, dic)
            else:
                have_bus, dic = business_dic.get(name)  # 20240708 reuse cached business data instead of querying redis again
            credit_code = dic.get('credit_code', '')
            # Labelled entities (0-4) are always kept; others only while the
            # dictionary is small.
            # NOTE(review): "<=" admits a 101st unlabelled name - confirm intended.
            if entity.label in [0,1,2,3,4] or len(dict_enterprise)<=max_num:
                dict_enterprise[name] = {'in_text': in_text}
                if credit_code != "":
                    dict_enterprise[name]['credit_code'] = credit_code
        elif in_text != dict_enterprise[name]['in_text']:
            # Seen in both the body and an attachment.
            dict_enterprise[name]['in_text'] = 2

        if not entity.in_attachment:
            if name not in seen_in_text:
                seen_in_text.add(name)
                nlp_enterprise.append(name)
        elif name not in seen_in_attachment:
            seen_in_attachment.add(name)
            nlp_enterprise_attachment.append(name)
    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise

# None until getEnterprisePath() runs; it then sets True when the huge
# enterprise file is found and False otherwise. When truthy, lookups in this
# module go through Redis instead of SET_ENTERPRISE.
ENTERPRISE_HUGE = None

def getEnterprisePath():
    """Locate the enterprise dictionary file and record which variant is used.

    The huge file is preferred: first via getFileFromSysPath(), then in the
    current working directory. Otherwise the regular file is located via
    getFileFromSysPath(), falling back to its bare file name.

    :return: ``(path, is_huge)``; the module-level ENTERPRISE_HUGE is set to
        the same ``is_huge`` value
    """
    global ENTERPRISE_HUGE
    huge_name = "LEGAL_ENTERPRISE_HUGE.txt"
    chosen = getFileFromSysPath(huge_name)
    if chosen is None and os.path.exists(huge_name):
        chosen = huge_name
    if chosen is not None:
        log("enterprise path:%s"%(chosen))
        ENTERPRISE_HUGE = True
        return chosen,ENTERPRISE_HUGE

    regular_name = "LEGAL_ENTERPRISE.txt"
    located = getFileFromSysPath(regular_name)
    real_path = regular_name if located is None else located
    log("ENTERPRISE path:%s"%(real_path))
    ENTERPRISE_HUGE = False
    return real_path,ENTERPRISE_HUGE


# Set to True at the end of getDict_enterprise();
# match_enterprise_max_first() waits until it is set.
DICT_ENTERPRISE_DONE = False

# Redis connector pool, created on first use by init_redis_pool().
POOL_REDIS = None

# Number of leading characters of a name used as the prefix key.
ENTERPRISE_KEY_LEN = 3

# NOTE(review): ENTERPRISE_PREFIX_LEN is not referenced in the visible part of
# this file (prefix slicing uses ENTERPRISE_KEY_LEN) - confirm whether it is
# still needed.
ENTERPRISE_PREFIX_LEN = 3
# Number of trailing characters of a name stored in SET_TAIL_ENTERPRISE.
ENTERPRISE_TAIL_LEN = 3

# Full names (filled only when the regular, non-huge file is used), name
# prefixes and name tails, populated by getDict_enterprise().
SET_ENTERPRISE = set()
SET_PREFIX_ENTERPRISE = set()
SET_TAIL_ENTERPRISE = set()
# Cache files for the prefix/tail sets, used only with the huge file.
SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"

def getDict_enterprise():
    """Populate the module-level enterprise prefix/tail/name sets.

    With the huge enterprise file, the prefix and tail sets are loaded from
    their cache files when both exist; otherwise the enterprise file is
    scanned and, for the huge file, the resulting sets are saved to the cache
    files. Names shorter than 4 characters are ignored. Full names are stored
    in SET_ENTERPRISE only for the regular (non-huge) file.

    Sets DICT_ENTERPRISE_DONE to True when finished.
    NOTE(review): if loading raises, the flag is never set - confirm how
    callers that wait on it are expected to behave.
    """
    global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
    real_path,is_huge = getEnterprisePath()
    cache_available = (is_huge
                       and os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE)
                       and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE))
    if cache_available:
        SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
        SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
    else:
        with open(real_path,"r",encoding="UTF8") as f:
            for raw_line in f:
                name = raw_line.strip()
                if len(name) < 4:
                    continue
                SET_PREFIX_ENTERPRISE.add(name[:ENTERPRISE_KEY_LEN])
                SET_TAIL_ENTERPRISE.add(name[-ENTERPRISE_TAIL_LEN:])
                if not is_huge:
                    SET_ENTERPRISE.add(name)
        # The cache files are only written for the huge file.
        if is_huge:
            save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
            save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)

    for label, container in (("SET_PREFIX_ENTERPRISE", SET_PREFIX_ENTERPRISE),
                             ("SET_TAIL_ENTERPRISE", SET_TAIL_ENTERPRISE),
                             ("SET_ENTERPRISE", SET_ENTERPRISE)):
        log("%s takes memory:%.2fM size:%d"%(label,sys.getsizeof(container)/1024/1024,len(container)))

    DICT_ENTERPRISE_DONE = True


def init_redis_pool():
    """Create the module-level Redis connector pool if it does not exist yet.

    The pool is a ConnectorPool built with ``init_num=1``, ``max_num=10`` and
    getConnect_redis_baseline as the connection factory.
    """
    from BiddingKG.dl.common.pool import ConnectorPool
    from BiddingKG.dl.common.source import getConnect_redis_baseline
    global POOL_REDIS
    if POOL_REDIS is not None:
        return
    POOL_REDIS = ConnectorPool(init_num=1,max_num=10,method_init=getConnect_redis_baseline)

# 插入 Redis
# def add_redis(company_list):
#     global ENTERPRISE_HUGE,POOL_REDIS
#     if ENTERPRISE_HUGE:
#         _db = POOL_REDIS.getConnector()
#         for enterprise_name in company_list:
#             _v = _db.get(enterprise_name)
#             if _v is None:
#                 if isLegalNewName(enterprise_name):
#                     _db.set(enterprise_name,1)
# Validity check for a newly found entity name
def isLegalNewName(enterprise_name):
    """Classify a candidate enterprise name with rule-based checks.

    :param enterprise_name: candidate name
    :return: -1 when any rejection rule matches, 0 when the name contains a
        package/lot/batch marker (e.g. ``A1包``, ``第一批``), 1 otherwise
    """
    # Rejected beginnings: an ASCII letter/digit followed by something else, a
    # leading character that is not alphanumeric, Chinese or an opening
    # bracket, a bracket pair enclosing at most one character, a leading
    # zero, or a leading year/month/day.
    if re.search(r"^[\da-zA-Z][^\da-zA-Z]|"
                 r"^[^\da-zA-Z\u4e00-\u9fa5\[【(（]|"
                 r"^[\[【(（].{,1}[\]】)）]|"
                 r"^[0〇]|"
                 r"^(20[0-2][0-9]|[0-2]?[0-9]年|[0-1]?[0-9]月|[0-3]?[0-9]日)",enterprise_name):
        return -1
    # At least two Chinese characters are required.
    if len(re.findall(r"[\u4e00-\u9fa5]",enterprise_name))<2:
        return -1
    # Masked / placeholder names.
    if re.search(r"╳|＊|\*|×|xx|XX",enterprise_name):
        return -1
    # Names starting with a bare administrative unit, except the listed place
    # names that legitimately begin with such a character.
    if re.search(r"^(省|自治[县州区]|市|县|区|镇|乡|街道)",enterprise_name) and not re.search(r"^(镇江|乡宁|镇原|镇海|镇安|镇巴|镇坪|镇赉|镇康|镇沅|镇雄|镇远|镇宁|乡城|镇平|市中|市南|市北)",enterprise_name):
        return -1
    # Clock times, file extensions and HTML residue.
    if re.search(r"\d{1,2}:\d{2}(:\d{2})?|(rar|xlsx|zip|png|jpg|swf|docx|txt|pdf|PDF|doc|xls|bmp|&?nbsp)",enterprise_name):
        return -1
    # Role words and ranking phrases.
    if re.search(r"(招标|代理)(人|机构)|联系(人|方式)|中标|候选|第.名",enterprise_name):
        return -1
    # The subject string was missing from this call, so it raised TypeError
    # for every name that reached this point.
    if re.search(r"[a-zA-Z\d]{1,2}(包|标段?)|第.批",enterprise_name):
        return 0
    return 1

# Filter out wrong entities whose value in Redis is 0
def enterprise_filter(entity_list):
    """Remove org/company entities whose Redis value is 0, in place.

    Only active when ENTERPRISE_HUGE is set; otherwise the list is returned
    untouched. Lookup failures are reported and the entities flagged so far
    are still removed.

    :param entity_list: entities of one document (modified in place)
    :return: the same ``entity_list`` object
    """
    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
    if not ENTERPRISE_HUGE:
        return entity_list

    if POOL_REDIS is None:
        init_redis_pool()
    _db = POOL_REDIS.getConnector()
    remove_list = []
    try:
        for entity in entity_list:
            if entity.entity_type in ['company','org']:
                _v = _db.get(entity.entity_text)
                # get_business_data decodes Redis values from bytes, so the
                # former ``_v == 0`` comparison could never match them. Decode
                # first and accept both the integer and its textual form.
                if isinstance(_v, bytes):
                    _v = _v.decode('utf-8', 'ignore')
                if _v == 0 or _v == '0':
                    remove_list.append(entity)
    except Exception:
        traceback.print_exc()
    finally:
        POOL_REDIS.putConnector(_db)
    for _entity in remove_list:
        entity_list.remove(_entity)

    return entity_list


def is_enterprise_exist(enterprise_name):
    """Tell whether a company name is known.

    When ENTERPRISE_HUGE is set, the name is looked up in Redis and counts as
    known when a truthy value is stored for it; otherwise membership in
    SET_ENTERPRISE is used.

    :param enterprise_name: company name to look up
    :return: True when the name is known, False otherwise (including when the
        Redis lookup fails)
    """
    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
    if not ENTERPRISE_HUGE:
        return enterprise_name in SET_ENTERPRISE

    if POOL_REDIS is None:
        init_redis_pool()
    _db = POOL_REDIS.getConnector()
    try:
        return bool(_db.get(enterprise_name))
    except Exception:
        # Best-effort lookup: report the failure and treat the name as unknown.
        traceback.print_exc()
        return False
    finally:
        # Always hand the connector back; previously it was lost from the pool
        # whenever the Redis call itself raised.
        POOL_REDIS.putConnector(_db)


import threading
import time

# Import-time side effect: build the enterprise dictionary sets in a
# background thread. match_enterprise_max_first() waits for
# DICT_ENTERPRISE_DONE, which getDict_enterprise() sets when it finishes.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()


# Used by match_enterprise_max_first() to bound the candidate window: the
# longest candidate tried is MAX_ENTERPRISE_LEN - ENTERPRISE_KEY_LEN + 1
# characters.
MAX_ENTERPRISE_LEN = 30

def match_enterprise_max_first(sentence):
    """Find enterprise names in ``sentence``, longest candidate first.

    Blocks (polling once per second) until DICT_ENTERPRISE_DONE is set. Then
    every position whose next ENTERPRISE_KEY_LEN characters are a known prefix
    is expanded to candidate substrings from longest to shortest; the first
    candidate that passes the filters and has business data is recorded and
    scanning resumes after it.

    :param sentence: text to scan; sentences of 4 characters or fewer yield
        no matches
    :return: list of dicts with keys "entity_text", "begin_index" and
        "end_index" (end exclusive)
    """
    while not DICT_ENTERPRISE_DONE:
        time.sleep(1)

    matches = []
    total = len(sentence)
    if total <= 4:
        return matches

    cursor = 0
    while cursor + ENTERPRISE_KEY_LEN < total:
        if sentence[cursor:cursor + ENTERPRISE_KEY_LEN] in SET_PREFIX_ENTERPRISE:
            window = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1, total - cursor)
            # Shrink the candidate one character at a time from the right.
            for end in range(cursor + window, cursor, -1):
                candidate = sentence[cursor:end]
                tail = candidate[-ENTERPRISE_TAIL_LEN:]
                # 20240111: skip candidates whose tail has no Chinese character
                if re.search('[\u4e00-\u9fa5]', tail) is None:
                    continue
                # Known false positives.
                if candidate in ['黄埔军校',  '五金建材', '铝合金门窗', '测试单位' ,'生产管理部', '华电XXX发电有限公司']:
                    continue
                # Generic company names, buildings and masked names.
                if re.search('^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$|交汇处$|大厦$|大楼$|^华电X{1,4}发电有限公司$', candidate):
                    continue
                # 20240521: names shorter than 4 characters are not accepted,
                # and every shorter candidate would be shorter still.
                if len(candidate) < 4:
                    break
                if not (tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|监狱|大队|支队|林场|海关|分局|商行)$', tail)):
                    continue
                # 20240708: reuse cached business data instead of querying redis again
                if candidate in business_dic:
                    have_bus, dic = business_dic.get(candidate)
                else:
                    have_bus, dic = get_business_data(candidate)
                    business_dic[candidate] = (have_bus, dic)
                # 20210124: only names with business data are accepted
                if not have_bus:
                    continue
                matches.append({"entity_text":"%s"%(candidate),"begin_index":cursor,"end_index":cursor+len(candidate)})
                # Jump to the last character of the match; the increment below
                # moves past it.
                cursor += len(candidate)-1
                break
        cursor += 1
    return matches

def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
            if len(range_entity)>1000:
                break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
            list_match = match_enterprise_max_first(sentence)
            # print("list_match", list_match)

            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens

            list_match.sort(key=lambda x:x["begin_index"])


            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue

                    if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1

                    if p_entity.entity_type not in ["location","org","company"]:
                        continue

                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1

                    #有重叠
                    #match部分被包含则不处理
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        find_flag = True
                        # 判断是否是多个公司
                        if re.search('[分支](公司|中心|监狱|部|行)|^\w{4,15}公司\w{2,3}公司$'
                                     '|(大学|学院)\w{,2}附属\w{,6}医院$|(\w{2,5}办事处\w{2,6}$|^\w{2,6}银行\w{2,10}[分支]行$'
                                     '|\w{2,4}[省市县]\w{2,14}村)(股份)?经济(合作|联合)社$|国家税务总局\w{2,10}税务局$',
                                     p_entity.entity_text):
                            continue
                        if p_entity.entity_type == "location" and re.search('\d[楼室号]', p_entity.entity_text):  # 明确地址不进行替换避免 类似 434052508 西宁市城西区西关大街128号山东大厦15楼1152室 更新为 西宁市城西
                            continue
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j>_match_index:

                            match_replace = True
                            match_add = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            # 该公司实体是字典识别的
                            p_entity.if_dict_match = 1

                            for _match_h in range(_match_index+1,_match_j+1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"

                                begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"],in_attachment=p_sentence.in_attachment)
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)

                                range_entity.append(add_entity)

                                list_calibrate.append({"type":"add","from":"","to":entity_text})
                            _match_index = _match_j
                            break

                        continue
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True

                        if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                            if p_entity.entity_type in ("org","company"):
                                _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
                                if re.search("分",_diff_text) is not None:
                                    pass
                                else:
                                    match_replace = True
                                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                    p_entity.entity_text = _match["entity_text"]
                                    p_entity.wordOffset_begin = _match["begin_index"]
                                    p_entity.wordOffset_end = _match["end_index"]
                                    p_entity.begin_index = begin_index
                                    p_entity.end_index = end_index
                                    p_entity.if_dict_match = 1
                        elif _match["end_index"]>=p_entity.wordOffset_end:
                            # 原entity列表已有实体，则不重复添加
                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
                                match_replace = True
                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                p_entity.entity_text = _match["entity_text"]
                                p_entity.wordOffset_begin = _match["begin_index"]
                                p_entity.wordOffset_end = _match["end_index"]
                                p_entity.begin_index = begin_index
                                p_entity.end_index = end_index
                                p_entity.entity_type = "company"
                                p_entity.if_dict_match = 1
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.if_dict_match = 1
                if not find_flag:
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"

                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)

                    list_calibrate.append({"type":"add","from":"","to":entity_text})

        #去重
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from","")
            _to = _calibrate.get("to","")
            _key = _from+_to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
            set_calibrate.add(_key)
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type

def isLegalEnterprise(name):
    """Return whether ``name`` is acceptable as an enterprise-name entry.

    The name is rejected (``False``) as soon as one of these rules matches:

    * it starts with one of the characters 省, 市, 区, 县;
    * the whole name consists of optional leading ``*`` characters, at most
      three arbitrary characters, and then one of the generic suffixes of the
      second pattern (分公司/分行/分支, 街道, 中心, 办事处, 经营部, 委员会, 有限公司);
    * it contains one of the label/role keywords of the third pattern
      (标段, 联系人, 采购人, ...).

    :param name: candidate enterprise name (str)
    :return: ``True`` when no rejection rule matches, otherwise ``False``
    """
    # Raw strings are required here: in a normal string literal "\*" is an
    # invalid escape sequence, which newer Python versions warn about.
    illegal_patterns = (
        r"^[省市区县]",
        r"^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",
        r"标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|（主）",
    )
    # any() short-circuits, so the patterns are tried in the same order as before.
    return not any(re.search(_pattern,name) is not None for _pattern in illegal_patterns)

def fix_LEGAL_ENTERPRISE():
    """Filter the enterprise dictionary file and write the result to ``enter.txt``.

    Every line of the file returned by ``getEnterprisePath()`` is stripped and
    kept when ``isLegalEnterprise`` accepts it.  A line is then dropped again
    (and printed) when it is one of a few generic fragments, or when it is a
    branch name of at most four characters plus 分公司/支行/分行 that does not
    mention one of the listed telecom/bank keywords.

    The surviving names are written one per line to ``enter.txt`` (relative to
    the current working directory), with ASCII parentheses replaced by their
    full-width counterparts.
    """
    generic_fragments = ("有限责任公司",'设计研究院','限责任公司')
    kept_names = set()
    source_paths = [getEnterprisePath()]

    for source_path in source_paths:
        with open(source_path,"r",encoding="utf8") as reader:
            for raw_line in reader:
                candidate = raw_line.strip()
                if isLegalEnterprise(candidate):
                    kept_names.add(candidate)

                # Decide whether this line must be excluded after all.
                should_drop = candidate in generic_fragments
                if not should_drop:
                    is_short_branch = re.search("^.{,4}(分公司|支行|分行)$",candidate) is not None
                    is_known_brand = re.search("电信|移动|联通|建行|工行|农行|中行|交行",candidate) is not None
                    should_drop = is_short_branch and not is_known_brand
                if should_drop:
                    print(candidate)
                    kept_names.discard(candidate)

    # Map ASCII parentheses to the full-width forms used in the output file.
    to_fullwidth = str.maketrans("()","（）")
    with open("enter.txt","w",encoding="utf8") as fwrite:
        for kept in kept_names:
            fwrite.write(kept.translate(to_fullwidth))
            fwrite.write("\n")
                # if re.search("标段|地址|标包|名称",line) is not None:#\(|\)||
                #     _count += 1
                #     print("=",line)
                #     print("%d/%d"%(_count,_sum))
    # a_list = []
    # with open("电信分公司.txt","r",encoding="utf8") as f:
    #     while True:
    #         _line = f.readline()
    #         if not _line:
    #             break
    #         if _line.strip()!="":
    #             a_list.append(_line.strip())
    # with open("enter.txt","a",encoding="utf8") as f:
    #     for _line in a_list:
    #         f.write(_line)
    #         f.write("\n")


if __name__=="__main__":
    # Manual smoke run when the module is executed directly.
    # The commented-out calls below are earlier ad-hoc experiments.
    # edit_distance("GUMBO","GAMBOL")
    # print(jaccard_score("周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目竞争性谈判公告","周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目-成交公告"))
    #
    # sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
    # print(match_enterprise_max_first(sentences))
    #
    # print("takes %d s"%(time.time()-_time))
    # fix_LEGAL_ENTERPRISE()
    # print(jaccard_score("吉林省九台","吉林省建苑设计集团有限公司"))
    sample_sentence = "中国南方航空股份有限公司黑龙江分公司"
    matched = match_enterprise_max_first(sample_sentence)
    print(matched)