123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450 |
- #coding:UTF8
- '''
- Created on 2019年5月21日
- @author: User
- '''
- import re
- import os
- import time
- import pandas as pd
- _time = time.time()
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.Entitys import *
- import json
def edit_distance(source, target):
    """Return a weighted edit distance between two sequences.

    Inserting or deleting one element costs 1 and substituting one element
    costs 2, so a substitution is never cheaper than a delete plus an insert.

    Args:
        source: First sequence (typically a string).
        target: Second sequence (typically a string).

    Returns:
        The minimal total cost of turning ``source`` into ``target``.
    """
    # Row 0 of the table: distance from the empty target prefix to every
    # prefix of ``source``.
    previous_row = list(range(len(source) + 1))
    for row_number, target_item in enumerate(target, start=1):
        # Column 0: distance from a target prefix to the empty source prefix.
        current_row = [row_number]
        for column, source_item in enumerate(source, start=1):
            substitution_cost = 0 if source_item == target_item else 2
            current_row.append(min(
                previous_row[column] + 1,
                current_row[column - 1] + 1,
                previous_row[column - 1] + substitution_cost,
            ))
        previous_row = current_row
    return previous_row[-1]
-
def jaccard_score(source, target):
    """Score the character overlap of two strings.

    Despite the name, the result is the size of the shared character set
    divided by the size of the *smaller* character set (the larger of the two
    possible ratios), not intersection over union.

    Args:
        source: First string.
        target: Second string.

    Returns:
        0 when either string has no characters, otherwise a float in (0, 1].
    """
    source_chars = set(source)
    target_chars = set(target)
    if not source_chars or not target_chars:
        return 0
    shared = len(source_chars & target_chars)
    ratio_to_source = shared / len(source_chars)
    ratio_to_target = shared / len(target_chars)
    return max(ratio_to_source, ratio_to_target)
def get_place_list(path=None):
    """Load the unique place names used to compare company locations.

    Args:
        path: CSV file to read. Defaults to ``place_info.csv`` located one
            directory above this module. The place names are taken from the
            second column of the file.

    Returns:
        A sorted list of unique place names, always including 台湾, 澳门 and
        香港 in addition to the names found in the CSV.
    """
    if path is None:
        path = os.path.dirname(__file__) + '/../place_info.csv'
    place_df = pd.read_csv(path)
    # Select the second column by position explicitly; integer keys on a
    # label-indexed row (``row[1]``) rely on a deprecated pandas fallback.
    places = set(place_df.iloc[:, 1].tolist())
    places.update(('台湾', '澳门', '香港'))
    # Sorting makes the result (and anything built from it) reproducible
    # across interpreter runs, unlike plain set iteration order.
    return sorted(places)
# Place names are loaded once at import time; place_pattern is the same names
# joined into a single "|"-separated alternation string.
place_list = get_place_list()
place_pattern = "|".join(place_list)
def link_entitys(list_entitys,on_value=1):#on_value=0.81
    """Replace short org/company names with longer linked names, in place.

    For each document's entity list, only entities of type "org" or "company"
    are considered (at most the first 1000 of them). Two passes are made over
    each entity's ``linked_entitys``:

    1. An entity whose text does not contain "公司" takes the text of a longer
       linked entity whose text ends with "公司".
    2. An entity takes the text of a longer linked entity that was recognised
       by the dictionary (``if_dict_match == 1``) when both texts contain the
       same place names; the previous text is kept in ``origin_entity_text``.

    Args:
        list_entitys: Iterable of per-document entity lists.
        on_value: Similarity threshold; only referenced by the disabled
            linking code below, so it currently has no effect.

    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting of the last statements against the original file.
    """
    for list_entity in list_entitys:
        range_entity = []
        for _entity in list_entity:
            if _entity.entity_type in ["org","company"]:
                range_entity.append(_entity)
        # Cap the amount of work per document.
        range_entity = range_entity[:1000]
        # The company-linking logic below is flawed and disabled for now.
        # for first_i in range(len(range_entity)):
        #     _entity = range_entity[first_i]
        #     for second_i in range(first_i+1,len(range_entity)):
        #         _ent = range_entity[second_i]
        #         # 2021/5/21 update: skip when the two entity labels are mutually exclusive (one tenderee, one agency) and the texts differ
        #         if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
        #             continue
        #         _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
        #         if _entity.entity_text!=_ent.entity_text and _score>=on_value:
        #             _entity.linked_entitys.append(_ent)
        #             _ent.linked_entitys.append(_entity)
        #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
        # Pass 1: replace the company name.
        for _entity in range_entity:
            if re.search("公司",_entity.entity_text) is None:
                for _ent in _entity.linked_entitys:
                    if re.search("公司$",_ent.entity_text) is not None:
                        if len(_ent.entity_text)>len(_entity.entity_text):
                            _entity.entity_text = _ent.entity_text
        # Pass 2 (2021/12/21): replace with the longest similar entity that was recognised by the dictionary.
        for _entity in range_entity:
            used_linked_entitys = []
            if not _entity.linked_entitys:
                continue
            # Longest candidates first, so the longest acceptable name is tried first.
            _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
            for _ent in _entity.linked_entitys:
                # Stop as soon as a candidate that was already used comes up again.
                if _ent in used_linked_entitys:
                    break
                # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
                if _ent.if_dict_match == 1:
                    if len(_ent.entity_text) > len(_entity.entity_text):
                        # Check that both company names mention the same places.
                        match_list_1, match_list_2 = [], []
                        for place in place_list:
                            if place in _entity.entity_text:
                                match_list_1.append(place)
                            if place in _ent.entity_text:
                                match_list_2.append(place)
                        if str(match_list_1) == str(match_list_2):
                            # print("dictionary replacement", _entity.entity_text, "->", _ent.entity_text)
                            _entity.origin_entity_text = _entity.entity_text
                            _entity.entity_text = _ent.entity_text
                            used_linked_entitys.append(_ent)
                            # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
# Title normalisation used for de-duplication
def doctitle_refine(doctitle):
    """Strip generic tendering vocabulary from a document title.

    Every occurrence of the listed stop words (procurement stages, notice
    types and similar filler) is removed so that titles of related notices
    reduce to the same core text.

    Args:
        doctitle: The raw document title.

    Returns:
        The title with all listed words removed.
    """
    stop_word_pattern = (
        r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
        r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同'
    )
    return re.compile(stop_word_pattern).sub('', doctitle)
# First 100 company entities
def get_nlp_enterprise(list_entity):
    """Collect the distinct org/company names of a document in reading order.

    Entities are ordered by ``(sentence_index, begin_index)``; names found in
    the main text and names found in attachments are collected separately,
    each without duplicates.

    Args:
        list_entity: Entities exposing ``entity_type``, ``entity_text``,
            ``in_attachment``, ``sentence_index`` and ``begin_index``.

    Returns:
        A tuple ``(main_names, attachment_names)``, each list truncated to
        the first 100 names.
    """
    max_num = 100
    # Dicts keep insertion order, giving ordered de-duplication of the names.
    main_names = {}
    attachment_names = {}
    ordered = sorted(list_entity, key=lambda ent: (ent.sentence_index, ent.begin_index))
    for ent in ordered:
        if ent.entity_type not in ('org', 'company'):
            continue
        bucket = attachment_names if ent.in_attachment else main_names
        bucket.setdefault(ent.entity_text, None)
    return list(main_names)[:max_num], list(attachment_names)[:max_num]
def getEnterprisePath():
    """Locate the enterprise dictionary file to load.

    The large dictionary ``LEGAL_ENTERPRISE_HUGE.txt`` is preferred: first
    wherever ``getFileFromSysPath`` finds it, then in the current working
    directory. Otherwise the regular ``LEGAL_ENTERPRISE.txt`` is used, again
    falling back to the bare file name when ``getFileFromSysPath`` returns
    ``None``.

    Returns:
        The path (or bare file name) of the dictionary file.
    """
    filename_huge = "LEGAL_ENTERPRISE_HUGE.txt"
    huge_path = getFileFromSysPath(filename_huge)
    if huge_path is not None:
        return huge_path
    # The lookup failed, so ``huge_path`` is None here; probe the bare file
    # name instead of passing None to os.path.exists, which raises TypeError.
    if os.path.exists(filename_huge):
        return filename_huge

    filename = "LEGAL_ENTERPRISE.txt"
    real_path = getFileFromSysPath(filename)
    if real_path is None:
        real_path = filename
    return real_path
# Enterprise dictionary: maps the first four characters of a name to the set
# of remaining suffixes seen for that prefix.
DICT_ENTERPRISE = {}
# Flipped to True once getDict_enterprise has finished loading.
DICT_ENTERPRISE_DONE = False


def getDict_enterprise():
    """Load the enterprise dictionary file into ``DICT_ENTERPRISE``.

    Each line of the file returned by ``getEnterprisePath`` is stripped;
    names shorter than four characters are ignored. A name is stored as its
    four-character prefix (key) and the rest of the name (member of the set
    under that key). When loading completes, ``DICT_ENTERPRISE_DONE`` is set
    to True.

    Returns:
        The populated ``DICT_ENTERPRISE`` mapping.
    """
    global DICT_ENTERPRISE, DICT_ENTERPRISE_DONE
    dictionary_path = getEnterprisePath()
    with open(dictionary_path, "r", encoding="UTF8") as dictionary_file:
        for raw_line in dictionary_file:
            name = raw_line.strip()
            if len(name) < 4:
                continue
            DICT_ENTERPRISE.setdefault(name[:4], set()).add(name[4:])
    # sys.getsizeof is shallow: this reports the dict object itself, not the
    # sets and strings it references.
    log("dict_enterprise takes memory:%dM"%(sys.getsizeof(DICT_ENTERPRISE)/1024/1024))
    DICT_ENTERPRISE_DONE = True
    return DICT_ENTERPRISE
import threading
import time
# Start loading the enterprise dictionary on a separate thread as soon as this
# module is imported; match_enterprise_max_first polls DICT_ENTERPRISE_DONE
# until getDict_enterprise has finished.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()
# Longest enterprise name, in characters, that match_enterprise_max_first
# attempts to match.
MAX_ENTERPRISE_LEN = 30
def match_enterprise_max_first(sentence):
    """Find dictionary enterprise names in a sentence, longest match first.

    Blocks (polling once per second) until the enterprise dictionary has been
    loaded. The sentence is then scanned left to right: at each position the
    next four characters are looked up as a dictionary key and, on a hit, the
    longest following suffix (up to ``MAX_ENTERPRISE_LEN`` characters in
    total) that is registered for that key is taken. Scanning resumes right
    after a match.

    Args:
        sentence: The sentence text to scan.

    Returns:
        A list of dicts with keys ``entity_text``, ``begin_index`` and
        ``end_index`` (end exclusive), in order of appearance.
    """
    while not DICT_ENTERPRISE_DONE:
        time.sleep(1)

    found = []
    sentence_length = len(sentence)
    position = 0
    # A key is only tried while at least one character follows it.
    while position + 4 < sentence_length:
        prefix = sentence[position:position + 4]
        suffixes = DICT_ENTERPRISE.get(prefix)
        if suffixes is not None:
            # Shrink the candidate window from the maximum length down to the
            # bare four-character prefix.
            for window_end in range(position + MAX_ENTERPRISE_LEN, position + 3, -1):
                suffix = sentence[position + 4:window_end]
                if suffix in suffixes:
                    matched_length = len(prefix) + len(suffix)
                    found.append({
                        "entity_text": "%s%s"%(prefix,suffix),
                        "begin_index": position,
                        "end_index": position + matched_length,
                    })
                    # Together with the +1 below, this jumps to the first
                    # character after the match.
                    position += matched_length - 1
                    break
        position += 1
    return found
def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    """Calibrate recognised entities against the enterprise dictionary.

    For every article, each sentence is scanned with
    match_enterprise_max_first and every dictionary hit is reconciled with
    the org/company/location entities of the same sentence:

    * a hit lying inside an existing entity leaves it alone, unless several
      hits lie inside that entity, in which case the entity becomes the first
      hit and the other hits are added as new "company" entities;
    * a hit that overlaps an entity on either side rewrites the entity to the
      dictionary name (see the individual branches for the exact conditions);
    * a hit overlapping no entity is added as a new "company" entity.

    Side effects: entities are mutated in place, new Entity objects are
    appended to the article's entity list, and each article receives
    ``match_enterprise`` (de-duplicated list of {"type","from","to"} records)
    and ``match_enterprise_type`` (0 = no change, +1 when something was added,
    +2 when something was replaced).

    Args:
        list_articles: Article objects, one per document.
        list_sentences: Per-document lists of sentence objects.
        list_entitys: Per-document lists of entity objects.

    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting (in particular the 1000-entity cut-off and the
    elif chains) against the original file.
    """
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        # Candidate entities that dictionary hits are compared against.
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
                # Stop collecting once more than 1000 candidates were gathered.
                if len(range_entity)>1000:
                    break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            # (text, begin, end) of the org/company entities already present in this sentence.
            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
            list_match = match_enterprise_max_first(sentence)
            # print("list_match", list_match)
            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x:x["begin_index"])
            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                # Becomes True when the hit overlaps (or equals) an existing entity.
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue
                    # A location whose text is exactly a dictionary name is really a company.
                    if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1
                    if p_entity.entity_type not in ["location","org","company"]:
                        continue
                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1
                    # Overlap handling.
                    # A hit fully contained in the entity is normally left untouched.
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        find_flag = True
                        # Check whether the entity actually spans several companies:
                        # _match_j ends as the index of the last hit that still ends inside the entity.
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j>_match_index:
                            match_replace = True
                            match_add = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            # Shrink the existing entity to the first hit ...
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            # This company entity was recognised by the dictionary.
                            p_entity.if_dict_match = 1
                            # ... and add one new company entity per remaining hit inside it.
                            for _match_h in range(_match_index+1,_match_j+1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"
                                begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"],in_attachment=p_sentence.in_attachment)
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type":"add","from":"","to":entity_text})
                            # NOTE(review): rebinding the loop variable does not make the
                            # enclosing ``for ... in range(...)`` skip the hits handled above;
                            # they are visited again on later iterations.
                            _match_index = _match_j
                            break
                        continue
                    # The hit starts at or before the entity and reaches into it.
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True
                        # Hit starts before the entity and ends inside (or at the end of) it.
                        if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                            if p_entity.entity_type in ("org","company"):
                                # NOTE(review): in this branch end_index <= wordOffset_end, so this
                                # slice looks like it is always empty and the "分" check never
                                # fires — confirm the intended slice bounds.
                                _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
                                if re.search("分",_diff_text) is not None:
                                    pass
                                else:
                                    match_replace = True
                                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                    p_entity.entity_text = _match["entity_text"]
                                    p_entity.wordOffset_begin = _match["begin_index"]
                                    p_entity.wordOffset_end = _match["end_index"]
                                    p_entity.begin_index = begin_index
                                    p_entity.end_index = end_index
                                    p_entity.if_dict_match = 1
                        # Hit covers the entity up to (or beyond) its end.
                        elif _match["end_index"]>=p_entity.wordOffset_end:
                            # Do not rewrite when an identical entity already exists in the original entity list.
                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
                                match_replace = True
                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                p_entity.entity_text = _match["entity_text"]
                                p_entity.wordOffset_begin = _match["begin_index"]
                                p_entity.wordOffset_end = _match["end_index"]
                                p_entity.begin_index = begin_index
                                p_entity.end_index = end_index
                                # Unlike the other branches, this one also forces the type to company.
                                p_entity.entity_type = "company"
                                p_entity.if_dict_match = 1
                    # The hit starts inside the entity and runs past its end.
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.if_dict_match = 1
                # No existing entity overlaps the hit: add it as a new company entity.
                # (if_dict_match is not set on entities created here.)
                if not find_flag:
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"
                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type":"add","from":"","to":entity_text})
        # De-duplicate the calibration records by their concatenated from+to text.
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from","")
            _to = _calibrate.get("to","")
            _key = _from+_to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)
        # Bit flags: 1 = entities were added, 2 = entities were replaced.
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type
def isLegalEnterprise(name):
    """Tell whether a dictionary entry looks like a usable enterprise name.

    A name is rejected when any of the following holds:

    * it starts with one of the characters 省, 市, 区, 县;
    * after optional leading asterisks it consists of at most three
      characters followed by a generic suffix (branch, sub-district, centre,
      office, committee, "有限公司", ...);
    * it contains a field label or role word such as 标段, 联系人, 采购人,
      代理机构 or 盖章.

    Args:
        name: The candidate enterprise name.

    Returns:
        False for rejected names, True otherwise.
    """
    # Raw strings keep ``\*`` a regex escape instead of an invalid string
    # escape sequence.
    if re.search(r"^[省市区县]", name) is not None:
        return False
    if re.search(r"^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$", name) is not None:
        return False
    if re.search(r"标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|(主)", name) is not None:
        return False
    return True
def fix_LEGAL_ENTERPRISE():
    """Rewrite a cleaned copy of the enterprise dictionary to ``enter.txt``.

    Reads the dictionary returned by ``getEnterprisePath``, keeps the entries
    accepted by ``isLegalEnterprise``, then drops (and prints) a few generic
    names plus short branch names that mention none of the listed telecom
    operators or banks. The surviving names are written one per line, in set
    iteration order, to ``enter.txt`` in the current working directory.
    """
    kept_names = set()
    for dictionary_path in [getEnterprisePath()]:
        with open(dictionary_path,"r",encoding="utf8") as reader:
            for raw_line in reader:
                line = raw_line.strip()
                if isLegalEnterprise(line):
                    kept_names.add(line)
                too_generic = (
                    line=="有限责任公司"
                    or line=='设计研究院'
                    or line=='限责任公司'
                    or (re.search("^.{,4}(分公司|支行|分行)$",line) is not None
                        and re.search("电信|移动|联通|建行|工行|农行|中行|交行",line) is None)
                )
                if too_generic:
                    print(line)
                    kept_names.discard(line)
    with open("enter.txt","w",encoding="utf8") as fwrite:
        for name in list(kept_names):
            # NOTE(review): as written both replace calls map a parenthesis to
            # itself; possibly full-width parentheses were intended — confirm.
            fwrite.write(name.replace("(","(").replace(")",")"))
            fwrite.write("\n")
-
# Manual smoke test: prints the character-overlap score of two sample names.
# The commented-out lines are earlier ad-hoc checks of other functions.
if __name__=="__main__":
    # edit_distance("GUMBO","GAMBOL")
    # print(jaccard_score("周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目竞争性谈判公告","周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目-成交公告"))
    #
    # sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
    # print(match_enterprise_max_first(sentences))
    #
    # print("takes %d s"%(time.time()-_time))
    # fix_LEGAL_ENTERPRISE()
    print(jaccard_score("吉林省九台","吉林省建苑设计集团有限公司"))
    # print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))
|