|
@@ -62,6 +62,15 @@ def link_entitys(list_entitys,on_value=0.8):
|
|
|
if len(_ent.entity_text)>len(_entity.entity_text):
|
|
|
_entity.entity_text = _ent.entity_text
|
|
|
|
|
|
+ # 2021/12/21: replace the entity text with the longest linked (similar) entity that was matched via the dictionary
|
|
|
+ for _entity in range_entity:
|
|
|
+ for _ent in _entity.linked_entitys:
|
|
|
+ print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
|
|
|
+ if re.search("公司$", _ent.entity_text) is not None \
|
|
|
+ and _ent.if_dict_match == 1:
|
|
|
+ if len(_ent.entity_text) > len(_entity.entity_text):
|
|
|
+ _entity.entity_text = _ent.entity_text
|
|
|
+
|
|
|
|
|
|
def getEnterprisePath():
|
|
|
filename = "LEGAL_ENTERPRISE.txt"
|
|
@@ -146,6 +155,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
for p_sentence in list_sentence:
|
|
|
sentence = p_sentence.sentence_text
|
|
|
list_match = match_enterprise_max_first(sentence)
|
|
|
+ print("list_match", list_match)
|
|
|
|
|
|
doc_id = p_sentence.doc_id
|
|
|
sentence_index = p_sentence.sentence_index
|
|
@@ -164,10 +174,14 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
|
|
|
find_flag = True
|
|
|
p_entity.entity_type = "company"
|
|
|
+ p_entity.if_dict_match = 1
|
|
|
|
|
|
if p_entity.entity_type not in ["location","org","company"]:
|
|
|
continue
|
|
|
|
|
|
+ if _match["entity_text"] == p_entity.entity_text:
|
|
|
+ p_entity.if_dict_match = 1
|
|
|
+
|
|
|
#有重叠
|
|
|
#match部分被包含则不处理
|
|
|
if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
|
|
@@ -189,6 +203,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
p_entity.wordOffset_end = _match["end_index"]
|
|
|
p_entity.begin_index = begin_index
|
|
|
p_entity.end_index = end_index
|
|
|
+ # Mark this company entity as recognized via the dictionary
|
|
|
+ p_entity.if_dict_match = 1
|
|
|
|
|
|
for _match_h in range(_match_index+1,_match_j+1):
|
|
|
entity_text = list_match[_match_h]["entity_text"]
|
|
@@ -198,6 +214,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
|
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
|
add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
|
|
|
+ add_entity.if_dict_match = 1
|
|
|
list_entity.append(add_entity)
|
|
|
|
|
|
range_entity.append(add_entity)
|
|
@@ -225,6 +242,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
p_entity.wordOffset_end = _match["end_index"]
|
|
|
p_entity.begin_index = begin_index
|
|
|
p_entity.end_index = end_index
|
|
|
+ p_entity.if_dict_match = 1
|
|
|
elif _match["end_index"]>=p_entity.wordOffset_end:
|
|
|
match_replace = True
|
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
@@ -236,6 +254,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
p_entity.begin_index = begin_index
|
|
|
p_entity.end_index = end_index
|
|
|
p_entity.entity_type = "company"
|
|
|
+ p_entity.if_dict_match = 1
|
|
|
elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
|
|
|
find_flag = True
|
|
|
if p_entity.entity_type in ("org","company"):
|
|
@@ -248,12 +267,12 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
|
p_entity.wordOffset_end = _match["end_index"]
|
|
|
p_entity.begin_index = begin_index
|
|
|
p_entity.end_index = end_index
|
|
|
+ p_entity.if_dict_match = 1
|
|
|
if not find_flag:
|
|
|
match_add = True
|
|
|
entity_text = _match["entity_text"]
|
|
|
entity_type = "company"
|
|
|
|
|
|
-
|
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
|
end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
|
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|