|
@@ -204,6 +204,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
break
|
|
break
|
|
for p_sentence in list_sentence:
|
|
for p_sentence in list_sentence:
|
|
sentence = p_sentence.sentence_text
|
|
sentence = p_sentence.sentence_text
|
|
|
|
+ sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
|
|
list_match = match_enterprise_max_first(sentence)
|
|
list_match = match_enterprise_max_first(sentence)
|
|
# print("list_match", list_match)
|
|
# print("list_match", list_match)
|
|
|
|
|
|
@@ -246,7 +247,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
match_replace = True
|
|
match_replace = True
|
|
match_add = True
|
|
match_add = True
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
- end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
|
|
|
|
|
|
+ end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
|
|
list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
p_entity.entity_text = _match["entity_text"]
|
|
p_entity.entity_text = _match["entity_text"]
|
|
p_entity.wordOffset_begin = _match["begin_index"]
|
|
p_entity.wordOffset_begin = _match["begin_index"]
|
|
@@ -261,7 +262,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
entity_type = "company"
|
|
entity_type = "company"
|
|
|
|
|
|
begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
|
|
begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
|
|
- end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
|
|
|
|
|
|
+ end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
|
|
add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
|
|
add_entity.if_dict_match = 1
|
|
add_entity.if_dict_match = 1
|
|
@@ -285,7 +286,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
else:
|
|
else:
|
|
match_replace = True
|
|
match_replace = True
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
- end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
|
|
|
|
|
|
+ end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
|
|
list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
p_entity.entity_text = _match["entity_text"]
|
|
p_entity.entity_text = _match["entity_text"]
|
|
p_entity.wordOffset_begin = _match["begin_index"]
|
|
p_entity.wordOffset_begin = _match["begin_index"]
|
|
@@ -294,23 +295,25 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
p_entity.end_index = end_index
|
|
p_entity.end_index = end_index
|
|
p_entity.if_dict_match = 1
|
|
p_entity.if_dict_match = 1
|
|
elif _match["end_index"]>=p_entity.wordOffset_end:
|
|
elif _match["end_index"]>=p_entity.wordOffset_end:
|
|
- match_replace = True
|
|
|
|
- begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
|
|
- end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
|
|
|
|
- list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
|
|
- p_entity.entity_text = _match["entity_text"]
|
|
|
|
- p_entity.wordOffset_begin = _match["begin_index"]
|
|
|
|
- p_entity.wordOffset_end = _match["end_index"]
|
|
|
|
- p_entity.begin_index = begin_index
|
|
|
|
- p_entity.end_index = end_index
|
|
|
|
- p_entity.entity_type = "company"
|
|
|
|
- p_entity.if_dict_match = 1
|
|
|
|
|
|
+ # 原entity列表已有实体,则不重复添加
|
|
|
|
+ if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
|
|
|
|
+ match_replace = True
|
|
|
|
+ begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
|
|
+ end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
|
|
|
|
+ list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
|
|
+ p_entity.entity_text = _match["entity_text"]
|
|
|
|
+ p_entity.wordOffset_begin = _match["begin_index"]
|
|
|
|
+ p_entity.wordOffset_end = _match["end_index"]
|
|
|
|
+ p_entity.begin_index = begin_index
|
|
|
|
+ p_entity.end_index = end_index
|
|
|
|
+ p_entity.entity_type = "company"
|
|
|
|
+ p_entity.if_dict_match = 1
|
|
elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
|
|
elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
|
|
find_flag = True
|
|
find_flag = True
|
|
if p_entity.entity_type in ("org","company"):
|
|
if p_entity.entity_type in ("org","company"):
|
|
match_replace = True
|
|
match_replace = True
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
- end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
|
|
|
|
|
|
+ end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
|
|
list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
|
|
p_entity.entity_text = _match["entity_text"]
|
|
p_entity.entity_text = _match["entity_text"]
|
|
p_entity.wordOffset_begin = _match["begin_index"]
|
|
p_entity.wordOffset_begin = _match["begin_index"]
|
|
@@ -324,7 +327,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
entity_type = "company"
|
|
entity_type = "company"
|
|
|
|
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
|
|
- end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
|
|
|
|
|
|
+ end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
|
|
add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
|
|
list_entity.append(add_entity)
|
|
list_entity.append(add_entity)
|