|
@@ -1687,8 +1687,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
if k == 'text':
|
|
|
entity = v
|
|
|
b = it.start() + len(keyword)
|
|
|
- e = it.end()-1
|
|
|
- if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
|
|
|
+ e = it.end() - 1
|
|
|
+ if (b, e, 'location', entity) in ner_entitys:
|
|
|
+ ner_entitys.remove((b, e, 'location', entity))
|
|
|
+ ner_entitys.append((b, e, 'company', entity))
|
|
|
+ elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
|
|
|
ner_entitys.append((b, e, 'company', entity))
|
|
|
|
|
|
for it in re.finditer(
|
|
@@ -1701,6 +1704,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
entity = v
|
|
|
b = it.start() + len(keyword)
|
|
|
e = it.end() - 1
|
|
|
+ if (b, e, 'location', entity) in ner_entitys:
|
|
|
+ ner_entitys.remove((b, e, 'location', entity))
|
|
|
+ ner_entitys.append((b, e, 'org', entity))
|
|
|
if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
|
|
|
ner_entitys.append((b, e, 'org', entity))
|
|
|
|
|
@@ -1733,7 +1739,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
#去掉标点符号
|
|
|
entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
|
|
|
entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
|
|
|
- list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]-1))
|
|
|
+ list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]))
|
|
|
# 标记文章末尾的"发布人”、“发布时间”实体
|
|
|
if sentence_index==len(list_sentence)-1:
|
|
|
if len(list_sentence_entitys[-2:])>2:
|
|
@@ -1952,8 +1958,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
else:
|
|
|
entity_text = str(getUnifyMoney(entity_text))
|
|
|
|
|
|
- # if float(entity_text)<100 or float(entity_text)>100000000000:
|
|
|
- if float(entity_text)<50 or float(entity_text)>100000000000:
|
|
|
+ if float(entity_text)<100 or float(entity_text)>100000000000:
|
|
|
# print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
|
|
|
continue
|
|
|
|