|
@@ -1418,7 +1418,7 @@ def segment(soup,final=True):
|
|
|
|
|
|
if len(text)<10000000:
|
|
|
while(LOOP_BEGIN<len(text)):
|
|
|
- _text += re.sub(")",")",re.sub("(","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
|
|
|
+ _text += re.sub(")",")",re.sub("(","(",re.sub("\s(?!\d{2}:\d{2})","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
|
|
|
LOOP_BEGIN += LOOP_LEN
|
|
|
text = _text
|
|
|
# 附件标识前修改为句号,避免正文和附件内容混合在一起
|
|
@@ -2737,7 +2737,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
|
|
|
#去掉标点符号
|
|
|
if entity_type!='time':
|
|
|
- entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
|
|
|
+ entity_text = re.sub("[,,。:!&@$\*\s]","",entity_text)
|
|
|
entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
|
|
|
# 组织机构实体名称补充
|
|
|
if entity_type in ["org", "company"]:
|