浏览代码

空格替换修复

lsm 2 年之前
父节点
当前提交
75a7301df8
共有 1 个文件被更改,包括 2 次插入和 2 次删除
  1. 2 2
      BiddingKG/dl/interface/Preprocessing.py

+ 2 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1418,7 +1418,7 @@ def segment(soup,final=True):
 
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s{2,}","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s(?!\d{2}:\d{2})","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
         text = _text
     # 附件标识前修改为句号,避免正文和附件内容混合在一起
@@ -2737,7 +2737,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
                 #去掉标点符号
                 if entity_type!='time':
-                    entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
+                    entity_text = re.sub("[,,。:!&@$\*\s]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
                 # 组织机构实体名称补充
                 if entity_type in ["org", "company"]: