Explorar el Código

公告文本中逗号修正,时间修正

znj hace 3 años
padre
commit
14d87a0adb
Se ha modificado 1 fichero con 4 adiciones y 4 borrados
  1. 4 4
      BiddingKG/dl/interface/Preprocessing.py

+ 4 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1472,10 +1472,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         start_time = time.time()
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
         # 修正被","逗号分隔的时间
-        repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
-                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
-                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
-                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
+        repair_time = re.compile("[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[号]?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
+                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[号]?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d分?|"
+                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[号]?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
+                                 "[12]\d,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?[日号]|"
                                  "[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
                                  )
         for _time in set(re.findall(repair_time,article_processed)):