3 years ago · de79943104
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1066,13 +1066,33 @@ def segment(soup,final=True):
 
															     text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])","，",text)
														
 
															     #替换为中文分号
														
 
															     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])","；",text)
														
 
															-    #替换"？"为 "" ,update:2021/7/20
														
 
															-    text = re.sub("？+","",text)
														
 
															+    #替换"？"为 " " ,update:2021/7/20
														
 
															+    text = re.sub("？+"," ",text)
														
 
															     #替换"""为"“",否则导入deepdive出错
														
 
															     text = text.replace('"',"“").replace("\r","").replace("\n","，")
														
 
															-    text = re.sub("\s{4,}","，",text)   
														
 
															+    # print('==1',text)
														
 
															+    # text = re.sub("\s{4,}","，",text)
														
 
															+    # 解决公告中的" "空格替换问题
														
 
															+    if re.search("\s{4,}",text):
														
 
															+        _text = ""
														
 
															+        for _sent in re.split("。+",text):
														
 
															+            for _sent2 in re.split('，+',_sent):
														
 
															+                for _sent3 in re.split("：+",_sent2):
														
 
															+                    for _t in re.split("\s{4,}",_sent3):
														
 
															+                        if len(_t)<3:
														
 
															+                            _text += _t
														
 
															+                        else:
														
 
															+                            _text += "，"+_t
														
 
															+                    _text += "："
														
 
															+                _text = _text[:-1]
														
 
															+                _text += "，"
														
 
															+            _text = _text[:-1]
														
 
															+            _text += "。"
														
 
															+        _text = _text[:-1]
														
 
															+        text = _text
														
 
															+    # print('==2',text)
														
 
															     #替换标点
														
 
															     #替换连续的标点
														
@@ -1451,7 +1471,17 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
															         key_preprocess = "tableToText"
														
 
															         start_time = time.time()
														
 
															         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
														
 
															-
														
 
															+        # Repair date/time expressions that were broken apart by full-width commas ("，").
														
 
															+        # Alternatives run from most to least specific: date + H:M:S, date + H:M with an
														
 
															+        # optional trailing "分", date + hour, date only, and finally a bare H:M:S.
														
 
															+        # Raw strings keep "\d" a regex escape rather than an invalid string escape.
														
 
															+        repair_time = re.compile(r"20，?\d，?\d，?[-—－―/年]，?[0-1]?\d，?[-—－―/月]，?[0-3]?\d，?日?，?(?:上午|下午)?，?[0-2]?\d，?:，?[0-6]\d，?:，?[0-6]\d|"
														
 
															+                                 r"20，?\d，?\d，?[-—－―/年]，?[0-1]?\d，?[-—－―/月]，?[0-3]?\d，?日?，?(?:上午|下午)?，?[0-2]?\d，?[:时点]，?[0-6]\d，?分?|"
														
 
															+                                 r"20，?\d，?\d，?[-—－―/年]，?[0-1]?\d，?[-—－―/月]，?[0-3]?\d，?日?，?(?:上午|下午)?，?[0-2]?\d，?[时点]|"
														
 
															+                                 r"20，?\d，?\d，?[-—－―/年]，?[0-1]?\d，?[-—－―/月]，?[0-3]?\d，?日?|"
														
 
															+                                 r"[0-2]?\d，?:，?[0-6]\d，?:，?[0-6]\d"
														
 
															+                                 )
														
 
															+        # De-duplicate the matches so each distinct broken spelling is rewritten once.
														
 
															+        for _time in set(repair_time.findall(article_processed)):
														
 
															+            if "，" in _time:
														
 
															+                # str.replace fixes every occurrence of this exact spelling in the article.
														
 
															+                article_processed = article_processed.replace(_time,_time.replace("，",""))
														
 
															+        # print('re_rtime',re.findall(repair_time,article_processed))
														
 
															         # log(article_processed)
														
 
															         if key_preprocess not in cost_time:
														
--- a/BiddingKG/dl/test/test4.py
+++ b/BiddingKG/dl/test/test4.py
@@ -37,7 +37,7 @@ tf.nn.ctc_loss
 
															 if __name__=="__main__":
														
 
															     # filename = "比地_52_79929693.html"
														
 
															     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
														
 
															-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
														
 
															+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
														
 
															     content = str(BeautifulSoup(text).find("div",id="pcontent"))
														
 
															     # df_a = {"html":[]}
														
 
															     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
														
--- a/BiddingKG/dl/test/测试整个要素提取流程.py
+++ b/BiddingKG/dl/test/测试整个要素提取流程.py
@@ -427,8 +427,8 @@ if __name__=="__main__":
 
															     a = time.time()
														
 
															     print("start")
														
 
															     # print(predict("12",content))
														
 
															-    result = predict("12",text)
														
 
															-    # result = predict("12",content)
														
 
															+    # result = predict("12",text)
														
 
															+    result = predict("12",content)
														
 
															     # print(json.loads(result))
														
 
															     #test("12",text)
														
 
															     print("takes",time.time()-a)