Browse Source

公告文本中逗号修正,时间修正

znj 3 years ago
parent
commit
de79943104

+ 34 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -1066,13 +1066,33 @@ def segment(soup,final=True):
     text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
     text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
     #替换为中文分号
     #替换为中文分号
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
-    #替换"?"为 "" ,update:2021/7/20
-    text = re.sub("?+","",text)
+    #替换"?"为 " " ,update:2021/7/20
+    text = re.sub("?+"," ",text)
          
          
 
 
     #替换"""为"“",否则导入deepdive出错
     #替换"""为"“",否则导入deepdive出错
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
-    text = re.sub("\s{4,}",",",text)   
+    # print('==1',text)
+    # text = re.sub("\s{4,}",",",text)
+    # 解决公告中的" "空格替换问题
+    if re.search("\s{4,}",text):
+        _text = ""
+        for _sent in re.split("。+",text):
+            for _sent2 in re.split(',+',_sent):
+                for _sent3 in re.split(":+",_sent2):
+                    for _t in re.split("\s{4,}",_sent3):
+                        if len(_t)<3:
+                            _text += _t
+                        else:
+                            _text += ","+_t
+                    _text += ":"
+                _text = _text[:-1]
+                _text += ","
+            _text = _text[:-1]
+            _text += "。"
+        _text = _text[:-1]
+        text = _text
+    # print('==2',text)
     #替换标点
     #替换标点
 
 
     #替换连续的标点
     #替换连续的标点
@@ -1451,7 +1471,17 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         key_preprocess = "tableToText"
         key_preprocess = "tableToText"
         start_time = time.time()
         start_time = time.time()
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
         article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
-
+        # 修正被","逗号分隔的时间
+        repair_time = re.compile("20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d|"
+                                 "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[:时点],?[0-6]\d,?分?|"
+                                 "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?,?(?:上午|下午)?,?[0-2]?\d,?[时点]|"
+                                 "20,?\d,?\d,?[-—-―/年],?[0-1]?\d,?[-—-―/月],?[0-3]?\d,?日?|"
+                                 "[0-2]?\d,?:,?[0-6]\d,?:,?[0-6]\d"
+                                 )
+        for _time in set(re.findall(repair_time,article_processed)):
+            if re.search(",",_time):
+                article_processed = article_processed.replace(_time,re.sub(",","",_time))
+        # print('re_rtime',re.findall(repair_time,article_processed))
         # log(article_processed)
         # log(article_processed)
 
 
         if key_preprocess not in cost_time:
         if key_preprocess not in cost_time:

+ 1 - 1
BiddingKG/dl/test/test4.py

@@ -37,7 +37,7 @@ tf.nn.ctc_loss
 if __name__=="__main__":
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))

+ 2 - 2
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -427,8 +427,8 @@ if __name__=="__main__":
     a = time.time()
     a = time.time()
     print("start")
     print("start")
     # print(predict("12",content))
     # print(predict("12",content))
-    result = predict("12",text)
-    # result = predict("12",content)
+    # result = predict("12",text)
+    result = predict("12",content)
     # print(json.loads(result))
     # print(json.loads(result))
     #test("12",text)
     #test("12",text)
     print("takes",time.time()-a)
     print("takes",time.time()-a)