12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
'''
Created on 2019-10-12
@author: User
'''
- import glob
- from bs4 import BeautifulSoup
- import re
- import shutil
- import codecs
- from BiddingKG.dl_dev.test.test4 import predict
- from BiddingKG.dl.common.Utils import save, load
def tofix():
    '''
    Collect the HTML documents that mention more than one package number.

    Every file matched by the hard-coded glob patterns is parsed with
    BeautifulSoup and its plain text is scanned for package / section /
    project number mentions. When more than one distinct number is found:

    * the file name and the matched snippets are printed and appended to
      ``<target_dir>.txt``;
    * the file is copied into ``target_dir``;
    * if the path contains ``_52_`` or ``_101_`` (at most ``MAX_COUNT`` files
      for each tag), the document is labelled with ``predict("12", content)``
      and queued for saving.

    The queued articles are finally written with ``save`` to
    ``../../dl_dev/money/traindata/article_label_1000_muti.pk``.

    Takes no arguments and returns nothing; all results are side effects.
    '''
    # Function-scope import: the module's import header does not provide os.
    import os

    # Earlier input sets:
    # paths = ["C:\\Users\\User\\Desktop\\数据20191011\\*.html","C:\\Users\\User\\Desktop\\数据20191011_multi\\*.html"]
    paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"]
    number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十]{1,4}")
    # Raw string so that "\(" is not an invalid string escape; the pattern
    # text itself is unchanged. Compiled once instead of once per file.
    # NOTE(review): the classes "[::]" and "[\((]" list the same character
    # twice - possibly a full-width variant was lost at some point; confirm.
    package_pattern = re.compile(r"(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
    target_dir = "C:\\Users\\User\\Desktop\\tofix1"
    list_articles = []
    # One quota per path tag; replaces the two parallel counters.
    counts = {"_52_": 0, "_101_": 0}
    MAX_COUNT = 500

    # Without the directory, shutil.copy would write every document to a
    # single file named like target_dir, each copy overwriting the last.
    os.makedirs(target_dir, exist_ok=True)

    with codecs.open(target_dir+".txt","w",encoding="utf8") as f:
        for path in paths:
            for file in glob.glob(path):
                # Stop scanning this pattern once both quotas are full
                # (checked before reading so no file is parsed needlessly).
                if all(count >= MAX_COUNT for count in counts.values()):
                    break

                # Split on both separators: glob returns backslash paths on
                # Windows, so splitting on "/" alone kept the whole path.
                filename = re.split(r"[\\/]", file)[-1]
                with open(file,"r",encoding="utf8") as html_file:
                    _content = html_file.read()
                _text = BeautifulSoup(_content,"lxml").get_text()

                snippets = []
                _set_pack = set()
                for match in package_pattern.finditer(_text):
                    start, end = match.span()
                    # Clamp at 0: a negative start would index from the end
                    # of the text and yield a wrong (usually empty) snippet.
                    snippets.append("/"+_text[max(start-3, 0):end+3])
                    number = number_pattern.search(match.group(0))
                    if number is not None:
                        _set_pack.add(number.group(0))

                # Only documents with several distinct numbers are of interest.
                if len(_set_pack) <= 1:
                    continue

                _find = "".join(snippets).replace("\n","")
                print(filename,_find)
                f.write(filename+" "+_find+"\n")
                shutil.copy(file,target_dir)

                for tag in ("_52_", "_101_"):
                    if tag in file and counts[tag] < MAX_COUNT:
                        counts[tag] += 1
                        _label = predict("12",_content)
                        # "label" starts out equal to the prediction.
                        list_articles.append({"content":_content,"label":_label,"predict":_label,"filename":filename})

    save(list_articles, "../../dl_dev/money/traindata/article_label_1000_muti.pk")
-
def getFile(filename, src_dir="C:\\Users\\User\\Desktop\\数据20191014\\", dest_dir="C:\\Users\\User\\Desktop\\getfile"):
    '''
    Copy a single file from ``src_dir`` into ``dest_dir``.

    @param filename: name of the file inside ``src_dir``
    @param src_dir: directory that holds the file; defaults to the dataset
        directory this script was written against
    @param dest_dir: directory that receives the copy; created when missing
    @raise FileNotFoundError: raised by ``shutil.copy`` if the source file
        does not exist
    '''
    # Function-scope import: the module's import header does not provide os.
    import os

    # Without the directory, shutil.copy would create a plain file named like
    # dest_dir instead of placing the copy inside it.
    os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(os.path.join(src_dir, filename), dest_dir)
-
if __name__=="__main__":
    # Other entry points, run by hand when needed:
    #tofix()
    #getFile("比地_101_79888947.html")

    # Overwrite the "label" of the first 84 saved articles with the label
    # stored at the same index in label_0_1000.pk, then save the list back.
    articles_path = "../../dl_dev/money/traindata/article_label_1000_muti.pk"
    articles = load(articles_path)
    relabeled = load("label_0_1000.pk")
    for idx in range(84):
        article = articles[idx]
        print(article["filename"])
        print(article["content"][:10])
        # Both content prefixes are printed so the pairing can be checked by eye.
        reference = relabeled[idx]
        print(reference["content"][:10])
        article["label"] = reference["label"]
    save(articles, articles_path)
|