''' Created on 2019年10月12日 @author: User ''' import glob from bs4 import BeautifulSoup import re import shutil import codecs from BiddingKG.dl_dev.test.test4 import predict from BiddingKG.dl.common.Utils import save, load def tofix(): ''' @summary: 获取需要fix的数据 ''' #paths = ["C:\\Users\\User\\Desktop\\数据20191011\\*.html","C:\\Users\\User\\Desktop\\数据20191011_multi\\*.html"] paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"] number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十]{1,4}") target_dir = "C:\\Users\\User\\Desktop\\tofix1" list_articles = [] _count1 = 0 _count2 = 0 MAX_COUNT = 500 with codecs.open(target_dir+".txt","w",encoding="utf8") as f: for path in paths: files = glob.glob(path) for file in files: filename = file.split("/")[-1] _content = open(file,"r",encoding="utf8").read() _soup = BeautifulSoup(_content,"lxml") _text = _soup.get_text() _find = "" _set_pack = set() if _count1>=MAX_COUNT and _count2>=MAX_COUNT: break for _iter in re.finditer("(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))",_text): _find += "/"+_text[_iter.span()[0]-3:_iter.span()[1]+3] temp_package_number = re.findall(number_pattern,_text[_iter.span()[0]:_iter.span()[1]])[0] if temp_package_number!="": _set_pack.add(temp_package_number) if len(_set_pack)>1: print(file.split("\\")[-1],_find.replace("\n","")) f.write(file.split("\\")[-1]+" "+_find.replace("\n","")+"\n") shutil.copy(file,target_dir) if re.search("_52_",file) is not None: if _count1