'''
Created on 2019-10-15

@author: User
'''
import codecs
import glob
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

from BiddingKG.dl.interface.Preprocessing import tableToText,segment

# Matches a bid-section ("block") mention such as a package / lot number.
# NOTE(review): some character classes contain the same character twice
# (e.g. "::", "((") - possibly full-width variants that were lost in a text
# normalisation step; confirm against the version-controlled original.
_PATTERN_BLOCK = re.compile(
    r"([^承](包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))"
)

# Matches a candidate keyword: a run of Chinese characters / parentheses that
# follows a delimiter and is terminated by a colon-like character or 是/为.
# Group 1 is the keyword itself.
_PATTERN_KEY = re.compile(r"[的,.。,、\))]([\u4e00-\u9fa5\(\)]+?)[::是为]")

# A keyword is exported only when it was counted more than this many times.
_MIN_TIMES = 10

# Column order used for every exported sheet.
_COLUMNS = ["list_word", "list_score", "list_times"]


def _collect_mentions(text):
    '''
    @summary: Return every keyword and block match found in text as
        [word, type, begin, end] lists, ordered by their begin offset.
    '''
    mentions = []
    for match in _PATTERN_KEY.finditer(text):
        begin, end = match.span()
        # For keywords the stored word is the captured group, while the
        # offsets are those of the whole match (delimiters included).
        mentions.append([match.group(1), "key", begin, end])
    for match in _PATTERN_BLOCK.finditer(text):
        begin, end = match.span()
        mentions.append([text[begin:end], "block", begin, end])
    # list.sort is stable: a keyword starting at the same offset as a block
    # stays in front of it because keywords were appended first.
    mentions.sort(key=lambda x: x[2])
    return mentions


def _accumulate_scores(mentions, dict_key):
    '''
    @summary: Add the scores of one document to dict_key, in place.
        dict_key maps keyword -> [total score, number of times counted].
    '''
    found_block = False
    # Never applied as-is: scoring only starts after the first block mention,
    # which resets the score to 10.
    score = 5
    for word, kind, _begin, _end in mentions:
        if kind == "block":
            found_block = True
            score = 10
            continue
        if found_block:
            entry = dict_key.setdefault(word, [0, 0])
            entry[0] += score
            entry[1] += 1
            # The further a keyword is from the last block mention, the less
            # it scores; the score is not clamped and may become negative.
            score -= 1


def _frequent_rows_to_frame(rows):
    '''
    @summary: Build the export DataFrame from (word, score, times) rows,
        keeping only the rows whose times value exceeds _MIN_TIMES.
    '''
    kept = [row for row in rows if row[2] > _MIN_TIMES]
    data = {
        "list_word": [row[0] for row in kept],
        "list_score": [row[1] for row in kept],
        "list_times": [row[2] for row in kept],
    }
    return pd.DataFrame(data)


def getBlockLinkedKey():
    '''
    @summary: Measure how strongly each keyword is associated with
        bid-section (block) mentions across all announcements.

    Every HTML file matched by the hard-coded glob pattern is converted to
    text, keyword and block mentions are located with regular expressions,
    and each keyword that appears after a block mention receives a score that
    decreases with its rank after that block. Keywords counted more than
    _MIN_TIMES times are written to "cluster.xls", ordered by ascending
    average score.
    '''
    paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"]
    dict_key = dict()
    count = 0
    for path in paths:
        for file in glob.glob(path):
            count += 1
            # os.path.basename copes with the backslash-separated paths that
            # glob returns for the Windows pattern above.
            print(count, os.path.basename(file))
            # Close the handle deterministically instead of leaking one
            # open file per announcement.
            with codecs.open(file, "r", encoding="utf8") as source_file:
                sourceContent = source_file.read()
            article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
            _accumulate_scores(_collect_mentions(article_processed), dict_key)

    list_word_score_times = [[_key, _value[0], _value[1]] for _key, _value in dict_key.items()]
    # Ascending average score per occurrence; times is always >= 1 here.
    list_word_score_times.sort(key=lambda x: x[1] / x[2])

    df = _frequent_rows_to_frame(list_word_score_times)
    # NOTE(review): writing the legacy .xls format depends on the installed
    # pandas / Excel writer - confirm it is still supported in the target env.
    df.to_excel("cluster.xls", columns=_COLUMNS)


def filter():
    '''
    @summary: Re-read "cluster.xls" and write the rows whose list_times
        value exceeds _MIN_TIMES to "cluster_filter.xls".

    The name shadows the builtin filter(); it is kept for compatibility with
    existing callers.
    '''
    df = pd.read_excel("cluster.xls")
    rows = zip(df["list_word"], df["list_score"], df["list_times"])
    _frequent_rows_to_frame(rows).to_excel("cluster_filter.xls", columns=_COLUMNS)


def getPackageScope(packageList,list_sentence):
    '''
    @summary: Get the scope of each bid section: use keywords to decide where
        a scope starts and ends, read the keyword dictionary, match with
        regular expressions, and when extracting package numbers use the
        previously obtained keywords to bound the scope.

    Not implemented yet: the arguments are ignored and None is returned.
    '''
    pass


if __name__=="__main__":
    getBlockLinkedKey()
    # filter()