12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
'''
Created on 2019-10-15
@author: User
'''
- import glob
- from bs4 import BeautifulSoup
- from BiddingKG.dl.interface.Preprocessing import tableToText,segment
- import codecs
- import re
- import pandas as pd
def getBlockLinkedKey():
    '''
    @summary: Measure how strongly each candidate keyword is associated with
              package/lot ("block") mentions across all announcement files.

    For every file matched by the hard-coded glob, the HTML is parsed with
    BeautifulSoup, passed through tableToText and segment, and the resulting
    text is scanned with two regexes: one for candidate keywords and one for
    block mentions. Matches are walked in order of start offset. The first
    keyword after a block mention scores 10, the next 9, and so on (the score
    keeps decreasing and can become negative) until the next block mention
    resets it to 10. Keywords that appear before the first block mention of a
    file are ignored.

    Keywords counted more than 10 times are written to "cluster.xls" in the
    current working directory, ordered by ascending average score
    (total score / occurrences).

    @note: The nesting of this function was reconstructed from a copy of the
           file whose indentation had been lost - TODO confirm against the
           original.
    @return: None
    '''
    import os  # local import: only needed to derive the file name for logging

    paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"]
    # Backslashes are doubled so the literals hold exactly the same pattern
    # text as before without relying on invalid string escapes ("\(", "\)").
    # NOTE(review): some character classes list the same ASCII character twice
    # (e.g. "[::]"); possibly full-width variants were lost - confirm.
    pattern_block = re.compile("([^承](包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
    pattern_key = re.compile("[的,.。,、\\))]([\u4e00-\u9fa5\\(\\)]+?)[::是为]")

    # keyword -> [accumulated score, number of scored occurrences]
    dict_key = dict()
    count = 0
    for path in paths:
        for file_path in glob.glob(path):
            count += 1
            # basename handles the platform's separator; splitting on "/"
            # left the whole path intact for backslash-separated paths.
            print(count, os.path.basename(file_path))

            # Close the handle deterministically instead of leaking one per file.
            with codecs.open(file_path, "r", encoding="utf8") as handle:
                sourceContent = handle.read()
            article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))

            # Each entry is [text, kind, start, end]; keywords are listed
            # first so that the stable sort keeps them ahead of a block
            # match starting at the same offset, as before.
            matches = [
                [found.group(1), "key", found.start(), found.end()]
                for found in pattern_key.finditer(article_processed)
            ]
            matches.extend(
                [found.group(0), "block", found.start(), found.end()]
                for found in pattern_block.finditer(article_processed)
            )
            matches.sort(key=lambda entry: entry[2])

            # None means no block mention has been seen yet in this file.
            score = None
            for word, kind, _start, _end in matches:
                if kind == "block":
                    score = 10
                    continue
                if score is None:
                    continue
                stats = dict_key.setdefault(word, [0, 0])
                stats[0] += score
                stats[1] += 1
                score -= 1

    # Rank by average score; occurrences are always >= 1 for stored keys,
    # so the division cannot fail.
    ranked = sorted(
        ([word, total, times] for word, (total, times) in dict_key.items()),
        key=lambda row: row[1] / row[2],
    )
    # Keep only keywords that occurred often enough to be meaningful.
    frequent = [row for row in ranked if row[2] > 10]

    data = {
        "list_word": [row[0] for row in frequent],
        "list_score": [row[1] for row in frequent],
        "list_times": [row[2] for row in frequent],
    }
    df = pd.DataFrame(data)
    df.to_excel("cluster.xls", columns=["list_word", "list_score", "list_times"])
-
def filter():
    '''
    @summary: Read "cluster.xls", keep the rows whose "list_times" value is
              greater than 10, and write them to "cluster_filter.xls".

    Both files are resolved relative to the current working directory.
    The name shadows the built-in filter(); it is kept unchanged so existing
    callers keep working.

    @return: None
    '''
    source = pd.read_excel("cluster.xls")

    # Walk the three columns in lockstep and keep the frequent rows.
    kept_rows = [
        (word, score, times)
        for word, score, times in zip(
            source["list_word"], source["list_score"], source["list_times"]
        )
        if times > 10
    ]

    # Rebuild the frame from plain lists so it gets a fresh 0..n-1 index.
    result = pd.DataFrame(
        {
            "list_word": [row[0] for row in kept_rows],
            "list_score": [row[1] for row in kept_rows],
            "list_times": [row[2] for row in kept_rows],
        }
    )
    result.to_excel("cluster_filter.xls", columns=["list_word", "list_score", "list_times"])
-
def getPackageScope(packageList,list_sentence):
    '''
    @summary: Determine the scope of each package/lot. The intended approach
              is to load the keyword dictionary, match the keywords with
              regexes, and use them to mark where a package's scope starts
              and ends when package numbers are extracted.

    Not implemented yet: both arguments are ignored.

    @return: None
    '''
    return None
-
-
-
if __name__ == "__main__":
    # Build the keyword statistics workbook. The filter() step is left
    # disabled, as in the original script.
    getBlockLinkedKey()
    # filter()
|