# clusterBlock.py (3.7 KB)
'''
Created on 2019-10-15

@author: User
'''
  5. import glob
  6. from bs4 import BeautifulSoup
  7. from BiddingKG.dl.interface.Preprocessing import tableToText,segment
  8. import codecs
  9. import re
  10. import pandas as pd
  11. def getBlockLinkedKey():
  12. '''
  13. @summary: 获取所有公告中关键词和标段的关联度
  14. '''
  15. paths = ["C:\\Users\\User\\Desktop\\数据20191014\\*.html"]
  16. pattern_block = "([^承](包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))"
  17. pattern_key = "[的,.。,、\))]([\u4e00-\u9fa5\(\)]+?)[::是为]"
  18. dict_key = dict()
  19. count = 0
  20. for path in paths:
  21. for file in glob.glob(path):
  22. count += 1
  23. print(count,file.split("/")[-1])
  24. sourceContent = codecs.open(file,"r",encoding="utf8").read()
  25. article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
  26. list_word_type_begin_end = []
  27. for _iter in re.finditer(pattern_key,article_processed):
  28. _word = _iter.group(1)
  29. _type = "key"
  30. _begin = int(_iter.span()[0])
  31. _end = int(_iter.span()[1])
  32. list_word_type_begin_end.append([_word,_type,_begin,_end])
  33. for _iter in re.finditer(pattern_block,article_processed):
  34. _type = "block"
  35. _begin = int(_iter.span()[0])
  36. _end = int(_iter.span()[1])
  37. _word = article_processed[_begin:_end]
  38. list_word_type_begin_end.append([_word,_type,_begin,_end])
  39. list_word_type_begin_end.sort(key=lambda x:x[2])
  40. _find = False
  41. _score = 5
  42. for item in list_word_type_begin_end:
  43. if item[1]=="block":
  44. _find = True
  45. _score = 10
  46. continue
  47. if _find:
  48. if item[0] not in dict_key:
  49. dict_key[item[0]] = [0,0]
  50. dict_key[item[0]][0] += _score
  51. dict_key[item[0]][1] += 1
  52. _score -= 1
  53. list_word_score_times = []
  54. for _key in dict_key.keys():
  55. list_word_score_times.append([_key,dict_key[_key][0],dict_key[_key][1]])
  56. list_word_score_times.sort(key=lambda x:x[1]/x[2])
  57. list_word = []
  58. list_score = []
  59. list_times = []
  60. for item in list_word_score_times:
  61. if item[2]>10:
  62. list_word.append(item[0])
  63. list_score.append(item[1])
  64. list_times.append(item[2])
  65. data = {"list_word":list_word,"list_score":list_score,"list_times":list_times}
  66. df = pd.DataFrame(data)
  67. df.to_excel("cluster.xls",columns=["list_word","list_score","list_times"])
  68. def filter():
  69. df = pd.read_excel("cluster.xls")
  70. list_word = []
  71. list_score = []
  72. list_times = []
  73. for _word,_score,_times in zip(df["list_word"],df["list_score"],df["list_times"]):
  74. if _times>10:
  75. list_word.append(_word)
  76. list_score.append(_score)
  77. list_times.append(_times)
  78. data = {"list_word":list_word,"list_score":list_score,"list_times":list_times}
  79. df = pd.DataFrame(data)
  80. df.to_excel("cluster_filter.xls",columns=["list_word","list_score","list_times"])
  81. def getPackageScope(packageList,list_sentence):
  82. '''
  83. @summary: 获取标段的作用域,通过使用关键词来判断作用域的起止,读取关键词词典,使用正则匹配,在获取包号的时候,通过前面获取的关键词来限定作用域
  84. '''
  85. pass
  86. if __name__=="__main__":
  87. getBlockLinkedKey()
  88. #filter()