#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
outline_extractor.py

@author: bidikeji
@time: 2024/7/19 10:05
"""
  7. import re
  8. from BiddingKG.dl.interface.htmlparser import ParseDocument,get_childs
  9. class Sentence2():
  10. def __init__(self,text,sentence_index,wordOffset_begin,wordOffset_end):
  11. self.name = 'sentence2'
  12. self.text = text
  13. self.sentence_index = sentence_index
  14. self.wordOffset_begin = wordOffset_begin
  15. self.wordOffset_end = wordOffset_end
  16. def get_text(self):
  17. return self.text
  18. def extract_sentence_list(sentence_list):
  19. new_sentence2_list = []
  20. new_sentence2_list_attach = []
  21. for sentence in sentence_list:
  22. sentence_index = sentence.sentence_index
  23. sentence_text = sentence.sentence_text
  24. begin_index = 0
  25. end_index = 0
  26. for it in re.finditer('([\w:][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
  27. temp = it.group(0)
  28. sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
  29. for item in re.finditer('[,。;;!!??]+', sentence_text):
  30. end_index = item.end()
  31. if end_index!=len(sentence_text):
  32. if end_index-begin_index<6 and item.group()[-1] in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', item.group())==None:
  33. continue
  34. new_sentence_text = sentence_text[begin_index:end_index]
  35. sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
  36. if sentence.in_attachment:
  37. new_sentence2_list_attach.append(sentence2)
  38. else:
  39. new_sentence2_list.append(sentence2)
  40. begin_index = end_index
  41. if end_index!=len(sentence_text):
  42. end_index = len(sentence_text)
  43. new_sentence_text = sentence_text[begin_index:end_index]
  44. sentence2 = Sentence2(new_sentence_text, sentence_index, begin_index, end_index)
  45. if sentence.in_attachment:
  46. new_sentence2_list_attach.append(sentence2)
  47. else:
  48. new_sentence2_list.append(sentence2)
  49. return new_sentence2_list, new_sentence2_list_attach
  50. requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围)([及与和](其它|\w{,2})要求)?" \
  51. "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::]|$)"
  52. aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
  53. # out_lines = []
  54. def extract_parameters(parse_document):
  55. list_data = parse_document.tree
  56. requirement_text = ''
  57. aptitude_text = ''
  58. _find_count = 0
  59. _data_i = -1
  60. while _data_i<len(list_data)-1:
  61. _data_i += 1
  62. _data = list_data[_data_i]
  63. _type = _data["type"]
  64. _text = _data["text"].strip()
  65. # print(_data.keys())
  66. if _type=="sentence":
  67. if _data["sentence_title"] is not None:
  68. if re.search(requirement_pattern,_text) is not None:
  69. childs = get_childs([_data])
  70. for c in childs:
  71. requirement_text += c["text"]+"\n"
  72. _data_i += len(childs)
  73. _data_i -= 1
  74. _data_i = -1
  75. while _data_i<len(list_data)-1:
  76. _data_i += 1
  77. _data = list_data[_data_i]
  78. _type = _data["type"]
  79. _text = _data["text"].strip()
  80. # print(_data.keys())
  81. if _type=="sentence":
  82. # print("aptitude_pattern", _text)
  83. if _data["sentence_title"] is not None:
  84. # print("aptitude_pattern",_text)
  85. # outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
  86. # re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
  87. if re.search(aptitude_pattern,_text[:30]) is not None:
  88. childs = get_childs([_data])
  89. for c in childs:
  90. aptitude_text += c["text"]
  91. # if c["sentence_title"]:
  92. # aptitude_text += c["text"]+"\n"
  93. # else:
  94. # aptitude_text += c["text"]
  95. _data_i += len(childs)
  96. _data_i -= 1
  97. # elif re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', _text) and len(_text)<30 and re.search('资质|资格', _text):
  98. # out_lines.append(outline)
  99. if _type=="table":
  100. list_table = _data["list_table"]
  101. parent_title = _data["parent_title"]
  102. if list_table is not None:
  103. for line in list_table[:2]:
  104. for cell_i in range(len(line)):
  105. cell = line[cell_i]
  106. cell_text = cell[0]
  107. if len(cell_text)>120 and re.search(aptitude_pattern,cell_text) is not None:
  108. aptitude_text += cell_text+"\n"
  109. return requirement_text,aptitude_text
  110. if __name__ == "__main__":
  111. # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
  112. # html = f.read()
  113. #
  114. # l = []
  115. import pandas as pd
  116. # from collections import Counter
  117. # from BiddingKG.dl.interface import Preprocessing
  118. # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
  119. # n = 0
  120. # datas = []
  121. # for id,title, html in zip(df['docid'],df['doctitle'], df['dochtmlcon']):
  122. # # if id not in [289647738, 289647739]:
  123. # # continue
  124. # # print(id, type(id))
  125. # # parse_document = ParseDocument(html, True)
  126. # # requirement_text, aptitude_text = extract_parameters(parse_document)
  127. # if re.search('资\s*[格质]', html)==None:
  128. # continue
  129. #
  130. # list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[id,html,"","",title,'', '']],useselffool=True)
  131. # sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
  132. #
  133. # # sentence2_list = []
  134. #
  135. # parse_document = ParseDocument(html, True, list_obj=sentence2_list)
  136. # requirement_text, aptitude_text = extract_parameters(parse_document)
  137. # if len(aptitude_text)>0:
  138. # datas.append((id, aptitude_text[:1500]))
  139. # print(id, aptitude_text[:10], aptitude_text[-20:])
  140. # else:
  141. # parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
  142. # requirement_text, aptitude_text = extract_parameters(parse_document)
  143. #
  144. # # if 0<len(aptitude_text)<20:
  145. # # l.append(len(aptitude_text))
  146. # # n += 1
  147. # # print(id, aptitude_text)
  148. # # if n > 5:
  149. # # break
  150. #
  151. # c = Counter(out_lines)
  152. # print(c.most_common(1000))
  153. #
  154. # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
  155. # df.to_excel('E:/公告资质要求提取结果.xlsx')
  156. # if len(aptitude_text)> 1000:
  157. # print(id, aptitude_text[:10], aptitude_text[-20:])
  158. # print(Counter(l).most_common(50))
  159. # print(len(df), len(l), min(l), max(l), sum(l)/len(l))
  160. # n1 = len([it for it in l if it < 500])
  161. # n2 = len([it for it in l if it < 1000])
  162. # n3 = len([it for it in l if it < 1500])
  163. # n4 = len([it for it in l if it < 2000])
  164. # print(n1, n2, n3, n4, n1/len(l), n2/len(l), n3/len(l), n4/len(l))
  165. # parse_document = ParseDocument(html,True)
  166. # requirement_text, new_list_policy, aptitude_text = extract_parameters(parse_document)
  167. # print(aptitude_text)
  168. # sentence_text = '5、要求:3.1投标其他条件:1、中国宝武集团项目未列入禁入名单的投标人。2、具有有效的营业执照;'
  169. # begin_index = 0
  170. # for item in re.finditer('[,。;;!!??]+', sentence_text):
  171. # end_index = item.end()
  172. # if end_index != len(sentence_text):
  173. # if end_index - begin_index < 6:
  174. # continue
  175. # new_sentence_text = sentence_text[begin_index:end_index]
  176. # print(new_sentence_text)
  177. df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
  178. pos = neg = 0
  179. for docid, text in zip(df['docid'], df['资质要求']):
  180. if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
  181. pos += 1
  182. pass
  183. else:
  184. neg += 1
  185. print(docid, text[:50])
  186. print('异常:%d, 正常:%d'%(neg, pos))