  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. @author: bidikeji
  5. @time: 2024/7/19 10:05
  6. """
  7. import re
  8. from BiddingKG.dl.interface.htmlparser import ParseDocument,get_childs
  9. class Sentence2():
  10. def __init__(self,text,sentence_index,wordOffset_begin,wordOffset_end):
  11. self.name = 'sentence2'
  12. self.text = text
  13. self.sentence_index = sentence_index
  14. self.wordOffset_begin = wordOffset_begin
  15. self.wordOffset_end = wordOffset_end
  16. def get_text(self):
  17. return self.text
def extract_sentence_list(sentence_list):
    """Split preprocessed sentences into clause-level Sentence2 fragments.

    Each sentence is first normalized so inline ordinal markers ("2、", "三、")
    start a new clause, then split on Chinese punctuation.  Fragments keep
    their begin/end offsets inside the (normalized) sentence text.

    :param sentence_list: preprocessed sentence objects exposing
        sentence_index, sentence_text and in_attachment
        # assumed interface — confirm against Preprocessing output
    :return: (main-body fragments, attachment fragments), both lists of Sentence2
    """
    new_sentence2_list = []
    new_sentence2_list_attach = []
    for sentence in sentence_list:
        sentence_index = sentence.sentence_index
        sentence_text = sentence.sentence_text
        begin_index = 0
        end_index = 0
        # Insert a comma before inline ordinals so each numbered item becomes
        # its own clause.  e.g. doc 289699210:
        # "1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、..."
        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d\.、,。a-zA-Z]\d{1,2}(\.\d{1,2}){,2})、', sentence_text):
            temp = it.group(0)
            sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
        # Split on Chinese punctuation.  20240725: the ASCII '?' was removed
        # from the class so URLs are not broken apart.
        for item in re.finditer('[,。;;!!?]+', sentence_text):
            end_index = item.end()
            # Skip fragments that are just a bare ordinal ("3、", "10."), which
            # happens when a table separates the number from its content
            # (e.g. doc 293178161).
            if end_index != len(sentence_text) and re.match('[一二三四五六七八九十\d.]{1,2}[、,.]+$', sentence_text[begin_index:end_index]):
                continue
            new_sentence_text = sentence_text[begin_index:end_index]
            sentence2 = Sentence2(new_sentence_text, sentence_index, begin_index, end_index)
            if sentence.in_attachment:
                new_sentence2_list_attach.append(sentence2)
            else:
                new_sentence2_list.append(sentence2)
            begin_index = end_index
        # Any trailing text after the last punctuation becomes a final fragment.
        if end_index != len(sentence_text):
            end_index = len(sentence_text)
            new_sentence_text = sentence_text[begin_index:end_index]
            sentence2 = Sentence2(new_sentence_text, sentence_index, begin_index, end_index)
            if sentence.in_attachment:
                new_sentence2_list_attach.append(sentence2)
            else:
                new_sentence2_list.append(sentence2)
    return new_sentence2_list, new_sentence2_list_attach
# --- Section-title regexes used by extract_parameters to locate announcement sections ---

# Procurement content / project scope section titles (采购需求/招标内容/...).
requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
                      "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
# Winning-bidder information section titles (中标/成交/候选人信息 ...).
winter_pattern = "((乙方|竞得|受让|买受|签约|供货|供应|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(?:(包))?|入围|入选|竞买|中标|中选|中价|中签|成交|候选)[\u4e00-\u9fa5]{0,5}" \
                 "(公示)?(信息|概况|情况|名称|联系人|联系方式|负责人)|中标公示单位)为?([::,、]|$)"
# Qualification / eligibility requirement section titles (资质/资格要求 ...).
aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求"
# Bid-opening address (optionally combined with a time) headings.
addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
# Bid-submission (document delivery) address headings.
addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
# "Procurement category name" (采购品目) heading.
pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
# Policy / regulation titles quoted in《...》book-title marks.
policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
# 《...》titles that are NOT policies (forms, contracts, notices, ...).
not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
# Correction / amendment announcement section titles (更正内容 ...).
correction_pattern = "(更正|更改|修正|修改|变更|延期)(信息|内容|事项|详情)"
  64. def extract_parameters(parse_document):
  65. '''
  66. 通过大纲、预处理后文本正则获取需要字段
  67. :param parse_document: ParseDocument() 方法返回结果
  68. :return:
  69. '''
  70. list_data = parse_document.tree
  71. requirement_text = '' # 采购内容
  72. aptitude_text = '' # 资质要求
  73. addr_bidopen_text = '' # 开标地址
  74. addr_bidsend_text = '' # 投标地址
  75. requirement_scope = [] # 采购内容始末位置
  76. winter_scope = [] # 中标信息始末位置
  77. pinmu_name = '' # 品目名称
  78. list_policy = [] # 政策法规
  79. correction_content = "" # 更正内容
  80. out_lines = []
  81. _find_count = 0
  82. _data_i = -1
  83. while _data_i<len(list_data)-1:
  84. _data_i += 1
  85. _data = list_data[_data_i]
  86. _type = _data["type"]
  87. _text = _data["text"].strip()
  88. _text = _text.replace('(', '(').replace(')', ')') # 20250729 统一为中文括号
  89. # print(_data.keys())
  90. if _type=="sentence":
  91. if _data["sentence_title"] is not None:
  92. if re.search('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*[..、]|^\d{1,2}[..、][\u4e00-\u9fa5]', _text[:10]):
  93. idx = _text.replace(':', ':').find(':')
  94. outline_text = _text[:idx] if idx >= 4 else _text
  95. # out_lines.append((outline_text, _data['sentence_index'], _data['wordOffset_begin']))
  96. childs = get_childs([_data])
  97. if len(childs) > 0:
  98. scope = ((childs[0]['sentence_index'], childs[0]['wordOffset_begin']),
  99. (childs[-1]['sentence_index'], childs[-1]['wordOffset_end']))
  100. else:
  101. scope = ((_data['sentence_index'], _data['wordOffset_begin']),
  102. (_data['sentence_index'], _data['wordOffset_end']))
  103. out_lines.append((outline_text, _data['sentence_title'], _data['title_index'], _data['next_index'], scope))
  104. if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
  105. b = (_data['sentence_index'], _data['wordOffset_begin'])
  106. childs = get_childs([_data])
  107. for c in childs:
  108. # requirement_text += c["text"]+"\n"
  109. requirement_text += c["text"]
  110. e = (c['sentence_index'], c["wordOffset_end"]) if len(childs)>0 else (_data['sentence_index'], _data['wordOffset_end'])
  111. requirement_scope.append(b)
  112. requirement_scope.append(e)
  113. _data_i += len(childs)
  114. _data_i -= 1
  115. _data_i = -1
  116. # 中标信息
  117. while _data_i<len(list_data)-1:
  118. _data_i += 1
  119. _data = list_data[_data_i]
  120. _type = _data["type"]
  121. _text = _data["text"].strip()
  122. # print(_data.keys())
  123. if _type=="sentence":
  124. # print('_text',_text)
  125. # print('sentence_title',_data["sentence_title"])
  126. if _data["sentence_title"] is not None:
  127. if re.search(winter_pattern,_text[:30]) is not None:
  128. b = (_data['sentence_index'], _data['wordOffset_begin'])
  129. childs = get_childs([_data])
  130. e = (childs[-1]['sentence_index'], childs[-1]["wordOffset_end"]) if len(childs)>0 else (_data['sentence_index'], _data['wordOffset_end'])
  131. winter_scope.append(b)
  132. winter_scope.append(e)
  133. _data_i += len(childs)
  134. _data_i -= 1
  135. _data_i = -1
  136. # 更正内容
  137. while _data_i < len(list_data) - 1:
  138. _data_i += 1
  139. _data = list_data[_data_i]
  140. _type = _data["type"]
  141. _text = _data["text"].strip()
  142. if _type == "sentence":
  143. if _data["sentence_title"] is not None:
  144. if re.search(correction_pattern, _text[:20]) is not None:
  145. childs = get_childs([_data])
  146. correction_text = ""
  147. for c in childs:
  148. correction_text += c["text"].strip()
  149. # print('correction_text',correction_text)
  150. correction_content += correction_text
  151. _data_i += len(childs)
  152. _data_i -= 1
  153. _data_i = -1
  154. while _data_i<len(list_data)-1:
  155. _data_i += 1
  156. _data = list_data[_data_i]
  157. _type = _data["type"]
  158. _text = _data["text"].strip()
  159. # print(_data.keys())
  160. if _type=="sentence":
  161. # print("aptitude_pattern", _text)
  162. if _data["sentence_title"] is not None:
  163. # print("aptitude_pattern",_text)
  164. # outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
  165. # re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
  166. if re.search(aptitude_pattern,_text[:15]) is not None:
  167. childs = get_childs([_data])
  168. for c in childs:
  169. aptitude_text += c["text"]
  170. # if c["sentence_title"]:
  171. # aptitude_text += c["text"]+"\n"
  172. # else:
  173. # aptitude_text += c["text"]
  174. _data_i += len(childs)
  175. _data_i -= 1
  176. # elif re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', _text) and len(_text)<30 and re.search('资质|资格', _text):
  177. # out_lines.append(outline)
  178. if _type=="table":
  179. list_table = _data["list_table"]
  180. parent_title = _data["parent_title"]
  181. if list_table is not None:
  182. for line in list_table[:2]:
  183. for cell_i in range(len(line)):
  184. cell = line[cell_i]
  185. cell_text = cell[0]
  186. if len(cell_text)>120 and re.search(aptitude_pattern,cell_text) is not None:
  187. aptitude_text += cell_text+"\n"
  188. _data_i = -1
  189. while _data_i < len(list_data) - 1:
  190. _data_i += 1
  191. _data = list_data[_data_i]
  192. _type = _data["type"]
  193. _text = _data["text"].strip()
  194. # print(_data.keys())
  195. if _type == "sentence":
  196. if _data["sentence_title"] is not None:
  197. if re.search(addr_bidopen_pattern, _text[:20]) is not None:
  198. childs = get_childs([_data], max_depth=1)
  199. for c in childs:
  200. addr_bidopen_text += c["text"]
  201. _data_i += len(childs)
  202. _data_i -= 1
  203. elif re.search(addr_bidsend_pattern, _text[:20]):
  204. childs = get_childs([_data], max_depth=1)
  205. for c in childs:
  206. addr_bidsend_text += c["text"]
  207. _data_i += len(childs)
  208. _data_i -= 1
  209. elif re.search(pinmu_name_pattern, _text):
  210. childs = get_childs([_data], max_depth=1)
  211. for c in childs:
  212. pinmu_name += c["text"]
  213. _data_i += len(childs)
  214. _data_i -= 1
  215. _data_i = -1
  216. while _data_i<len(list_data)-1:
  217. _data_i += 1
  218. _data = list_data[_data_i]
  219. _type = _data["type"]
  220. _text = _data["text"].strip()
  221. # print(_data.keys())
  222. if _type=="sentence":
  223. for it in re.finditer(policy_pattern, _text):
  224. if it not in list_policy:
  225. list_policy.append(it.group(0))
  226. ser = re.search('地[址点][:为](?P<addr>([\w()()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w()()【】-]{,60}))[,。]', addr_bidopen_text)
  227. addr_bidopen_text = ser.group('addr') if ser else ''
  228. ser = re.search('地[址点][:为](?P<addr>([\w()()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w()()【】-]{,60}))[,。]', addr_bidsend_text)
  229. addr_bidsend_text = ser.group('addr') if ser else ''
  230. if re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
  231. addr_bidopen_text = ""
  232. ser = re.search(pinmu_name_pattern, pinmu_name)
  233. if ser:
  234. pinmu_name = pinmu_name[ser.end():]
  235. if re.search('[^\w]$', pinmu_name):
  236. pinmu_name = pinmu_name[:-1]
  237. if len(out_lines) < 3: # 小于三个的大纲去掉
  238. out_lines = []
  239. else:
  240. text_, title_type, title_index, next_index, scope = out_lines[-1]
  241. if scope[0][0] < scope[1][0]:# 最后一个大纲范围取当句,避免错误
  242. out_lines[-1] = (text_, title_type, title_index, next_index,(scope[0], (scope[0][0]+1, 0)))
  243. return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope,correction_content
  244. def extract_addr(content):
  245. '''
  246. 通过正则提取地址
  247. :param content: 公告预处理后文本
  248. :return:
  249. '''
  250. addr_bidopen_text = ''
  251. ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
  252. if ser:
  253. addr_bidopen_text = ser.group(0)
  254. return addr_bidopen_text
  255. if __name__ == "__main__":
  256. # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
  257. # html = f.read()
  258. #
  259. l = []
  260. import pandas as pd
  261. from collections import Counter
  262. from BiddingKG.dl.interface import Preprocessing
  263. from BiddingKG.dl.interface.get_label_dic import get_all_label
  264. from bs4 import BeautifulSoup
  265. import json
  266. # df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
  267. # df['len']= df['招标内容'].apply(lambda x: len(x))
  268. # print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
  269. # print(len([it for it in df['len'] if it>1500]))
  270. # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
  271. # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx')
  272. # df = df[df['docid'].isin(df1['docid'])]
  273. #
  274. # df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
  275. # print(df.columns, len(df))
  276. #
  277. #
  278. # # def get_text(html):
  279. # # soup = BeautifulSoup(html, 'lxml')
  280. # # text = soup.get_text()
  281. # # return text
  282. # # df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x))
  283. # # df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1)
  284. # # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
  285. # # df1 = df[['docid', '标签']]
  286. #
  287. # n = 0
  288. # datas = []
  289. # for id,title, html in zip(df['docid'],df['doctitle'], df['dochtmlcon']):
  290. # # if id not in [289647738, 289647739]:
  291. # # continue
  292. # # print(id, type(id))
  293. # # parse_document = ParseDocument(html, True)
  294. # # requirement_text, aptitude_text = extract_parameters(parse_document)
  295. # # if re.search('资\s*[格质]', html)==None:
  296. # # continue
  297. #
  298. # list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[id,html,"","",title,'', '']],useselffool=True)
  299. # sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
  300. #
  301. # # sentence2_list = []
  302. #
  303. # parse_document = ParseDocument(html, True, list_obj=sentence2_list)
  304. # requirement_text, aptitude_text = extract_parameters(parse_document)
  305. # # if len(aptitude_text)>0:
  306. # # datas.append((id, aptitude_text[:1500]))
  307. # # print(id, aptitude_text[:10], aptitude_text[-20:])
  308. # # else:
  309. # # parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
  310. # # requirement_text, aptitude_text = extract_parameters(parse_document)
  311. #
  312. # # if 0<len(aptitude_text)<20:
  313. # # l.append(len(aptitude_text))
  314. # # n += 1
  315. # # print(id, aptitude_text)
  316. # # if n > 5:
  317. # # break
  318. #
  319. # if len(requirement_text)>0:
  320. # label_dic = get_all_label(title, list_articles[0].content)
  321. # # datas.append((id, requirement_text))
  322. # datas.append((id, requirement_text, label_dic))
  323. #
  324. # c = Counter(out_lines)
  325. # print(c.most_common(1000))
  326. # #
  327. # # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
  328. # # df.to_excel('E:/公告资质要求提取结果.xlsx')
  329. #
  330. # df = pd.DataFrame(datas, columns=['docid', '招标内容', '标签'])
  331. # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
  332. # df.to_excel('E:/公告招标内容提取结果2.xlsx')
  333. # if len(aptitude_text)> 1000:
  334. # print(id, aptitude_text[:10], aptitude_text[-20:])
  335. # print(Counter(l).most_common(50))
  336. # print(len(df), len(l), min(l), max(l), sum(l)/len(l))
  337. # n1 = len([it for it in l if it < 500])
  338. # n2 = len([it for it in l if it < 1000])
  339. # n3 = len([it for it in l if it < 1500])
  340. # n4 = len([it for it in l if it < 2000])
  341. # print(n1, n2, n3, n4, n1/len(l), n2/len(l), n3/len(l), n4/len(l))
  342. # parse_document = ParseDocument(html,True)
  343. # requirement_text, new_list_policy, aptitude_text = extract_parameters(parse_document)
  344. # print(aptitude_text)
  345. # sentence_text = '5、要求:3.1投标其他条件:1、中国宝武集团项目未列入禁入名单的投标人。2、具有有效的营业执照;'
  346. # begin_index = 0
  347. # for item in re.finditer('[,。;;!!??]+', sentence_text):
  348. # end_index = item.end()
  349. # if end_index != len(sentence_text):
  350. # if end_index - begin_index < 6:
  351. # continue
  352. # new_sentence_text = sentence_text[begin_index:end_index]
  353. # print(new_sentence_text)
  354. # df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
  355. # docids = []
  356. # pos = neg = 0
  357. # for docid, text in zip(df['docid'], df['资质要求']):
  358. # if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
  359. # pos += 1
  360. # pass
  361. # else:
  362. # neg += 1
  363. # print(docid, text[:50])
  364. # docids.append(docid)
  365. # print('异常:%d, 正常:%d'%(neg, pos))
  366. # print(docids)