#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ @author: bidikeji @time: 2024/7/19 10:05 """ import re from BiddingKG.dl.interface.htmlparser import ParseDocument,get_childs class Sentence2(): def __init__(self,text,sentence_index,wordOffset_begin,wordOffset_end): self.name = 'sentence2' self.text = text self.sentence_index = sentence_index self.wordOffset_begin = wordOffset_begin self.wordOffset_end = wordOffset_end def get_text(self): return self.text def extract_sentence_list(sentence_list): new_sentence2_list = [] new_sentence2_list_attach = [] for sentence in sentence_list: sentence_index = sentence.sentence_index sentence_text = sentence.sentence_text begin_index = 0 end_index = 0 for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d\.、,。a-zA-Z]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、 temp = it.group(0) sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:]) for item in re.finditer('[,。;;!!?]+', sentence_text): # 20240725去掉英文问号,避免网址被分隔 end_index = item.end() # if end_index!=len(sentence_text): # # if end_index-begin_index<6 and item.group(0) in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销,避免标题提取错误 # # continue if end_index != len(sentence_text) and re.match('[一二三四五六七八九十\d.]{1,2}[、,.]+$', sentence_text[begin_index:end_index]): # 避免表格序号和内容在不同表格情况 例:293178161 continue new_sentence_text = sentence_text[begin_index:end_index] sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index) if sentence.in_attachment: new_sentence2_list_attach.append(sentence2) else: new_sentence2_list.append(sentence2) begin_index = end_index if end_index!=len(sentence_text): end_index = len(sentence_text) new_sentence_text = sentence_text[begin_index:end_index] sentence2 = Sentence2(new_sentence_text, sentence_index, begin_index, end_index) if sentence.in_attachment: new_sentence2_list_attach.append(sentence2) else: new_sentence2_list.append(sentence2) return new_sentence2_list, new_sentence2_list_attach requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \ "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \ "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)" aptitude_pattern = "(资格要求|资质要求)([::,]|$)" addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)" addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)" pinmu_name_pattern = "采购品目名称:(\w{2,50})[,。]" out_lines = [] def extract_parameters(parse_document): ''' 通过大纲、预处理后文本正则获取需要字段 :param parse_document: ParseDocument() 方法返回结果 :return: ''' list_data = parse_document.tree requirement_text = '' # 采购内容 aptitude_text = '' # 资质要求 addr_bidopen_text = '' # 开标地址 addr_bidsend_text = '' # 投标地址 requirement_scope = [] # 采购内容始末位置 pinmu_name = '' # 品目名称 _find_count = 0 _data_i = -1 while _data_i0 else (_data['sentence_index'], _data['wordOffset_end']) requirement_scope.append(b) requirement_scope.append(e) _data_i += len(childs) _data_i -= 1 _data_i = -1 while _data_i120 and re.search(aptitude_pattern,cell_text) is not None: aptitude_text += cell_text+"\n" _data_i = -1 while _data_i < len(list_data) - 1: _data_i += 1 _data = list_data[_data_i] _type = _data["type"] _text = _data["text"].strip() # print(_data.keys()) if _type == "sentence": if _data["sentence_title"] is not None: if re.search(addr_bidopen_pattern, _text[:20]) is not None: childs = get_childs([_data], max_depth=1) for c in childs: addr_bidopen_text += c["text"] _data_i += len(childs) _data_i -= 1 elif re.search(addr_bidsend_pattern, _text[:20]): childs = get_childs([_data], max_depth=1) for c in childs: addr_bidsend_text += c["text"] _data_i += len(childs) _data_i -= 1 elif re.search(pinmu_name_pattern, _text): pinmu_name += re.search(pinmu_name_pattern, _text).group(1) if re.search('时间:', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text): for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text): b, e = ser.span() addr_bidopen_text = addr_bidopen_text[b:e] elif re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况 addr_bidopen_text = "" if re.search('时间:', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text): for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text): b, e = ser.span() addr_bidsend_text = addr_bidsend_text[b:e] return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name def extract_addr(content): ''' 通过正则提取地址 :param content: 公告预处理后文本 :return: ''' addr_bidopen_text = '' ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content) if ser: addr_bidopen_text = ser.group(0) return addr_bidopen_text if __name__ == "__main__": # with open('D:\html/2.html', 'r', encoding='UTF-8') as f: # html = f.read() # l = [] import pandas as pd from collections import Counter from BiddingKG.dl.interface import Preprocessing from BiddingKG.dl.interface.get_label_dic import get_all_label from bs4 import BeautifulSoup import json df = pd.read_excel('E:/公告招标内容提取结果2.xlsx') df['len']= df['招标内容'].apply(lambda x: len(x)) print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len'])) print(len([it for it in df['len'] if it>1500])) # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv') # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx') # df = df[df['docid'].isin(df1['docid'])] # # df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True) # print(df.columns, len(df)) # # # # def get_text(html): # # soup = BeautifulSoup(html, 'lxml') # # text = soup.get_text() # # return text # # df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x)) # # df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1) # # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2)) # # df1 = df[['docid', '标签']] # # n = 0 # datas = [] # for id,title, html in zip(df['docid'],df['doctitle'], df['dochtmlcon']): # # if id not in [289647738, 289647739]: # # continue # # print(id, type(id)) # # parse_document = ParseDocument(html, True) # # requirement_text, aptitude_text = extract_parameters(parse_document) # # if re.search('资\s*[格质]', html)==None: # # continue # # list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[id,html,"","",title,'', '']],useselffool=True) # sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0]) # # # sentence2_list = [] # # parse_document = ParseDocument(html, True, list_obj=sentence2_list) # requirement_text, aptitude_text = extract_parameters(parse_document) # # if len(aptitude_text)>0: # # datas.append((id, aptitude_text[:1500])) # # print(id, aptitude_text[:10], aptitude_text[-20:]) # # else: # # parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach) # # requirement_text, aptitude_text = extract_parameters(parse_document) # # # if 0 5: # # break # # if len(requirement_text)>0: # label_dic = get_all_label(title, list_articles[0].content) # # datas.append((id, requirement_text)) # datas.append((id, requirement_text, label_dic)) # # c = Counter(out_lines) # print(c.most_common(1000)) # # # # df = pd.DataFrame(datas, columns=['docid', '资质要求']) # # df.to_excel('E:/公告资质要求提取结果.xlsx') # # df = pd.DataFrame(datas, columns=['docid', '招标内容', '标签']) # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2)) # df.to_excel('E:/公告招标内容提取结果2.xlsx') # if len(aptitude_text)> 1000: # print(id, aptitude_text[:10], aptitude_text[-20:]) # print(Counter(l).most_common(50)) # print(len(df), len(l), min(l), max(l), sum(l)/len(l)) # n1 = len([it for it in l if it < 500]) # n2 = len([it for it in l if it < 1000]) # n3 = len([it for it in l if it < 1500]) # n4 = len([it for it in l if it < 2000]) # print(n1, n2, n3, n4, n1/len(l), n2/len(l), n3/len(l), n4/len(l)) # parse_document = ParseDocument(html,True) # requirement_text, new_list_policy, aptitude_text = extract_parameters(parse_document) # print(aptitude_text) # sentence_text = '5、要求:3.1投标其他条件:1、中国宝武集团项目未列入禁入名单的投标人。2、具有有效的营业执照;' # begin_index = 0 # for item in re.finditer('[,。;;!!??]+', sentence_text): # end_index = item.end() # if end_index != len(sentence_text): # if end_index - begin_index < 6: # continue # new_sentence_text = sentence_text[begin_index:end_index] # print(new_sentence_text) # df = pd.read_excel('E:/公告资质要求提取结果.xlsx') # docids = [] # pos = neg = 0 # for docid, text in zip(df['docid'], df['资质要求']): # if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]): # pos += 1 # pass # else: # neg += 1 # print(docid, text[:50]) # docids.append(docid) # print('异常:%d, 正常:%d'%(neg, pos)) # print(docids)