#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Helpers for cutting sentence text into punctuation-delimited pieces and for
collecting text that matches the "requirement" / "qualification" heading
patterns defined in this module.

@author: bidikeji
@time: 2024/7/19 10:05
"""
import re

from BiddingKG.dl.interface.htmlparser import ParseDocument,get_childs

# An enumeration marker ("一、", "2、", "3.1、" ...) glued to the preceding
# character, e.g. the "件2、" in "...配件2、招标品牌...".
_LIST_MARKER_RE = re.compile(
    r'([\w:][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、')
# A run of clause/sentence terminating punctuation.
_CLAUSE_END_RE = re.compile(r'[,。;;!!??]+')
# Enumeration prefix such as "一、" or "3.1、".
_NUMBERING_PREFIX_RE = re.compile(r'[一二三四五六七八九十\d.]+、')
# Terminators after which a very short piece is merged into the next one.
_SOFT_BREAKS = (',', ';', ';')
# Pieces shorter than this (in characters) are candidates for merging.
_MIN_PIECE_LEN = 6


class Sentence2:
    """One piece of a sentence plus its position inside that sentence.

    Attributes:
        name: always the literal ``'sentence2'``.
        text: text of the piece.
        sentence_index: index value passed in by the creator.
        wordOffset_begin: start offset of the piece.
        wordOffset_end: end offset of the piece.
    """

    def __init__(self,text,sentence_index,wordOffset_begin,wordOffset_end):
        self.name = 'sentence2'
        self.text = text
        self.sentence_index = sentence_index
        self.wordOffset_begin = wordOffset_begin
        self.wordOffset_end = wordOffset_end

    def get_text(self):
        """Return the text of this piece."""
        return self.text


def extract_sentence_list(sentence_list):
    """Split every sentence into ``Sentence2`` pieces at punctuation.

    For each sentence a comma is first inserted in front of enumeration
    markers that are glued to the previous character, then the text is cut
    after every run of ``, 。 ; ! ?``.  A piece shorter than six characters
    that ends in a comma/semicolon is not emitted on its own; it is carried
    over into the following piece.  Whatever remains after the last
    terminator becomes a final piece.

    The begin/end offsets stored on each ``Sentence2`` index into the text
    *after* the comma insertion, not into the original ``sentence_text``.

    :param sentence_list: iterable of objects exposing ``sentence_index``,
        ``sentence_text`` and ``in_attachment``.
    :return: ``(pieces, attachment_pieces)`` -- two lists of ``Sentence2``;
        pieces of sentences whose ``in_attachment`` is truthy go to the
        second list.
    """
    new_sentence2_list = []
    new_sentence2_list_attach = []
    for sentence in sentence_list:
        sentence_index = sentence.sentence_index
        sentence_text = sentence.sentence_text
        # All pieces of one sentence go to the same output list.
        if sentence.in_attachment:
            target = new_sentence2_list_attach
        else:
            target = new_sentence2_list

        # Example (doc 289699210):
        #   1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
        # "件2、" becomes "件,2、" so that the later split separates the items.
        # The iterator runs over the original string; str.replace rewrites
        # every occurrence of the matched text in the working copy.
        for marker in _LIST_MARKER_RE.finditer(sentence_text):
            found = marker.group(0)
            sentence_text = sentence_text.replace(found, found[0] + ',' + found[1:])

        text_len = len(sentence_text)
        begin_index = 0
        end_index = 0
        for item in _CLAUSE_END_RE.finditer(sentence_text):
            end_index = item.end()
            # NOTE(review): the last test matches the numbering pattern against
            # item.group(), which contains only punctuation, so it is always
            # None and never prevents a merge.  It may have been meant to look
            # at the piece text instead -- confirm before changing.
            if (end_index != text_len
                    and end_index - begin_index < _MIN_PIECE_LEN
                    and item.group()[-1] in _SOFT_BREAKS
                    and _NUMBERING_PREFIX_RE.match(item.group()) is None):
                continue
            target.append(Sentence2(sentence_text[begin_index:end_index],
                                    sentence_index, begin_index, end_index))
            begin_index = end_index

        # Trailing text without a terminator (or the whole text when no
        # terminator was found at all).
        if end_index != text_len:
            target.append(Sentence2(sentence_text[begin_index:text_len],
                                    sentence_index, begin_index, text_len))
    return new_sentence2_list, new_sentence2_list_attach


# Heading patterns: a known heading keyword followed by a colon or end of text.
requirement_pattern = (
    r"(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围)([及与和](其它|\w{,2})要求)?"
    "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::]|$)"
)
aptitude_pattern = "(资格要求|资质要求)([::,]|$)"


def extract_parameters(parse_document):
    """Return ``(requirement_text, aptitude_text)`` for a parsed document.

    NOTE(review): the body of this function is incomplete in the text this
    edit was produced from.  The loop header below references
    ``_data_i120`` and ``cell_text``, neither of which is assigned anywhere
    in the visible source, so calling the function raises ``NameError``.
    It looks as if everything between a ``<`` and a later ``>`` was lost;
    restore the original body from version control before relying on this.

    :param parse_document: object exposing a ``tree`` attribute.
    :return: tuple ``(requirement_text, aptitude_text)`` of strings.
    """
    list_data = parse_document.tree
    requirement_text = ''
    aptitude_text = ''
    _find_count = 0
    _data_i = -1
    while _data_i120 and re.search(aptitude_pattern,cell_text) is not None:
        aptitude_text += cell_text+"\n"
    return requirement_text,aptitude_text


def _main(xlsx_path='E:/公告资质要求提取结果.xlsx'):
    """Count how many extracted qualification texts start with a heading.

    Reads a spreadsheet with ``docid`` and ``资质要求`` columns.  A row counts
    as normal when its text begins with an enumeration marker and one of the
    ``aptitude_pattern`` headings occurs within the first 15 characters;
    every other row is printed and counted as abnormal.
    """
    import pandas as pd

    # Earlier commented-out experiments (batch extraction runs and length
    # statistics) were dropped from this script; recover them from version
    # control if they are still needed.
    df = pd.read_excel(xlsx_path)
    pos = neg = 0
    for docid, text in zip(df['docid'], df['资质要求']):
        if re.match(r'[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
            pos += 1
        else:
            neg += 1
            print(docid, text[:50])
    print('异常:%d, 正常:%d'%(neg, pos))


if __name__ == "__main__":
    _main()