|
@@ -27,14 +27,14 @@ def extract_sentence_list(sentence_list):
|
|
|
sentence_text = sentence.sentence_text
|
|
|
begin_index = 0
|
|
|
end_index = 0
|
|
|
- for it in re.finditer('([\w:][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
|
|
|
+ for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
|
|
|
temp = it.group(0)
|
|
|
sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
|
|
|
- for item in re.finditer('[,。;;!!??]+', sentence_text):
|
|
|
+ for item in re.finditer('[,。;;!!?]+', sentence_text): # 20240725去掉英文问号,避免网址被分隔
|
|
|
end_index = item.end()
|
|
|
- if end_index!=len(sentence_text):
|
|
|
- if end_index-begin_index<6 and item.group()[-1] in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', item.group())==None:
|
|
|
- continue
|
|
|
+ # if end_index!=len(sentence_text):
|
|
|
+ # # if end_index-begin_index<6 and item.group(0) in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销,避免标题提取错误
|
|
|
+ # # continue
|
|
|
new_sentence_text = sentence_text[begin_index:end_index]
|
|
|
sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
|
|
|
if sentence.in_attachment:
|
|
@@ -53,16 +53,23 @@ def extract_sentence_list(sentence_list):
|
|
|
|
|
|
return new_sentence2_list, new_sentence2_list_attach
|
|
|
|
|
|
-requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围)([及与和](其它|\w{,2})要求)?" \
|
|
|
- "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::]|$)"
|
|
|
+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围|信息)([及与和](其它|\w{,2})要求)?" \
|
|
|
+ "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::,]|$)"
|
|
|
aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
|
|
|
+addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
|
|
|
+out_lines = []
|
|
|
|
|
|
-# out_lines = []
|
|
|
-
|
|
|
-def extract_parameters(parse_document):
|
|
|
+def extract_parameters(parse_document, content):
|
|
|
+ '''
|
|
|
+ 通过大纲、预处理后文本正则获取需要字段
|
|
|
+ :param parse_document: ParseDocument() 方法返回结果
|
|
|
+ :param content: 公告预处理后文本
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
list_data = parse_document.tree
|
|
|
requirement_text = ''
|
|
|
aptitude_text = ''
|
|
|
+ addr_bidopen_text = ''
|
|
|
|
|
|
_find_count = 0
|
|
|
_data_i = -1
|
|
@@ -74,10 +81,16 @@ def extract_parameters(parse_document):
|
|
|
# print(_data.keys())
|
|
|
if _type=="sentence":
|
|
|
if _data["sentence_title"] is not None:
|
|
|
- if re.search(requirement_pattern,_text) is not None:
|
|
|
+
|
|
|
+ outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
|
|
|
+ re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
|
|
|
+
|
|
|
+ if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
|
|
|
+ out_lines.append(outline)
|
|
|
childs = get_childs([_data])
|
|
|
for c in childs:
|
|
|
- requirement_text += c["text"]+"\n"
|
|
|
+ # requirement_text += c["text"]+"\n"
|
|
|
+ requirement_text += c["text"]
|
|
|
_data_i += len(childs)
|
|
|
_data_i -= 1
|
|
|
_data_i = -1
|
|
@@ -108,7 +121,6 @@ def extract_parameters(parse_document):
|
|
|
|
|
|
# elif re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', _text) and len(_text)<30 and re.search('资质|资格', _text):
|
|
|
# out_lines.append(outline)
|
|
|
-
|
|
|
if _type=="table":
|
|
|
list_table = _data["list_table"]
|
|
|
parent_title = _data["parent_title"]
|
|
@@ -119,18 +131,67 @@ def extract_parameters(parse_document):
|
|
|
cell_text = cell[0]
|
|
|
if len(cell_text)>120 and re.search(aptitude_pattern,cell_text) is not None:
|
|
|
aptitude_text += cell_text+"\n"
|
|
|
-
|
|
|
- return requirement_text,aptitude_text
|
|
|
+ _data_i = -1
|
|
|
+ while _data_i < len(list_data) - 1:
|
|
|
+ _data_i += 1
|
|
|
+ _data = list_data[_data_i]
|
|
|
+ _type = _data["type"]
|
|
|
+ _text = _data["text"].strip()
|
|
|
+ # print(_data.keys())
|
|
|
+ if _type == "sentence":
|
|
|
+ if _data["sentence_title"] is not None:
|
|
|
+ if re.search(addr_bidopen_pattern, _text[:20]) is not None:
|
|
|
+ childs = get_childs([_data], max_depth=1)
|
|
|
+ for c in childs:
|
|
|
+ addr_bidopen_text += c["text"]
|
|
|
+ _data_i += len(childs)
|
|
|
+ _data_i -= 1
|
|
|
+ if re.search('时间:', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
|
|
|
+ for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
|
|
|
+ b, e = ser.span()
|
|
|
+ addr_bidopen_text = addr_bidopen_text[b:e]
|
|
|
+ elif re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
|
|
|
+ addr_bidopen_text = ""
|
|
|
+ if addr_bidopen_text == "":
|
|
|
+ ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
|
|
|
+ if ser:
|
|
|
+ addr_bidopen_text = ser.group(0)
|
|
|
+ return requirement_text, aptitude_text, addr_bidopen_text
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
# with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
|
|
|
# html = f.read()
|
|
|
#
|
|
|
- # l = []
|
|
|
+ l = []
|
|
|
import pandas as pd
|
|
|
- # from collections import Counter
|
|
|
- # from BiddingKG.dl.interface import Preprocessing
|
|
|
+ from collections import Counter
|
|
|
+ from BiddingKG.dl.interface import Preprocessing
|
|
|
+ from BiddingKG.dl.interface.get_label_dic import get_all_label
|
|
|
+ from bs4 import BeautifulSoup
|
|
|
+ import json
|
|
|
+
|
|
|
+ df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
|
|
|
+ df['len']= df['招标内容'].apply(lambda x: len(x))
|
|
|
+ print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
|
|
|
+ print(len([it for it in df['len'] if it>1500]))
|
|
|
+
|
|
|
# df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
|
|
|
+ # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx')
|
|
|
+ # df = df[df['docid'].isin(df1['docid'])]
|
|
|
+ #
|
|
|
+ # df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
|
|
|
+ # print(df.columns, len(df))
|
|
|
+ #
|
|
|
+ #
|
|
|
+ # # def get_text(html):
|
|
|
+ # # soup = BeautifulSoup(html, 'lxml')
|
|
|
+ # # text = soup.get_text()
|
|
|
+ # # return text
|
|
|
+ # # df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x))
|
|
|
+ # # df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1)
|
|
|
+ # # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
|
|
|
+ # # df1 = df[['docid', '标签']]
|
|
|
+ #
|
|
|
# n = 0
|
|
|
# datas = []
|
|
|
# for id,title, html in zip(df['docid'],df['doctitle'], df['dochtmlcon']):
|
|
@@ -139,8 +200,8 @@ if __name__ == "__main__":
|
|
|
# # print(id, type(id))
|
|
|
# # parse_document = ParseDocument(html, True)
|
|
|
# # requirement_text, aptitude_text = extract_parameters(parse_document)
|
|
|
- # if re.search('资\s*[格质]', html)==None:
|
|
|
- # continue
|
|
|
+ # # if re.search('资\s*[格质]', html)==None:
|
|
|
+ # # continue
|
|
|
#
|
|
|
# list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[id,html,"","",title,'', '']],useselffool=True)
|
|
|
# sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
|
|
@@ -149,12 +210,12 @@ if __name__ == "__main__":
|
|
|
#
|
|
|
# parse_document = ParseDocument(html, True, list_obj=sentence2_list)
|
|
|
# requirement_text, aptitude_text = extract_parameters(parse_document)
|
|
|
- # if len(aptitude_text)>0:
|
|
|
- # datas.append((id, aptitude_text[:1500]))
|
|
|
- # print(id, aptitude_text[:10], aptitude_text[-20:])
|
|
|
- # else:
|
|
|
- # parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
|
|
|
- # requirement_text, aptitude_text = extract_parameters(parse_document)
|
|
|
+ # # if len(aptitude_text)>0:
|
|
|
+ # # datas.append((id, aptitude_text[:1500]))
|
|
|
+ # # print(id, aptitude_text[:10], aptitude_text[-20:])
|
|
|
+ # # else:
|
|
|
+ # # parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
|
|
|
+ # # requirement_text, aptitude_text = extract_parameters(parse_document)
|
|
|
#
|
|
|
# # if 0<len(aptitude_text)<20:
|
|
|
# # l.append(len(aptitude_text))
|
|
@@ -163,11 +224,20 @@ if __name__ == "__main__":
|
|
|
# # if n > 5:
|
|
|
# # break
|
|
|
#
|
|
|
+ # if len(requirement_text)>0:
|
|
|
+ # label_dic = get_all_label(title, list_articles[0].content)
|
|
|
+ # # datas.append((id, requirement_text))
|
|
|
+ # datas.append((id, requirement_text, label_dic))
|
|
|
+ #
|
|
|
# c = Counter(out_lines)
|
|
|
# print(c.most_common(1000))
|
|
|
+ # #
|
|
|
+ # # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
|
|
|
+ # # df.to_excel('E:/公告资质要求提取结果.xlsx')
|
|
|
#
|
|
|
- # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
|
|
|
- # df.to_excel('E:/公告资质要求提取结果.xlsx')
|
|
|
+ # df = pd.DataFrame(datas, columns=['docid', '招标内容', '标签'])
|
|
|
+ # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
|
|
|
+ # df.to_excel('E:/公告招标内容提取结果2.xlsx')
|
|
|
|
|
|
# if len(aptitude_text)> 1000:
|
|
|
# print(id, aptitude_text[:10], aptitude_text[-20:])
|
|
@@ -193,14 +263,17 @@ if __name__ == "__main__":
|
|
|
# new_sentence_text = sentence_text[begin_index:end_index]
|
|
|
# print(new_sentence_text)
|
|
|
|
|
|
- df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
|
|
|
- pos = neg = 0
|
|
|
- for docid, text in zip(df['docid'], df['资质要求']):
|
|
|
- if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
|
|
|
- pos += 1
|
|
|
- pass
|
|
|
- else:
|
|
|
- neg += 1
|
|
|
- print(docid, text[:50])
|
|
|
- print('异常:%d, 正常:%d'%(neg, pos))
|
|
|
+ # df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
|
|
|
+ # docids = []
|
|
|
+ # pos = neg = 0
|
|
|
+ # for docid, text in zip(df['docid'], df['资质要求']):
|
|
|
+ # if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
|
|
|
+ # pos += 1
|
|
|
+ # pass
|
|
|
+ # else:
|
|
|
+ # neg += 1
|
|
|
+ # print(docid, text[:50])
|
|
|
+ # docids.append(docid)
|
|
|
+ # print('异常:%d, 正常:%d'%(neg, pos))
|
|
|
+ # print(docids)
|
|
|
|