#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: bidikeji
@time: 2024/12/26 10:31

Extract selected fields from an HTML document by matching regex patterns
against the keys of the key/value tree built by ``Html2KVTree``.
"""
from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
import re

# Key patterns. Each one is passed unchanged to ``Html2KVTree.extract_kv``.
# Raw strings are used so that ``\w`` / ``\d`` are not treated as (invalid)
# Python string escapes; the resulting pattern text is unchanged.
requirement_pattern = r"(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
                      r"(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                      r"|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
aptitude_pattern = r"((资格|资质)[的及]?(要求|条件)|竞买资格及要求|供应商报价须知)([::,]|$)|(竞买|竞买人|竞投人|投标人|报价人)?资格(条件)?:|按以下要求参与竞买|(报名|竞买|投标)(条件|资格)"
pinmu_name_pattern = r"采购品目(名称)?([::,]|$)"
addr_bidopen_pattern = r"([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)"
addr_bidsend_pattern = r"((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"

# Fields returned in the first ("single") dictionary.
pattern_dic_single = {'requirement': requirement_pattern,
                      'aptitude': aptitude_pattern,
                      'pinmu_name': pinmu_name_pattern}
# Fields returned in the second ("address") dictionary.
pattern_dic_addr = {'addr_bidopen': addr_bidopen_pattern,
                    'addr_bidsend': addr_bidsend_pattern}

# A "single" value is kept only if it contains at least two consecutive
# Chinese characters.
_CHINESE_TEXT_RE = re.compile('[\u4e00-\u9fa5]{2,}')
# Pulls the address out of a value that mixes time and address text.
# The group must be named ``addr`` because it is read with ``.group('addr')``.
_ADDR_IN_TEXT_RE = re.compile(r'地[点址]:(?P<addr>[\w()()【】-]{5,50})[,。]')
# An address value is kept only if it contains one of these markers.
_ADDR_LIKE_RE = re.compile(r'\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司')


def _first_nonempty_value(kv_list):
    """Return the stripped 'value' of the first entry whose value is not blank, else ''."""
    for item in kv_list:
        text = item.get('value', '').strip()
        if text != '':
            return text
    return ''


def _narrow_to_address(value):
    """Return only the address part when *value* mixes a time and an address.

    The value is narrowed only when it contains '时间:' and the address
    pattern matches; otherwise it is returned unchanged.
    """
    if '时间:' not in value:
        return value
    match = _ADDR_IN_TEXT_RE.search(value)
    return match.group('addr') if match else value


def get_kvtree_value(html):
    """Extract fields from *html* by regex-matching keys of its KV tree.

    For every pattern in ``pattern_dic_single`` and ``pattern_dic_addr`` the
    pattern is passed to ``Html2KVTree(html).extract_kv`` and the first
    non-blank ``'value'`` of the returned entries is taken as the candidate.

    :param html: HTML source passed to ``Html2KVTree``.
    :return: tuple ``(kv_single_dic, kv_addr_dic)``.
        ``kv_single_dic`` maps keys of ``pattern_dic_single`` to values that
        contain at least two consecutive Chinese characters.
        ``kv_addr_dic`` maps keys of ``pattern_dic_addr`` to values that look
        like an address (see ``_ADDR_LIKE_RE``). Fields with no acceptable
        value are omitted.
    """
    _pd = Html2KVTree(html)
    kv_single_dic = {}  # fields reported on their own
    kv_addr_dic = {}  # fields reported in the address dictionary

    for field, key_pattern in pattern_dic_single.items():
        value = _first_nonempty_value(_pd.extract_kv(key_pattern))
        # Keep only values containing two or more Chinese characters.
        if value != '' and _CHINESE_TEXT_RE.search(value):
            kv_single_dic[field] = value

    for field, key_pattern in pattern_dic_addr.items():
        value = _first_nonempty_value(_pd.extract_kv(key_pattern))
        if value != '':
            value = _narrow_to_address(value)
        # Keep only address-like values, so that placeholders such as
        # "文件获取地点:--" are not extracted (original note cites 571236792,
        # presumably a sample document id).
        if value != '' and _ADDR_LIKE_RE.search(value):
            kv_addr_dic[field] = value

    return kv_single_dic, kv_addr_dic


if __name__ == "__main__":
    with open('d:/html/2.html', encoding='utf-8') as f:
        html = f.read()
    rs = get_kvtree_value(html)
    print(rs)