import copy import os import re import sys import time import traceback from decimal import Decimal import pandas as pd from bs4 import BeautifulSoup sys.path.append(os.path.abspath(os.path.dirname(__file__)) + '/../../../') from BiddingKG.dl.common.Utils import spanWindow, timeFormat class PBPredictor: def __init__(self): self.stage_pattern, self.stage_priority_dict = get_stage_pattern() self.industry_pattern = get_industry_pattern() self.property_pattern, self.property_priority_dict = get_property_pattern() with open(os.path.abspath(os.path.dirname(__file__)) + '/structure_keyword.txt', 'r', encoding='utf-8') as f: self.structure_keyword_list = f.readlines() def get_col_from_prem(self, prem): tenderee, agency, product = None, None, None for item in prem: prem = item.get('prem') for key in prem.keys(): project = prem.get(key) role_list = project.get('roleList') for role_dict in role_list: if tenderee is None and role_dict.get('role_name') == 'tenderee': tenderee = role_dict.get('role_text') if agency is None and role_dict.get('role_name') == 'agency': agency = role_dict.get('role_text') product = item.get('product') begin_time = item.get('time_commencement') end_time = item.get('time_completion') return tenderee, agency, product, begin_time, end_time def predict(self, prem, list_articles, list_sentences, list_entitys, doctitle, code_name_dict, dochtmlcon, show=0): try: for list_article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys): list_sentence.sort(key=lambda x: x.sentence_index) tenderee, agency, product, begin_time, end_time = self.get_col_from_prem(prem) content = list_article.content dochtmlcon = re.sub('[\r\n]', '', dochtmlcon) # 获取无附件的html soup = BeautifulSoup(dochtmlcon, 'lxml') attachment_div = soup.find('div', class_='richTextFetch') if attachment_div: attachment_div.decompose() content_no_att = soup.text else: content_no_att = '' project_name = code_name_dict.get('name') project_code = code_name_dict.get('code') if 
project_code: project_code = project_code[0] else: project_code = None start_time = time.time() stage = extract_legal_stage(project_name+doctitle, self.stage_pattern, self.stage_priority_dict, product, tenderee=tenderee, agency=agency) if show: print('extract_legal_stage time', time.time()-start_time) start_time = time.time() industry1 = extract_industry(doctitle+content, self.industry_pattern) if show: print('extract_industry time', time.time()-start_time) start_time = time.time() industry = extract_industry(doctitle+content_no_att, self.industry_pattern) if show: print('extract_industry time', time.time()-start_time) start_time = time.time() # print('industry', industry, industry1) if not industry and industry1: industry = industry1 proportion1, proportion = extract_proportion(content) if show: print('extract_proportion time', time.time()-start_time) start_time = time.time() project_digest = extract_project_digest(content) if show: print('extract_project_digest time', time.time()-start_time) start_time = time.time() project_address = extract_project_address(list_sentence, list_entity) if show: print('extract_project_address time', time.time()-start_time) start_time = time.time() location = get_bid_location(doctitle+"\t"+project_name) if show: print('get_bid_location time', time.time()-start_time) start_time = time.time() project_name_refind, show_name_refind = get_project_name_refind(project_name, doctitle, tenderee, agency) if show: print('get_project_name_refind time', time.time()-start_time) start_time = time.time() has_elevator = extract_has_elevator(content) if show: print('extract_has_elevator time', time.time()-start_time) start_time = time.time() project_property = extract_project_property(doctitle+"\t"+project_name, self.property_pattern, self.property_priority_dict) if show: print('extract_project_property time', time.time()-start_time) start_time = time.time() total_invest, construct_install_fee, engineer_cost = extract_several_money(list_sentence, 
def extract_legal_stage(content, _pattern, priority_dict, product='', tenderee='', agency=''):
    """Pick the project stage named by the best-priority pattern hit.

    Args:
        content: Text to classify.
        _pattern: Regex whose named groups are the stage labels; every group
            that participates in a match contributes its name as a candidate.
        priority_dict: Maps a stage label to a sortable priority. Candidates
            are sorted ascending, so the smallest value wins.
        product: Product description. When it is non-empty and mentions none
            of 施工/工程/建设, '竣工阶段' is not accepted as the result.
        tenderee: Tenderee name, removed from ``content`` before matching.
        agency: Agency name, removed from ``content`` before matching.

    Returns:
        The selected stage label, or None when ``content`` is empty, when it
        is an auction/transfer/lease notice, or when no stage is found.
    """
    if not content:
        return None
    # Auction, transfer, property-right and lease notices never get a stage.
    if re.search("拍卖|转让|产权|出让|租赁|招租", content) is not None:
        return None

    # Remove the tenderee / agency names so stage words inside an organisation
    # name are not counted. The names are removed literally: they are data, not
    # patterns, and often contain regex metacharacters such as parentheses.
    # Empty or None names are skipped instead of being turned into "None".
    for org_name in (tenderee, agency):
        if org_name:
            content = content.replace(str(org_name), '')

    # Drop words that merely look like stage keywords.
    _content = re.sub("设计院|设计总院|造价咨询有限公司", "", content)

    def collect_stages(text):
        """Return [label, priority] for every named group hit in ``text``."""
        return [
            [label, priority_dict.get(label)]
            for hit in re.finditer(_pattern, text)
            for label, value in hit.groupdict().items()
            if value is not None
        ]

    list_stage = collect_stages(_content)
    if not list_stage:
        return None

    list_stage.sort(key=lambda x: x[1])
    stage = list_stage[0][0]

    # A product that is not construction-related cannot be in the completion
    # stage: fall back to the best candidate that is not '竣工阶段'.
    if product and not re.search('施工|工程|建设', str(product)):
        stage = next((s[0] for s in list_stage if s[0] != '竣工阶段'), None)

    # For '立项阶段', scan again with the confusable word '立项目' removed.
    # NOTE(review): the rescan results are appended to the existing candidate
    # list rather than replacing it, so the original '立项阶段' hit is never
    # dropped. Kept as-is; confirm whether a fresh list was intended.
    if stage == '立项阶段':
        sub_content = re.sub('立项目', '', _content)
        list_stage += collect_stages(sub_content)
        list_stage.sort(key=lambda x: x[1])
        stage = list_stage[0][0]

    return stage
re_str7 = '许可信息公开表|办理结果公示|审批信息公开表|验收公示表|信息披露|备案登记表|验收结果公示|' \ '审批意见公开|受理公示|施工许可|情况说明|合同纠纷调解|施工许可双公示|施工许可证|政策' re_str8 = '[〔〕()\[\]《》()【】{}{}[]<>]' re_str14 = '[〔(\[(【{{[<](采购结果|设计|环评|监理|施工|竣工|工程|EPC|验收|勘察设计|全过程造价咨询|造价咨询|勘察)[〕)\]》)】}}]>]' # 截取部分 re_str2 = '机场快线|科技园|产业园|工业园|工程|项目|施工|竣工|总承包|改造|监理|可研|验收|勘察设计|全过程造价咨询|造价咨询|勘察|可行性研究报告|EPC|初步设计|社会稳定风险评估' re_str3 = '关于(公布核准的|一种用于|核发|作出|同意|开展|调整|请求|规范|要求|进一步|遴选|领取)|关于公[布开示]|关于[为对]|关于|公司|集团|局|委托' re_str9 = '改扩建|建设|迁改|土建|测绘|(地震安全性|环境影响|水土保持)评(价估)' # 混淆部分 re_str10 = '局部' re_str17 = '(工程|信息|)有限|公司|集团|局|大学|院|学校|学院|中心' re_str18 = '(设计|造价|咨询|建设|项目|管理|工程)+有限|(信息|职业|技术|管理)+(大学|学校|学院|中心|院|所)' re_str26 = '服务类|(设计|造价|咨询|建设|项目|管理|工程)+(大学|学校|学院|中心|院|所|集团|局)' # 需判断删除部分 re_str15 = '[ 、#※.?<|①=-_—]|=-|##|\*+|[\((](001|[0-9]|[一二三四五六七八九十]+|)[\))]|(0001|0000|001|002|01|02)+' re_str16 = '[0-9][.、]' # 删除特定表达 re_str20 = '公共资源交易中心.*关于' re_str21 = '[\u4e00-\u9fff]{2,}市[\u4e00-\u9fff]{2,}区' re_str22 = '[\u4e00-\u9fff]{2,4}区[^至]' re_str23 = '.{1,2}招标公告|(PDF|pdf)(版|)' re_str25 = '(小区)$' re_str27 = '[\u4e00-\u9fff]{2,3}省|[\u4e00-\u9fff]{2,3}市' re_str_area = '华北|华南|华东|华中|西南|东北|西北' re_str_province = '北京|天津|河北|山西|内蒙古|广东|海南|广西|上海|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|重庆|四川|贵州|云南|西藏|黑龙江|辽宁|吉林|陕西|甘肃|青海|宁夏|新疆|台湾|香港|澳门' re_str_city = '东城|和平|石家庄|唐山|秦皇岛|邯郸|邢台|保定|张家口|承德|沧州|廊坊|衡水|太原|大同|阳泉|长治' \ '|晋城|朔州|晋中|运城|忻州|临汾|吕梁|呼和浩特|包头|乌海|赤峰|通辽|鄂尔多斯|呼伦贝尔|巴彦淖尔' \ '|乌兰察布|兴安盟|锡林郭勒盟|阿拉善盟|广州|韶关|深圳|珠海|汕头|佛山|江门|湛江|茂名|肇庆|惠州' \ '|梅州|汕尾|河源|阳江|清远|潮州|揭阳|云浮|海口|三亚|南宁|柳州|桂林|梧州|北海|防城港|钦州|贵港' \ '|玉林|百色|贺州|河池|来宾|崇左|黄浦|南京|无锡|徐州|常州|苏州|南通|连云港|淮安|盐城|扬州|镇江' \ '|泰州|宿迁|杭州|宁波|温州|嘉兴|湖州|绍兴|金华|衢州|舟山|台州|丽水|合肥|芜湖|蚌埠|淮南|马鞍山' \ '|淮北|铜陵|安庆|黄山|滁州|阜阳|宿州|六安|亳州|池州|宣城|福州|厦门|莆田|三明|泉州|漳州|南平' \ '|龙岩|宁德|南昌|景德镇|萍乡|九江|新余|鹰潭|赣州|吉安|宜春|抚州|上饶|济南|青岛|淄博|枣庄' \ '|东营|烟台|潍坊|济宁|泰安|威海|日照|临沂|德州|聊城|滨州|菏泽|郑州|开封|洛阳|平顶山|安阳|鹤壁' \ '|新乡|焦作|濮阳|许昌|漯河|三门峡|南阳|商丘|信阳|周口|驻马店|武汉|黄石|十堰|宜昌|襄阳|鄂州' \ '|荆门|孝感|荆州|黄冈|咸宁|随州|恩施土家族|长沙|株洲|湘潭|衡阳|邵阳|岳阳|常德|张家界|益阳' \ '|郴州|永州|怀化|娄底|湘西土家族|万州|成都|自贡|攀枝花|泸州|德阳|绵阳|广元|遂宁|内江|乐山' \ 
'|南充|眉山|宜宾|广安|达州|雅安|巴中|资阳|阿坝藏族羌族|甘孜藏族|凉山彝族|贵阳|六盘水|遵义' \ '|安顺|铜仁|黔西南布依族|毕节|黔东南苗族|黔南布依族|昆明|曲靖|玉溪|保山|昭通|丽江|普洱|临沧' \ '|楚雄彝族|红河哈尼族|文山|西双版纳傣族|大理白族|德宏傣族景颇族|怒江傈僳族|迪庆藏族|拉萨|昌都' \ '|山南|日喀则|那曲|阿里地区|林芝|哈尔滨|齐齐哈尔|鸡西|鹤岗|双鸭山|大庆|伊春|佳木斯|七台河' \ '|牡丹江|黑河|绥化|大兴安岭|沈阳|大连|鞍山|抚顺|本溪|丹东|锦州|营口|阜新|辽阳|盘锦|铁岭' \ '|朝阳|葫芦岛|长春|吉林|四平|辽源|通化|白山|松原|白城|延边朝鲜族|西安|铜川|宝鸡|咸阳|渭南' \ '|延安|汉中|榆林|安康|商洛|兰州|嘉峪关|金昌|白银|天水|武威|张掖|平凉|酒泉|庆阳|定西|陇南' \ '|临夏回族自治州|甘南藏族|西宁|海东|海北藏族|黄南藏族|海南藏族|果洛藏族|玉树藏族|海西蒙古族' \ '|银川|石嘴山|吴忠|固原|中卫|乌鲁木齐|克拉玛依|吐鲁番|哈密|昌吉|博尔塔拉蒙古|巴音郭楞蒙古' \ '|阿克苏|克孜勒苏柯尔克孜|喀什|和田地区|伊犁|伊犁哈萨克|塔城地区|阿勒泰|中山|东莞|天门|仙桃|潜江' \ '|石河子|五家渠|阿拉尔|图木舒克|三沙|儋州|涪陵|永川|西城|朝阳|丰台|石景山|海淀|门头沟' \ '|房山|通州|顺义|昌平|大兴|怀柔|平谷|密云|延庆|河东|河西|河北区|红桥|东丽|西青|津南|北辰' \ '|武清|宝坻|滨海|宁河|静海|蓟州|渝中|大渡口|江北|沙坪坝|九龙坡|南岸|北碚|綦江|大足|渝北' \ '|巴南|黔江|长寿|江津|合川|南川|璧山|铜梁|潼南|荣昌|开州|徐汇|长宁|静安|普陀|虹口|杨浦' \ '|闵行|宝山|嘉定|浦东新|金山|松江|青浦|奉贤|崇明|济源|神农架林区|五指山|文昌|琼海|万宁' \ '|东方|定安|屯昌|澄迈|临高|白沙黎族|昌江黎族|乐东黎族|陵水黎族|保亭黎族|琼中黎族|梁平' \ '|丰都|城口|垫江|忠县|云阳|奉节|巫山|巫溪|石柱|秀山|武隆|酉阳|彭水|南开|北屯|铁门关' \ '|双河|可克达拉|昆玉|胡杨河' re_str28 = '({})(地区)?|({})省?|({})[区市]?'.format(re_str_area, re_str_province, re_str_city) re_str29 = '(({})(地区)?({})省?)|(({})省?({})[区市]?)'.format(re_str_area, re_str_province, re_str_province, re_str_city) # 直接删除部分 re_str24 = '(的|)(竞争性谈判|竞争性磋商|磋商|中标|单一来源|招标|更正)(采购|)(公告|)' add_col = project_name if project_name else '' + doctitle if doctitle else '' if re.search(re_str11, add_col) and not re.search(re_str12, add_col): return '', '' from_col_list = [project_name, doctitle] name_refind_flag_dict = {'True': [], 'False': []} for col in from_col_list: name_refind = "" match_flag = False if col is not None and len(col) > 0: name_refind = col # 部分跳过 if re.search(re_str13, name_refind): continue # 替换特定表达 match = re.search(re_str20, name_refind) if match: name_refind = name_refind[match.span()[1]:] # 去掉干扰 name_refind = re.sub('年度', '年', name_refind) name_refind = re.sub(re_str4, '', name_refind) name_refind = re.sub(re_str14, '', name_refind) # print('name_refind', name_refind) # 连续截取工程前的,看哪一部分最适合当refind match = 
re.finditer(re_str2, name_refind) prob_name_list = [] last_index = 0 project_word_in_org = [] for m in match: # 混淆词,设施工程中的施工 if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']: continue # 判断是不是公司名里的工程 if re.search(re_str26, name_refind[m.span()[1]:]): project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))]) continue if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]): project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))]) continue if re.search(re_str18, name_refind[m.span()[1]:]): project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))]) continue match_flag = True prob_name_list.append(name_refind[last_index:m.span()[1]]) last_index = m.span()[1] # print('match_flag', match_flag, name_refind) # 找不到则用第二套截取 if not prob_name_list: match = re.finditer(re_str9, name_refind) last_index = 0 for m in match: # 混淆词,设施工程中的施工 if m.span()[0] > 0 and name_refind[m.span()[0]-1] in ['设']: continue # 判断是不是公司名里的工程 if re.search(re_str26, name_refind[m.span()[1]:]): project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))]) continue if re.search(re_str17, name_refind[m.span()[1]:m.span()[1]+3]): project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))]) continue if re.search(re_str18, name_refind[m.span()[1]:]): project_word_in_org.append(name_refind[max(0, m.span()[0]-1):min(m.span()[1]+1, len(name_refind))]) continue match_flag = True prob_name_list.append(name_refind[last_index:m.span()[1]]) last_index = m.span()[1] if not prob_name_list: prob_name_list = [name_refind] # print('prob_name_list', prob_name_list) # print('project_word_in_org', project_word_in_org) # 一开始不去掉括号里的内容,截取后再去掉 for i, name in enumerate(prob_name_list): # 括号内容大于一半字数,则不去掉括号中的字 match = re.search(re_str1, name) # print('name', name) # print('match', match) if match and len(match.group()) 
< len(name) / 2: name = re.sub(re_str1, "", name) name = re.sub(re_str8, "", name) prob_name_list[i] = name # 判断refind是否合法 # print('prob_name_list2', prob_name_list) name_refind = '' for name in prob_name_list: # 截取公司后的 match = re.finditer(re_str3, name) prob_name_list2 = [] for m in match: # 排除混淆的情况 if m.group() in re_str10 and re.search(re_str10, name): continue prob_name_list2.append(name[m.span()[1]:]) if prob_name_list2: name = prob_name_list2[-1] # 剔除工程类判断词 match1 = re.finditer(re_str6, name) for m1 in match1: # 混淆词,设施工程中的施工 if m1.span()[0] > 0 and name[m1.span()[0]-1] in ['设']: continue s_index, e_index = m1.span() word = name[s_index:e_index] s_index = s_index - 1 if s_index > 0 else 0 e_index = e_index + 1 if e_index < len(name) else len(name) word1 = name[s_index:e_index] if word1 in project_word_in_org: continue name = re.sub(re.escape(word), '=' * len(word), name) name = re.sub('={2,}', "", name) # 剔除一些无关词占用长度 if len(re.findall('[\u4e00-\u9fff]', name)) >= min_len \ and len(re.findall('[\u4e00-\u9fff]', re.sub(re_str5, '', name))) >= min_len: name_refind = name break if match_flag: name_refind_flag_dict['True'] += [name_refind] else: name_refind_flag_dict['False'] += [name_refind] # print('name_refind_flag_dict', name_refind_flag_dict) true_list = name_refind_flag_dict.get('True') false_list = name_refind_flag_dict.get('False') name_refind_candidate_list = [] if true_list: true_list.sort(key=lambda x: len(x), reverse=True) name_refind = true_list[0] name_refind_candidate_list += true_list # else: # name_refind = '' if false_list: false_list.sort(key=lambda x: len(x), reverse=True) name_refind_candidate_list += false_list # 对候选name_refind循环 name_refind = '' show_name_refind = '' for name_refind in name_refind_candidate_list: # 直接判断删除数字 match = re.match(re_str16, name_refind) if match and not re.match('[0-9]', name_refind[match.span()[1]:match.span()[1]+1]): name_refind = name_refind[match.span()[1]:] # 删除开头奇怪数字 match = re.match(re_str15, name_refind) if 
match and not re.match('[a-zA-Z地块号]', name_refind[match.span()[1]:match.span()[1]+1]): name_refind = name_refind[match.span()[1]:] # 删除期数 name_refind = re.sub('[1-9一二三四五六七八九十]期', '', name_refind) # 跳过'xx省xx市' if re.search(re_str21, name_refind): sub_word = re.sub(re_str21, '', name_refind) sub_word = re.sub(re_str2 + '|' + re_str9, '', sub_word) if len(sub_word) <= 1: name_refind = '' continue match27 = re.search(re_str27, name_refind) if match27 and len(match27.group()) == len(name_refind): name_refind = '' continue match28 = re.search(re_str28, name_refind) if match28 and len(match28.group()) == len(name_refind): name_refind = '' continue match29 = re.search(re_str29, name_refind) if match29 and len(match29.group()) == len(name_refind): name_refind = '' continue # 删除类似'招标公告'表达 match2 = re.match(re_str23, name_refind) if match2: name_refind = name_refind[match2.span()[1]:] name_refind = re.sub(re_str24, '', name_refind) # 跳过文件审批 if re.search(re_str19, name_refind): name_refind = '' continue # 跳过网上超市 if re.search(re_str11, name_refind): name_refind = '' continue show_name_refind = copy.deepcopy(name_refind) # 删除区 match2 = re.match(re_str22, name_refind) if match2: name_refind = name_refind[match2.span()[1]-1:] # 删除'小区表达' if len(name_refind) >= min_len + 2: name_refind = re.sub(re_str25, '', name_refind) # 判断name_refind是否是从公司中来的,过滤 if tenderee in [None, 'None', '-', '']: tenderee = '' if agency in [None, 'None', '-', '']: agency = '' try: if len(name_refind) >= 4 and (re.search(re.escape(name_refind[-4:]), tenderee) or re.search(re.escape(name_refind[-4:]), agency)): name_refind = '' show_name_refind = '' except: pass # 判断长度 if len(name_refind) < min_len: name_refind = '' show_name_refind = '' continue break return name_refind, show_name_refind def extract_industry(content, _pattern): list_stage = [] stage_dict = {} for stage_search in re.finditer(_pattern, content): for k,v in stage_search.groupdict().items(): if v is not None: list_stage.append(k) if k in 
def extract_project_digest(content):
    """Extract a short project overview from the announcement text.

    The overview starts at a heading-like phrase such as 项目概况 / 工程规模 /
    采购内容 and keeps at most the first three '。'-separated sentences of the
    10-300 characters that follow. Digests of 10 characters or more are passed
    through ``cut_win_bid_part`` to remove award-result text.

    Args:
        content: Plain text of the announcement.

    Returns:
        The digest, or "" when nothing is found or the result is shorter than
        10 characters.
    """
    if not content:
        return ""

    # The group must be named: the match is read back by that name below.
    _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
    _pattern_search = re.search(_pattern, content)
    if _pattern_search is None:
        return ""

    _find = _pattern_search.groupdict().get("projectDigest", "")
    _projectDigest = ""
    if len(_find) > 0:
        _projectDigest = "。".join(_find.split("。")[0:3])

    # Cut off award/winning-bid information.
    if len(_projectDigest) >= 10:
        _projectDigest = cut_win_bid_part(_projectDigest)

    if len(_projectDigest) < 10:
        _projectDigest = ""
    return _projectDigest
def extract_begin_end_time(list_sentence, list_entity):
    """Find the construction start and completion dates among time entities.

    For every entity of type "time", the sentence with the same
    ``sentence_index`` is looked up and a 20-token window around the entity is
    built with ``spanWindow``. The first element of that window is searched for
    开工(时间|日期) (start) and (竣工|完工)(时间|日期) (completion); on a hit the
    entity text is normalised with ``timeFormat``. Later hits overwrite
    earlier ones.

    Args:
        list_sentence: Sentence objects exposing ``sentence_index`` and
            ``tokens``.
        list_entity: Entity objects exposing ``entity_type``,
            ``sentence_index``, ``begin_index``, ``end_index`` and
            ``entity_text``.

    Returns:
        ``(begin_time, end_time)``; each is None when no date was found.
    """
    begin_time, end_time = None, None

    for entity in list_entity:
        if entity.entity_type != "time":
            continue
        for sentence in list_sentence:
            if sentence.sentence_index != entity.sentence_index:
                continue

            window = spanWindow(
                tokens=sentence.tokens,
                begin_index=entity.begin_index,
                end_index=entity.end_index,
                size=20,
                center_include=True,
                word_flag=True,
                text=entity.entity_text,
            )
            # NOTE(review): window[0] is presumably the text up to/including
            # the entity - confirm against spanWindow.
            context = window[0]

            if re.search("开工(时间|日期)", context) is not None:
                formatted = timeFormat(entity.entity_text)
                if len(formatted) > 0:
                    begin_time = formatted

            if re.search("(竣工|完工)(时间|日期)", context) is not None:
                formatted = timeFormat(entity.entity_text)
                if len(formatted) > 0:
                    end_time = formatted

    return begin_time, end_time
' + content + "
def extract_proportion(content, has_preffix=True):
    """Find the stated project area/length and normalise it to ㎡ or m.

    Patterns are tried from most to least specific (总建筑面积/总长, then
    建筑面积/全长, then any 面积/全长/项目规模). The first hit is parsed into a
    number, an optional Chinese multiplier (十/百/千/万/亿) and a unit.

    Args:
        content: Text to search; converted with ``str`` before matching.
        has_preffix: When False, a bare "number + unit" with no leading
            keyword is accepted as a last resort.

    Returns:
        ``(raw, normalised)``: the matched source text and the value converted
        to square metres ("…㎡") or metres ("…m"). ``("", "")`` when nothing is
        found or the number cannot be parsed.
    """
    if not content:
        return "", ""

    # The trailing ")" closes the (?P<proportion> group opened by each prefix.
    suffix = r"[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
    patterns = [
        "(?P<proportion>(总((建筑|建设)面积|长|长度))" + suffix,
        "(?P<proportion>((建筑|建设)面积|全长)" + suffix,
        "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix,
    ]
    if not has_preffix:
        patterns.append("(?P<proportion>" + suffix)

    text = str(content)
    _proportion = ""
    for _pattern in patterns:
        match = re.search(_pattern, text)
        if match:
            _proportion = match.groupdict().get("proportion", "") or ""
        if _proportion:
            break
    if not _proportion:
        return "", ""

    # Integer part (thousands separators allowed) plus optional decimals.
    number = re.search(r'(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
    if number is None:
        return "", ""
    try:
        digit = int(number.group('d1').replace(',', ''))
    except ValueError:
        # e.g. the "number" consisted of commas only
        return "", ""
    fraction = number.group('d2')
    if fraction:
        decimals = fraction[1:]
        # Decimal keeps the fractional part exact (no binary float noise).
        digit += Decimal(decimals) / Decimal(10 ** len(decimals))

    # Chinese magnitude words directly scale the number.
    multiple_cnt = 1
    cn_multiple = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
    remainder = _proportion.replace(number.group(), '')
    cn_match = re.search('[十百千万亿]+', remainder)
    if cn_match:
        for c in cn_match.group():
            multiple_cnt *= cn_multiple[c]
        remainder = remainder.replace(cn_match.group(), '')

    # Area if any area marker is present, otherwise length.
    unit = '㎡' if re.search('[平方㎡顷亩]+|[mM]2', remainder) else 'm'

    # Convert the written unit to the base unit.
    unit_match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', remainder)
    if unit_match:
        unit_text = unit_match.group()
        if unit == 'm':
            if re.search('[kK千公]', unit_text):
                multiple_cnt *= 1000
            elif '里' in unit_text:
                multiple_cnt *= Decimal('500')
        else:
            if '亩' in unit_text:
                multiple_cnt *= Decimal('666.67')
            elif '顷' in unit_text:
                multiple_cnt *= 10000
            elif re.search('千米|公里|k[mM㎡]', unit_text):
                multiple_cnt *= 1000000

    return _proportion, str(digit * multiple_cnt) + unit
Loop', i, len(list_sentence), time.time()-start_time1) start_time1 = time.time() last_text = '' next_text = '' if is_obj: text = sentence.sentence_text all_before_sentence += text if i > 0: last_text = list_sentence[i-1].sentence_text[-30:] if i < len(list_sentence) - 1: next_text = list_sentence[i+1].sentence_text[:30] else: text = sentence all_before_sentence += text if i > 0: last_text = list_sentence[i-1][-30:] if i < len(list_sentence) - 1: next_text = list_sentence[i+1][:30] start_time2 = time.time() if judge_yeji(len(all_before_sentence), all_before_sentence, 300+len(text)): # print('sentence yeji before ' + text) continue if show: print('extract_several_money time0.1', time.time()-start_time2) start_time2 = time.time() # if '项目概算总投资为' in text: _list, _ = get_several_money(text, 0, False, tables_and_divs=tables_and_divs) if show: print('extract_several_money time0.2', time.time()-start_time2) start_time2 = time.time() # logging.info('get_several_money _list ' + str(_list)) temp_list = [] for l in _list: if l[-1] == '总投资': if re.search('业绩', last_text+text+next_text): continue temp_list.append(l) _list = temp_list if show: print('extract_several_money time0.3', time.time()-start_time2) start_time2 = time.time() money_list += _list # if money_list: # break if show: print('extract_several_money time1', time.time()-start_time) start_time = time.time() money_type_dict = {} for money, _, _, _, money_type in money_list: for _type in money_type_list: if _type != money_type: continue # 科学计数法 try: if 'E+' in money: times = int(money.split('E+')[-1]) _money = float(float(money.split('E+')[0]) * (10 ** times)) else: _money = float(money) except: continue if _type in money_type_dict.keys(): money_type_dict[_type] += [_money] else: money_type_dict[_type] = [_money] # logging.info('money_type_dict ' + str(money_type_dict)) if show: print('extract_several_money time2', time.time()-start_time) start_time = time.time() result_list = [] for _type in money_type_list: if 
money_type_dict.get(_type): if _type == '建安费': temp_list = money_type_dict.get(_type) temp_list = list(set(temp_list)) money = 0 for m in temp_list: money += m result_list.append(money) else: result_list.append(money_type_dict.get(_type)[0]) else: result_list.append(None) if show: print('extract_several_money time3', time.time()-start_time) start_time = time.time() for i in range(len(result_list)): if result_list[i] is None: result_list[i] = 0 result_list[i] = float(result_list[i]) return result_list def extract_max_floor(content, html=None): def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None): _match = re.finditer(_reg, _content) _floor_list = [] for m in _match: if 'reg6' in _reg: _floor1 = content[max(0, m.span('reg6')[0]-1):m.span('reg6')[1]+1] elif 'reg4' in _reg: _floor1 = content[max(0, m.span('reg4')[0]-1):m.span('reg4')[1]+1] else: _floor1 = content[m.span()[0]:m.span()[1]] if judge_yeji(m.span()[0], _content, 300, _tables_and_divs, _floor1): continue if 'reg6' in _reg: _floor = m.group('reg6') _floor = re.split('[-~~]', _floor) _floor = max(int(_floor[0]), int(_floor[1])) elif 'reg4' in _reg: _floors = re.findall('\d+', m.group()) _floors = [int(x) for x in _floors] _floors1 = re.findall('[一二两三四五六七八九十]+', m.group()) _floors1 = [chinese_to_arabic(x) for x in _floors1] _floor = max(_floors + _floors1) elif '-' in m.group(): _floor = ''.join(re.findall('\d+-\d+', m.group())) if len(_floor) < 1: continue _floor = _floor.split('-') _floor = max(int(_floor[0]), int(_floor[1])) elif '/' in m.group(): _floor = m.group() _floor = re.sub('层', '', _floor) _floor = ''.join(re.findall('\d+/\d+', _floor)) # print('@1', _floor) if len(_floor) < 1: continue _floor = _floor.split('/') _floor = max(int(_floor[0]), int(_floor[1])) else: _floor = ''.join(re.findall('\d+', m.group())) if len(_floor) < 1: _floor = ''.join(re.findall('[一二两三四五六七八九十]+', m.group())) if len(_floor) < 1: continue _floor = chinese_to_arabic(_floor) _floor = int(_floor) if _reg2: 
_floor_list2 = match_floor(_reg2, _content[m.span()[1]:m.span()[1]+35]) # print('@2', _floor_list2) if _floor_list2: _floor2 = int(_floor_list2[0]) _floor = _floor + _floor2 _floor_list.append(_floor) return _floor_list reg = '(建筑|)(物|)(层数最大|最大层数|最高层|总层数|最大层|层数)[共为::]?(\d{1,3}|\d{1,3}层?/\d{1,3}|[一两二三四五六七八九十]{1,3})[层Ff]' reg0 = '局部(建筑|)(层数|)[共为]?(\d{1,3}|[一二两三四五六七八九十]{1,3})层' reg1 = '地上(建筑|)(层数|)[共为]?(\d{1,3}层?/\d{1,3}|\d{1,3}-\d{1,3}|\d{1,3}|[一二两三四五六七八九十]{1,3})层' reg2 = '地下(建筑|)(层数|)[共为]?(\d{1,3}层?/\d{1,3}|\d{1,3}-\d{1,3}|\d{1,3}|[一二两三四五六七八九十]{1,3})层' reg3 = '[到至]\d{1,3}层' reg4 = '层数分别.{1,20}(?P(\d{1,3}|[一二两三四五六七八九十]{1,3}))层' reg5 = '共(\d{1,3}|[一二两三四五六七八九十]{1,3})层' reg6 = '地上.{1,10}(?P\d{1,3}[-~~]\d{1,3})层' if html: soup = BeautifulSoup(html, 'lxml') tables_and_divs = soup.find_all(['table', 'div', 'p']) else: tables_and_divs = [] floor_list = [] # 常规 floor_list += match_floor(reg, content, _tables_and_divs=tables_and_divs) # 局部 floor_list += match_floor(reg0, content, _tables_and_divs=tables_and_divs) # x层到x层 floor_list += match_floor(reg3, content, _tables_and_divs=tables_and_divs) # 地上地下 floor_list += match_floor(reg1, content, _reg2=reg2, _tables_and_divs=tables_and_divs) floor_list += match_floor(reg2, content, _reg2=reg1, _tables_and_divs=tables_and_divs) # 层数分别为... xx层 floor_list += match_floor(reg4, content, _tables_and_divs=tables_and_divs) # 共xx层 floor_list += match_floor(reg5, content, _tables_and_divs=tables_and_divs) # 地上... 
def extract_has_steel_structure(content):
    """Tell whether an extracted structure description names a steel structure.

    :param content: the already-extracted structure string (may be None/empty)
    :return: 1 when any steel-structure keyword occurs in ``content``, else 0
    """
    if not content:
        return 0

    # '钢框结构' and '钢架结构' are separate alternatives; fused together they
    # could never match either term on its own.
    reg = '钢结构|门式钢架结构|钢框架结构|钢桁架结构|钢网架结构|钢框结构|钢架结构|钢骨架结构|钢骨架式结构|钢管桁架结构|轻钢骨架结构|钢桁架式结构|轻钢门架结构|轻钢门式结构|门式钢屋架结构|钢桁架框架结构|钢框架—支撑结构|钢桁梁结构|网架结构|网壳结构|索膜结构|塔桅结构'
    return 1 if re.search(reg, content) else 0
reg1 match = re.search(reg3, content) if match: wall_list.append(first_class) match1 = re.search(reg2, match.group()) if match1: wall_list2.append(match1.group()) match = re.search(reg4, content) if match: wall_list.append(first_class) wall_list2.append(match.group()) if wall_list: wall_list = list(set(wall_list)) wall_list.sort(key=lambda x: x) wall_list = ','.join(wall_list) else: wall_list = None if wall_list2: wall_list2 = list(set(wall_list2)) wall_list2.sort(key=lambda x: x) wall_list2 = ','.join(wall_list2) else: wall_list2 = None return wall_list, wall_list2 def cut_win_bid_part(_str): """ 截掉项目概述里的中标相关信息 """ origin_str = _str reg_list = [ "(评标|中标|中选)(结果|报告)(公示|)[::,,]", "第[一二三四1-9]名", "中标候选人(基本情况|附件)", "(排序|第[一二三1-3]|推荐)中标候选人|中标(候选人|结果)(排序|公示|信息)" ] for reg in reg_list: match = re.finditer(reg, _str) _index = len(origin_str) for m in match: _index = m.span()[0] break if _index == len(origin_str): continue match = re.finditer("[,,。;;]", _str[:_index]) _index = 0 for m in match: _index = m.span()[0] _str = _str[:_index] match = re.finditer("中[标选]|投标(人|企业|单位)|乙方|供应商|成交", _str) for m in match: index_start = m.span()[0] cut_str = re.split("[,,。;;]", _str[index_start:])[0] if len(cut_str) < 25: cut_str = _str[index_start:index_start+25] # cut_str = _str[index_start:index_start+15] # print("cut_str", cut_str) match2 = re.search("(¥|)[\d,.]+[万元亿]|公司|业绩|最终得分", cut_str) # print("match2", match2) if match2: # 排除混淆 match3 = re.search("中标法|获取招标文件|交接|支付|账户|结算|限价|承担|服务费|\*|×", cut_str) # print("match3", match3) if not match3: match4 = re.finditer("[,,。;;]", _str[:index_start]) index_stop = 0 for m4 in match4: index_stop = m4.span()[0] _str = _str[:index_stop] break # print(_str) return _str def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=None): def is_yeji_table(_tables_and_divs, _entity_text, show=0): start_time = time.time() if not _tables_and_divs: return 0 is_yeji = 0 reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况' for 
index3, obj in enumerate(_tables_and_divs): # if ' max_col_span: max_col_span = len(row.find_all('td')) # print('max_col_span', max_col_span) for index, row in enumerate(rows): if re.search(re.escape(_entity_text), str(row.get_text())): cells = row.find_all('td') if len(cells) == 0: continue # print('cells', cells[0]) # 第一列是否含业绩 # if re.search(reg_yeji, str(cells[0])): if re.search(reg_yeji, str(cells[0].get_text())): # logging.info('is_yeji_table 1') is_yeji = 1 # 前面几行是否有占多列的有业绩 else: if index > 0: for row2 in rows[:index][::-1]: # print('len(rows[index2])', len(row2.find_all('td'))) # if len(row2.find_all('td')) <= max_col_span / 2: # print(re.search('业绩', str(row2)), str(row2)) if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())): # logging.info('is_yeji_table 2') is_yeji = 1 break # 前面都找不到,那么找表格上方的两行 div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]] if not is_yeji and re.search(reg_yeji, ' '.join(div_list)): # logging.info('is_yeji_table 3') is_yeji = 1 break if show: print('is_yeji_table time', time.time()-start_time) return is_yeji # 先判断表格业绩 if tables_and_divs: yeji_table_flag = is_yeji_table(tables_and_divs, entity) if yeji_table_flag: # logging.info('yeji_table_flag 1') return yeji_table_flag if len(content) == 0: return 0 if end_index == 0: return 0 reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成' if re.search(reg_yeji, content[:end_index][-judge_len:]): # if '业绩' in content[:end_index][-judge_len:]: return 1 else: return 0 def get_several_money(sentence_text, found_yeji, in_attachment=False, tables_and_divs=[], show=0): def getDigitsDic(_unit): ''' @summary:拿到中文对应的数字 ''' DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9, "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9} return DigitsDic.get(_unit) def getMultipleFactor(_unit): ''' @summary:拿到单位对应的值 ''' MultipleFactor = 
{"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)} return MultipleFactor.get(_unit) def getUnifyMoney(money): ''' @summary:将中文金额字符串转换为数字金额 @param: money:中文金额字符串 @return: decimal,数据金额 ''' MAX_MONEY = 1000000000000 MAX_NUM = 12 #去掉逗号 money = re.sub("[,,]","",money) money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money) result = Decimal(0) chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"] # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"] chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千'] LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$") BigMoneypattern = re.compile("^零?(?P[%s])$"%("".join(chnDigits))) try: if re.search(LowMoneypattern,money) is not None: return Decimal(money) elif re.search(BigMoneypattern,money) is not None: return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney")) for factorUnit in chnFactorUnits: if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None: subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money) if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None: if MAX_MONEY/getMultipleFactor(factorUnit)1: if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None: result += Decimal(subMoneys[1]) elif len(subMoneys[1])==1: if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None: result += Decimal(getDigitsDic(subMoneys[1])) else: result += Decimal(getUnifyMoney(subMoneys[1])) break except Exception as e: # traceback.print_exc() return Decimal(0) return result start_time = time.time() # 提取表格用于判断业绩 # if tables_and_divs: # soup = BeautifulSoup(html, 'lxml') # # tables = soup.find_all('table') # tables_and_divs = soup.find_all(['table', 'div']) # else: # tables_and_divs = [] # 
if show: # print('get_several_money time1', time.time()-start_time) # start_time = time.time() money_list = [] # 使用正则识别金额 entity_type = "money" list_money_pattern = {"cn": "(()(?P百分之)?(?P[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())", "key_word": "((?P(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P(E-?\d+))?[百千]{,1})(?:[(\(]?(?P[%%‰折])*\s*,?((金额)?单位[::])?(?P[万亿]?(?:[美日欧]元|元)?(?P[台只吨斤棵株页亩方条天]*))\s*[)\)]?))", "front_m": "((?P(?:[(\(]?\s*(?P[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P(E-?\d+))?(?:,?)[百千]*)())", "behind_m": "(()()(?P\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P[万亿]?(?:[美日欧]元|元)(?P[台个只吨斤棵株页亩方条米]*))[\))]?)"} # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。 pattern_money = re.compile("%s|%s|%s|%s" % ( list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"], list_money_pattern["front_m"])) # 修复 元千万元 match = re.search('元[千百十]万元', sentence_text) if match: sentence_text = re.sub(re.escape(match.group()), match.group()[1:], sentence_text) # 修复 千多万元 match = re.search('[千百十]多万元', sentence_text) if match: sentence_text = re.sub(re.escape(match.group()), match.group()[0] + match.group()[2:], sentence_text) if show: print('get_several_money time2', time.time()-start_time) start_time = time.time() if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text): found_yeji += 1 if found_yeji >= 2: # 过滤掉业绩后面的所有金额 all_match = [] else: ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text) # 过滤掉收费标准里面的金额 if ser: all_match = 
re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))) else: all_match = re.finditer(pattern_money, sentence_text) if show: print('get_several_money time3', time.time()-start_time) start_time = time.time() for _match in all_match: if len(_match.group()) > 0: notes = '' # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位 unit = "" entity_text = "" start_index = "" end_index = "" text_beforeMoney = "" filter = "" filter_unit = False notSure = False science = "" if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]): # 2021/7/21过滤掉业绩后面金额 # print('金额在业绩后面: ', _match.group(0)) found_yeji += 1 break yeji_table_flag = 0 for k, v in _match.groupdict().items(): if v != "" and v is not None: if k == 'text_key_word': notSure = True if k.split("_")[0] == "money": entity_text = v # if is_yeji_table(tables_and_divs, entity_text): if judge_yeji(len(sentence_text), sentence_text, 300, tables_and_divs, entity_text): yeji_table_flag = 1 break # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)]) if entity_text.endswith(',00'): # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉 entity_text = entity_text[:-3] if k.split("_")[0] == "unit": if v == '万元' or unit == "": # 处理 预算金额(元):160万元 这种出现前后单位不一致情况 unit = v if k.split("_")[0] == "text": # print('text_before: ', _match.group(k)) text_beforeMoney = v if k.split("_")[0] == "filter": filter = v if re.search("filter_unit", k) is not None: filter_unit = True if k.split("_")[0] == 'science': science = v if yeji_table_flag: continue if filter != "": continue start_index, end_index = _match.span() start_index += len(text_beforeMoney) '''过滤掉手机号码作为金额''' if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney): # print('过滤掉手机号码作为金额') continue elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880' # print('过滤掉手机号码作为金额') continue if unit == "": # 2021/7/21 有明显金额特征的补充单位,避免被过滤 
if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)): if entity_text.endswith('万元'): unit = '万元' entity_text = entity_text[:-2] else: unit = '元' # print('1明显金额特征补充单位 元') elif re.search('USD[::]?$', text_beforeMoney): unit = '美元' elif re.search('EUR[::]?$', text_beforeMoney): unit = '欧元' elif re.search('JPY[::]?$', text_beforeMoney): unit = '日元' elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]): # print('两个金额连接后面的有单位,用后面单位') unit = '万元' elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None: if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text): unit = '万元' # print('金额较小且句子中有万元的,补充单位为万元') elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None: unit = '万元' else: unit = '元' # print('金额前面紧接关键词的补充单位 元') elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text): unit = '元' # print('3明显金额特征补充单位 元') else: # print('过滤掉没单位金额: ',entity_text) continue elif unit == '万元': if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text): unit = '元' elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # 五百亿以上的万元改为元 unit = '元' if unit.find("万") >= 0 and entity_text.find("万") >= 0: # 2021/7/19修改为金额文本有万,不计算单位 # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit) unit = "元" if re.search('.*万元万元', entity_text): # 2021/7/19 修正两个万元 # print(' 修正两个万元',entity_text) entity_text = entity_text.replace('万元万元', '万元') else: if filter_unit: continue entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text) # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney) if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额|投资估算|投资概算', sentence_text[max(0, _match.span()[0] - 
10):_match.span()[1]]): # 2021/8/5过滤掉总投资金额 # print('总投资金额: ', _match.group(0)) notes = '总投资' elif re.search('建安费|建安工程费|建筑安装工程费|建安工程造价', sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]): # 2021/11/18 投资金额不作为招标金额 notes = '建安费' elif re.search('总工程造价|总造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/12/20 工程造价不作为招标金额 notes = '工程造价' # 直接判断前面是否有业绩 if judge_yeji(max(0, _match.span()[0] - 10), sentence_text): continue if len(unit) > 0: if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8: # 2021/7/19 修正万元金额过大的情况 entity_text = str( getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000) unit = '元' # 修正金额后单位 重置为元 else: entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0])) else: if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len( entity_text.split('.')[0]) >= 8: entity_text = str(getUnifyMoney(entity_text) / 10000) # print('修正金额字段含万 过大的情况') else: entity_text = str(getUnifyMoney(entity_text)) if science and re.search('^E-?\d+$', science): # 科学计数 entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal( entity_text + science) < 10000000000 else entity_text # 结果大于100及小于100万才使用科学计算 if float(entity_text) > 100000000000: # float(entity_text)<100 or 2022/3/4 取消最小金额限制 continue if notSure and unit == "" and float(entity_text) > 100 * 10000: # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit) continue # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney, # filter, filter_unit)) if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额 # print('过滤掉可能是费率的金额') continue money_list.append((entity_text, start_index, end_index, unit, notes)) if show: print('get_several_money time4', time.time()-start_time) start_time = time.time() # 排除过小的金额 temp_list = [] for money in 
def chinese_to_arabic(ch_str):
    """Convert a short Chinese numeral (up to thousands) to an int.

    A digit is held back until the next character: if that is a unit
    (十/拾, 百, 千) the digit multiplies it, otherwise the digit is replaced
    by the newer one. A trailing digit is added as ones.

    :param ch_str: string made only of 一..九, 两, 十/拾, 百, 千.
    :return: the integer value (``0`` for an empty string), or ``None`` if
        the string contains any other character (including '零').
    """
    chinese_number_dict = {
        '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '十': 10, '拾': 10, '百': 100, '千': 1000,
    }
    units = (10, 100, 1000)

    # Validate everything first so no partial result is ever returned.
    values = []
    for ch in ch_str:
        value = chinese_number_dict.get(ch)
        if value is None:
            return None
        values.append(value)

    arabic_num = 0
    pending_digit = None
    for value in values:
        if value in units:
            # A bare unit ('十') counts as one of that unit.
            arabic_num += value if pending_digit is None else value * pending_digit
            pending_digit = None
        else:
            pending_digit = value
    if pending_digit:
        arabic_num += pending_digit
    return arabic_num


def reg_word_sort(reg):
    """Reorder a ``|``-separated alternation so longer words come first.

    Words of equal length keep their original relative order.

    :param reg: alternatives joined by ``|``.
    :return: the same alternatives, longest first, joined by ``|``.
    """
    return '|'.join(sorted(reg.split('|'), key=len, reverse=True))


def get_stage_pattern():
    """Build the project-stage regex and the stage priority table.

    :return: ``(stage_pattern, stage_priority_dict)`` where the pattern is
        an alternation of one named group per stage and the dict maps each
        stage name to an integer priority.
    """
    stage_dict = {
        "立项阶段": "立项|项目投资",
        "可研阶段": "可行性研究|可研",
        "环评阶段": "环境评价|环境影响|环境评测|环评|(水保|环保|环境保护)(编制|验收|监测)",
        "稳评阶段": "稳定风险|社会稳定|风险评估",
        "咨询阶段": "(水影响|能源|交通影响|地质灾害|地址灾害|地震安全性|地震安全性|气象|雷击风险|安全|海洋|森林环境)(评[价估测])|水土保持|(水|交|灾|震|能|气|安|海|林)评",
        "造价阶段": "(决算书|预算|结算|造价|决算)(编制|咨询)",
        "设计阶段": "(施工图(纸|)|初步|项目|工程)(方案|)设计|测绘|规划设计",
        "勘察阶段": "(勘察|勘查)设计|勘察技术|勘查|勘察",
        "施工图审": "(施工图(纸|)|防雷|消防|人防)审查|施工图审",
        "施工许可": "施工许可证",
        "施工准备": "施工准备|监理|资格预审|资审",
        "施工在建": "施工",
        "竣工阶段": "竣工|验收",
        "EPC总承包": "总承包|EPC"
    }
    stage_priority_dict = {
        "立项阶段": 1,
        "可研阶段": 3,
        "环评阶段": 2,
        "稳评阶段": 3,
        "咨询阶段": 2,
        "造价阶段": 2,
        "设计阶段": 4,
        "勘察阶段": 4,
        "施工图审": 2,
        "施工许可": 2,
        "施工准备": 3,
        "施工在建": 5,
        "竣工阶段": 3,
        "EPC总承包": 4
    }
    # One named group per stage, in declaration order.
    stage_pattern = "|".join(
        "(?P<%s>%s)" % (stage, words) for stage, words in stage_dict.items()
    )
    return stage_pattern, stage_priority_dict


def get_industry_pattern():
    """Compile the industry regex from ``proposedBuildingKeyword.xlsx``.

    The workbook is read from the directory of this module; column "类别"
    holds the industry name and column "关键词" one keyword per row. Each
    industry becomes a named group whose alternatives are its keywords.

    :return: a compiled pattern with one named group per industry.
    """
    filename = os.path.abspath(os.path.dirname(__file__)) + "/proposedBuildingKeyword.xlsx"
    df = pd.read_excel(filename)

    dict_industry_keywords = {}
    for _industry, _keyword in zip(df["类别"], df["关键词"]):
        # Empty cells are read as NaN; they (and blank strings) cannot be
        # used as alternatives.
        if not isinstance(_keyword, str) or not _keyword.strip():
            continue
        dict_industry_keywords.setdefault(_industry, set()).add(_keyword)

    list_industry_p = []
    for _industry, _keywords in dict_industry_keywords.items():
        # Set iteration order for strings changes between interpreter runs,
        # and alternation order decides which keyword wins. Fix the order:
        # longest first (as reg_word_sort does), ties broken by the text.
        ordered = sorted(_keywords, key=lambda kw: (-len(kw), kw))
        list_industry_p.append("(?P<%s>%s)" % (_industry, "|".join(ordered)))
    return re.compile("|".join(list_industry_p))


def get_property_pattern():
    """Build the project-property regex and the property priority table.

    :return: ``(property_pattern, property_priority_dict)`` where the
        pattern is an alternation of one named group per property and the
        dict maps each property name (plus "新建", which has no keywords)
        to an integer priority.
    """
    property_dict = {
        "复合性质": "扩迁建|扩改建|扩翻建|改扩建|改翻建|迁扩建|迁改建|迁扩建|翻改建|翻扩建",
        "迁建": "迁建|搬迁重建",
        "扩建": "扩建|增建|加建|扩大",
        "翻建": "翻建|拆除重建",
        "改建": "改造|改建|技改|提升|改进|整改",
        "装饰装修": "装修|室内装饰|室外装饰|装饰工程|装饰改造工程|装饰装修|装修装饰|幕墙工程|修缮|外墙翻新|翻新工程|整修|修补|装饰|维修",
        "拆除": "拆除",
        "恢复重建": "恢复重建|灾后重建",
        '其他': '整治|修复|环境治理|更换'
    }
    property_priority_dict = {
        "复合性质": 1,
        "迁建": 2,
        "扩建": 3,
        "翻建": 4,
        "改建": 5,
        "装饰装修": 6,
        "拆除": 7,
        "恢复重建": 8,
        '其他': 9,
        "新建": 10
    }
    # One named group per property, in declaration order.
    property_pattern = "|".join(
        "(?P<%s>%s)" % (name, words) for name, words in property_dict.items()
    )
    return property_pattern, property_priority_dict