|
@@ -1057,10 +1057,10 @@ class PREMPredict():
|
|
|
if _entity.entity_type == "money" and _entity.notes == '招标或中标金额' and _entity.label == 2:
|
|
|
# if channel_dic['docchannel'] == "招标公告":
|
|
|
if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
|
|
|
- _entity.values[0] = 0.51
|
|
|
+ _entity.values[0] = 0.55
|
|
|
_entity.set_Money(0, _entity.values) # 2021/11/18 根据公告类别把费用改为招标或中投标金额
|
|
|
else:
|
|
|
- _entity.values[1] = 0.51
|
|
|
+ _entity.values[1] = 0.55
|
|
|
_entity.set_Money(1, _entity.values)
|
|
|
|
|
|
def predict(self,list_sentences,list_entitys):
|
|
@@ -1486,7 +1486,7 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
|
|
|
"^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
|
- "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
|
|
|
+ "|^:贵公司参与|^:?你方于|^(胜出)?(中标|成交)[,。]|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
|
|
|
"|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
|
|
|
self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \
|
|
|
"|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \
|
|
@@ -3542,6 +3542,8 @@ class ProductAttributesPredictor():
|
|
|
i += 1
|
|
|
# print('过滤:产品单价包含金额外的字符数大于5个', tds[id3])
|
|
|
continue
|
|
|
+ else:
|
|
|
+ unitPrice = tds[id3]
|
|
|
if id4 != "":
|
|
|
if re.search('\w', tds[id4]):
|
|
|
brand = tds[id4]
|
|
@@ -5876,7 +5878,7 @@ class DistrictPredictor():
|
|
|
text = str(text).replace('(', '(').replace(')', ')')
|
|
|
text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
|
|
|
text = re.sub(
|
|
|
- '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村', # 570445994 广州路北侧 预测为 广州 路北
|
|
|
+ '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村|电信|联通|北京现代', # 570445994 广州路北侧 预测为 广州 路北
|
|
|
' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
@@ -5909,18 +5911,18 @@ class DistrictPredictor():
|
|
|
score = 2
|
|
|
else:
|
|
|
score = 1
|
|
|
- if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
+ if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
|
|
|
, text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
|
|
|
text[max(0, it.start(k) - 1):]):
|
|
|
score += 1
|
|
|
- # score += it.end(k) / len(text) / 10
|
|
|
+ score += it.end(k) / len(text) / 10
|
|
|
province_l.append((v, score * weight))
|
|
|
elif k in ['city', 'city1']:
|
|
|
if v in full_dic['city']:
|
|
|
score = 2
|
|
|
else:
|
|
|
score = 1
|
|
|
- if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
+ if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
|
|
|
, text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
|
|
|
text[max(0, it.start(k) - 1):]):
|
|
|
score += 1
|
|
@@ -5933,7 +5935,7 @@ class DistrictPredictor():
|
|
|
score = 2
|
|
|
else:
|
|
|
score = 0.5
|
|
|
- if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
+ if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站|地区|区域)'
|
|
|
, text[it.end(k):]) or (
|
|
|
re.match('\s*%s' % v, text) and it.start(k) < 2) or re.search(
|
|
|
'^((%s)|\-%s)' % (v, v), text[max(0, it.start(k) - 1):]):
|
|
@@ -6140,12 +6142,14 @@ class DistrictPredictor():
|
|
|
addr_bidsend = addr_dic.get('addr_bidsend', '')
|
|
|
addr_contact = addr_dic.get('addr_contact', '')
|
|
|
in_content = False
|
|
|
+ not_sure = True # 是否不确定地区
|
|
|
province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1
|
|
|
# print('关键词1:', province_l, city_l, district_l)
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
- if pred_city == "" or prob < 0.7 or max_score<2:
|
|
|
+ if pred_city_1 == "" or prob < 0.7 or max_score<2:
|
|
|
ree, addr = self.get_ree_addr(prem)
|
|
|
if ree in title:
|
|
|
ree = '##'
|
|
@@ -6160,27 +6164,33 @@ class DistrictPredictor():
|
|
|
city_l.extend(city_l2)
|
|
|
district_l.extend(district_l2)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis = big_area_2, pred_pro_2, pred_city_2, pred_dis_2
|
|
|
# print('关键词2:', province_l, city_l, district_l)
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
- if pred_city == "" or prob < 0.7 or max_score<2:
|
|
|
+ if re.search('省|市|自治', addr_project) and pred_pro_1 != '' and pred_pro_1 != pred_pro_2: # 如果有项目地址使用项目地址
|
|
|
+ not_sure = False
|
|
|
+ big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1
|
|
|
+ if not_sure and (pred_city_2 == "" or prob < 0.7 or max_score<2):
|
|
|
province_l3, city_l3, district_l3 = self.find_whole_areas('%s %s'%(addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
|
|
|
province_l.extend(province_l3)
|
|
|
city_l.extend(city_l3)
|
|
|
district_l.extend(district_l3)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis = big_area_3, pred_pro_3, pred_city_3, pred_dis_3
|
|
|
# print('关键词3:', province_l, city_l, district_l)
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
- if pred_city == "" or prob < 0.6 or max_score < 2:
|
|
|
+ if not_sure and (pred_city_3 == "" or prob < 0.6 or max_score < 2):
|
|
|
all_addr, tenderees = self.get_all_addr(list_entity)
|
|
|
province_l4, city_l4, district_l4 = self.find_whole_areas('%s %s %s' % (web_source_name, tenderees, all_addr), self.pettern, self.area_variance_dic, self.full_dic, weight=0.3)
|
|
|
province_l.extend(province_l4)
|
|
|
city_l.extend(city_l4)
|
|
|
district_l.extend(district_l4)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
|
|
|
- if prob < 0.6 or max_score < 4:
|
|
|
+ big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis = big_area_4, pred_pro_4, pred_city_4, pred_dis_4
|
|
|
+ if pred_pro_3 != pred_pro_4 and (prob < 0.6 or max_score < 2):
|
|
|
in_content = True
|
|
|
# print('关键词4:', province_l, city_l, district_l)
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
@@ -6828,7 +6838,7 @@ class TablePremExtractor(object):
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
- 'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况',
|
|
|
+ 'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况|^中标结果$',
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
@@ -6929,6 +6939,11 @@ class TablePremExtractor(object):
|
|
|
header_dic['tenderer'] = other_tenderer2
|
|
|
if 'win_sort' not in header_dic:
|
|
|
not_sure_winner = True
|
|
|
+ elif 'tenderer' not in header_dic and 'win_or_not' in header_dic:
|
|
|
+ if other_tenderer!="":
|
|
|
+ header_dic['tenderer'] = other_tenderer
|
|
|
+ elif other_tenderer2!="":
|
|
|
+ header_dic['tenderer'] = other_tenderer2
|
|
|
if all_winner == 1 and 'win_sort' in header_dic: # 标题有存管类公告不分排名
|
|
|
header_dic.pop('win_sort')
|
|
|
if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
@@ -7347,7 +7362,7 @@ class CandidateExtractor(object):
|
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论|^选择设备$', # 补充站源特别表达:例:577351909 选择设备 1 为中标 0 非中标
|
|
|
- "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$", #补充 368295593 投标个人/单位 提取
|
|
|
+ "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$|供应商单位名称$", #补充 368295593 投标个人/单位 提取
|
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
@@ -7383,6 +7398,8 @@ class CandidateExtractor(object):
|
|
|
if re.search(v, text):
|
|
|
if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer'] and re.search('是否', text):
|
|
|
continue
|
|
|
+ elif k == 'win_or_not' and re.search('是否(中标|成交)候选人', text): # 修复 584112560 把第二作第一错误
|
|
|
+ continue
|
|
|
header_dic[k] = (i, text)
|
|
|
# if k != 'candidate': # candidate 可与前三候选重复
|
|
|
num += 1
|
|
@@ -7419,7 +7436,7 @@ class CandidateExtractor(object):
|
|
|
:param nlp_enterprise: 公告中的角色实体列表
|
|
|
:return:
|
|
|
'''
|
|
|
- text = re.sub('主报名人:|联合报名人:|联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
+ text = re.sub('主报名人:|联合报名人:|联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]|(联合体(牵头|成员)单位)'
|
|
|
, ',', text)
|
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
|
|
@@ -7836,11 +7853,11 @@ def get_header_line(list_item):
|
|
|
x.append(getPredictor("form").encode(item))
|
|
|
predict_y = getPredictor("form").predict(np.array(x), type="item")
|
|
|
for item, values in zip(list_item, list(predict_y)):
|
|
|
- item = str(item)
|
|
|
+ item = str(item).replace(' ', '')
|
|
|
lb = 1 if values[1] > 0.5 else 0
|
|
|
- if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准', '合格']:
|
|
|
+ if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准', '合格', '民间投资', '备案']:
|
|
|
lb = 0
|
|
|
- elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺'] or re.search('^比例\d{1,2}%$', item):
|
|
|
+ elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺', '环境影响评价技术服务机构', '报告全本'] or re.search('^比例\d{1,2}%$', item):
|
|
|
lb = 1
|
|
|
elif lb == 0 and item in header_set:
|
|
|
lb = 1
|
|
@@ -7901,6 +7918,10 @@ class ApprovalPredictor():
|
|
|
"total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额|项目概算", # 总投资
|
|
|
}
|
|
|
|
|
|
+ self.head_rule_dic = {**self.role_type, **self.person_type, **self.date_type, **self.addr_type, **self.money_type}
|
|
|
+ self.head_rule_dic.update({k: v.split(':')[0] for k,v in self.other_part.items()})
|
|
|
+ self.tb = TableTag2List()
|
|
|
+
|
|
|
def recursive_text(self, tag):
|
|
|
'''
|
|
|
递归获取 soup 节点文本
|
|
@@ -7923,8 +7944,190 @@ class ApprovalPredictor():
|
|
|
texts.append(re.sub('\s', '', child.strip().replace(':', ':').replace('(', '(').replace(')', ')')))
|
|
|
return texts
|
|
|
|
|
|
- def predict(self, list_sentences, list_entitys, html, span=12):
|
|
|
- soup = BeautifulSoup(html)
|
|
|
def get_table_info(self, df, nlp_enterprise):
    """Extract one record per data row from a table laid out under header rows.

    Each row of ``df`` is classified cell-by-cell with ``get_header_line``;
    rows (and columns) whose cells are mostly flagged as headers are treated
    as header lines.  Only row-oriented tables are processed: header cells are
    mapped to field names through ``self.head_rule_dic`` and every data row
    below a header becomes a ``{field: value}`` dict.

    :param df: table content as a DataFrame with default integer row/column
        labels (positions are used as labels with ``df.loc``).
    :param nlp_enterprise: passed through to ``get_role`` for role-type
        fields (presumably the entity names recognised in the announcement -
        confirm against the caller).
    :return: list of per-row dicts; empty when the table has no header row or
        is column-oriented.
    """

    def get_header_index(datas):
        """Locate header rows and header columns from 0/1 header flags.

        :param datas: per-cell header flags, e.g. [[1, 1, 1, 1], [0, 0, 0, 0]]
        :return: (header_row, header_col); each is a list of
            (label, ratio-of-header-cells) tuples.
        """

        def looks_like_header(flags):
            # A line is a header when more than 80% of its cells are flagged,
            # or (for lines longer than 3 cells) when "11" pairs outnumber
            # "10" pairs in the concatenated flag string.
            if sum(flags) / len(flags) > 0.8:
                return True
            if len(flags) <= 3:
                return False
            joined = ''.join([str(it) for it in flags])
            return len(re.findall('11', joined)) > len(re.findall('10', joined))

        df_h = pd.DataFrame(datas)  # header-flag matrix
        header_row = []
        for i in df_h.index:
            line = df_h.loc[i].values
            if looks_like_header(line):
                header_row.append((i, sum(line) / len(line)))
        header_col = []
        for i in df_h.columns:
            col = df_h[i].values
            # Evaluate the column's own flags; the pair-count test previously
            # looked at the last *row* by mistake.
            if looks_like_header(col):
                header_col.append((i, sum(col) / len(col)))
        return header_row, header_col

    def get_header(l, head_rule_dic):
        """Map field name -> position of the first header cell matching its rule."""
        header_dic = {}
        for i in range(len(l)):
            # Doc 54969575: a header such as "项目 名称" was split by spaces.
            text = l[i].replace(' ', '')
            for k, v in head_rule_dic.items():
                # First matching cell wins for each field.
                if k not in header_dic and re.search(v, text):
                    header_dic[k] = i
        return header_dic

    def convert(field, raw):
        """Normalise a raw cell according to the field it belongs to."""
        if field.startswith('time_'):
            if field in ['time_completion']:
                return timeFormat(raw, default_first_day=False)
            return timeFormat(raw)
        if field in self.role_type:
            return get_role(raw, nlp_enterprise)
        if field == 'moneysource':
            return turnMoneySource(raw)
        return raw

    datas = [get_header_line(df.loc[i].values) for i in df.index]
    header_row, header_col = get_header_index(datas)
    if len(header_col) == 1 and header_col[0][0] > 1:
        # A lone header column cannot start after column 1; discard it.
        header_col = []

    # Only "header rows, no header columns" tables are extracted.
    # Column-oriented tables are deliberately skipped because they are
    # error-prone (e.g. doc 53489774 was split into several lots); mixed or
    # header-less tables yield nothing either.
    if len(header_row) == 0 or len(header_col) != 0:
        return []

    result_l = []
    i = 0
    while i < len(header_row):
        idx, _ratio = header_row[i]
        if idx + 1 >= len(df):
            break  # header is the last row: no data below it
        header_dic = get_header(df.loc[idx].values, self.head_rule_dic)
        i += 1
        range_from = idx + 1
        range_to = len(df)
        if i < len(header_row):
            next_header = i
            for j in range(i, len(header_row)):
                idx2, _ratio2 = header_row[j]
                if idx2 - idx != 1:
                    # Next header is further down: data rows end before it.
                    range_from = idx + 1
                    range_to = idx2
                    next_header = j
                    break
                # Directly adjacent header rows form a multi-line header.
                header_dic2 = get_header(df.loc[idx2].values, self.head_rule_dic)
                if set(df.loc[idx].values) & set(df.loc[idx2].values) != set():
                    # Rows share cells (e.g. merged cells): combine mappings.
                    header_dic.update(header_dic2)
                else:
                    header_dic = header_dic2
                range_from = idx2 + 1
                range_to = len(df)
                next_header = j + 1
                idx = idx2
            i = next_header
        if len(header_dic) >= 2 and 'project_name' in header_dic:
            for index in range(range_from, range_to):
                # Doc 56873031: rows with at most two distinct values are
                # filler/continuation text wrongly attached to the header.
                if len(set(df.loc[index, :])) <= 2:
                    continue
                tmp_dic = {}
                for k, v in header_dic.items():
                    content = convert(k, df.loc[index, v])
                    if content != '':
                        tmp_dic[k] = content
                if len(tmp_dic) > 1 and 'project_name' in tmp_dic and tmp_dic not in result_l:
                    result_l.append(tmp_dic)
    return result_l
|
|
|
+
|
|
|
def predict_table(self, html, nlp_enterprise=None):
    """Collect multi-project records from every usable table in ``html``.

    Tables are taken from the main body; if the body has none, tables inside
    the ``div.richTextFetch`` container are used instead.  Each table is
    converted with ``self.tb.table2list`` and handed to
    ``self.get_table_info``.

    :param html: announcement HTML.
    :param nlp_enterprise: forwarded to ``get_table_info`` and stored on
        ``self.nlp_enterprise``; defaults to an empty list.
    :return: list of distinct record dicts gathered from all tables.
    """
    # Use a fresh list per call rather than a shared mutable default, since
    # the value is kept on the instance.
    if nlp_enterprise is None:
        nlp_enterprise = []
    html = re.sub("<html>|</html>|<body>|</body>", "", html)
    html = re.sub("##attachment##", "", html)
    soup = BeautifulSoup(html, 'lxml')
    richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
    self.nlp_enterprise = nlp_enterprise
    if richText:
        # Detach the attachment container so the body search below skips it.
        richText = richText.extract()
    tables = soup.find_all('table')
    if len(tables) == 0 and richText:
        tables = richText.find_all('table')
    tables.reverse()
    data_list = []
    for table in tables:
        trs = self.tb.table2list(table)
        # Only handle tables with at least two rows, a non-empty first row
        # and the same number of columns in every row.
        if len(trs) > 1 and len(set(trs[0])) > 0 and len({len(tr) for tr in trs}) == 1:
            df = pd.DataFrame(trs)
            rs_l = self.get_table_info(df, nlp_enterprise)
            # Doc 53338603: a record is unique only as a whole (project name
            # plus construction content), so compare complete dicts.
            for d in rs_l:
                if d not in data_list:
                    data_list.append(d)
            if rs_l:
                # Remove the consumed table from the tree (presumably so an
                # enclosing table does not yield it again - confirm).
                table.extract()
    return data_list
|
|
|
+
|
|
|
+ def predict(self, list_sentences, list_entitys, html, nlp_enterprise=[], span=12):
|
|
|
+ tabel_rs = self.predict_table(html, nlp_enterprise) # 表格多项目提取
|
|
|
+ soup = BeautifulSoup(html, 'lxml')
|
|
|
texts_list = self.recursive_text(soup)
|
|
|
rs_dic = {k: "" for k in
|
|
|
self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
|
|
@@ -8076,6 +8279,15 @@ class ApprovalPredictor():
|
|
|
rs_l.append(multi_project)
|
|
|
if not_sure_role != '' and rs_dic.get('construct_company', '') == '' and not_sure_role not in org_set: # 补充,单位名称:这种作为建设单位 例:400069851014
|
|
|
rs_dic['construct_company'] = not_sure_role
|
|
|
+ if len(tabel_rs) > 1:
|
|
|
+ rs_dic_key = [k for k, v in rs_dic.items() if v != '']
|
|
|
+ keys = set(["approver", "publisher", "time_release", "phone", "doc_num"]) & set(rs_dic_key) - set(tabel_rs[0].keys())
|
|
|
+ if keys:
|
|
|
+ for d in tabel_rs:
|
|
|
+ for k in keys:
|
|
|
+ d[k] = rs_dic[k]
|
|
|
+ return tabel_rs
|
|
|
+
|
|
|
if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
|
|
|
for k in self.role_type.keys(): # 多项目无建设单位等通过整篇提取补充
|
|
|
if rs_dic.get(k, '') != '' and k not in rs_l[0].get(k, '') == '':
|
|
@@ -8083,7 +8295,7 @@ class ApprovalPredictor():
|
|
|
if d.get(k, '') == '':
|
|
|
d[k] = rs_dic[k]
|
|
|
return rs_l
|
|
|
- elif found_key == 1:
|
|
|
+ if found_key == 1:
|
|
|
district = getPredictor('district').get_area(
|
|
|
rs_dic['approver'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
|
|
|
if district['district']['province'] != '全国':
|
|
@@ -8153,6 +8365,14 @@ class ApprovalPredictor():
|
|
|
break
|
|
|
return approval
|
|
|
|
|
|
def add_codename2approval(self, approval, codeName):
    """Fill a lone approval record with the announcement-level code and name.

    Nothing is changed unless ``approval`` holds exactly one record and
    ``codeName`` is non-empty.  Fields already present in the record are
    never overwritten.

    :param approval: list of approval record dicts (modified in place).
    :param codeName: list whose first item is a dict that may carry
        ``'code'`` (list of codes) and ``'name'`` (string).
    :return: the same ``approval`` list.
    """
    if len(approval) != 1 or not codeName:
        return approval
    record = approval[0]
    source = codeName[0]
    if 'project_code' not in record:
        codes = source.get('code', [])
        if codes != []:
            record['project_code'] = codes[0]
    if 'project_name' not in record:
        name = source.get('name', '')
        if name != '':
            record['project_name'] = name
    return approval
|
|
|
+
|
|
|
class BiddingScore():
|
|
|
def __init__(self):
|
|
|
self.head_rule_dic = {
|
|
@@ -8362,8 +8582,8 @@ class EntityTypeRulePredictor():
|
|
|
def __init__(self):
|
|
|
self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址区]([((]网址[))])?[:为]'
|
|
|
self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
|
|
|
- self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?地[点址区]?[:为]'
|
|
|
- self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|所在(区域|地区):|存放地[点址]?[:为]'
|
|
|
+ self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?(地[点址区]?|区域)[:为]'
|
|
|
+ self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w]所[属在](区域|地区):|存放地[点址]?[:为]' # 银行所属区域:北京市西城区 不作项目地址
|
|
|
self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
|
|
|
self.pattern_time_planned = '(计划|预计|预期)(招标|采购|发标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
|
|
|
self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
|