|
@@ -5686,6 +5686,8 @@ class DistrictPredictor():
|
|
province_l = find_areas(p_pro, text)
|
|
province_l = find_areas(p_pro, text)
|
|
city_l = find_areas(p_city, text)
|
|
city_l = find_areas(p_city, text)
|
|
district_l = find_areas(p_dis, text)
|
|
district_l = find_areas(p_dis, text)
|
|
|
|
+ if len(province_l) == len(city_l) == 0:
|
|
|
|
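+ district_l = [it for it in district_l if re.search('[市县旗区]$', it[0])] # 2024-04-28: when neither a province nor a city matched, keep only district matches written as full names (ending in 市/县/旗/区) to avoid false hits, e.g. 凌云工业股份有限公司 being resolved to 广西百色凌云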
|
|
|
|
|
|
province_l = chage_area2score(province_l, max_len=len(text))
|
|
province_l = chage_area2score(province_l, max_len=len(text))
|
|
city_l = chage_area2score(city_l, max_len=len(text))
|
|
city_l = chage_area2score(city_l, max_len=len(text))
|
|
@@ -5913,10 +5915,11 @@ class DistrictPredictor():
|
|
|
|
|
|
project_name = project_name + title if project_name not in title else title
|
|
project_name = project_name + title if project_name not in title else title
|
|
# project_name = project_name.replace(tenderee, '')
|
|
# project_name = project_name.replace(tenderee, '')
|
|
- entity_list = getNers([project_name],useselffool=False) # 2024/4/26 修改为去重项目名称中所有公司名称
|
|
|
|
- for tup in entity_list[0]:
|
|
|
|
- if tup[2] in ['org', 'company']:
|
|
|
|
- project_name = project_name.replace(tup[3], '')
|
|
|
|
|
|
+ if len(project_name)>3:
|
|
|
|
+ entity_list = getNers([project_name],useselffool=False) # 2024/4/26 修改为去重项目名称中所有公司名称
|
|
|
|
+ for tup in entity_list[0]:
|
|
|
|
+ if tup[2] in ['org', 'company']:
|
|
|
|
+ project_name = project_name.replace(tup[3], '')
|
|
|
|
|
|
text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
|
|
text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
|
|
|
|
|
|
@@ -6066,7 +6069,7 @@ class TablePremExtractor(object):
|
|
self.head_rule_dic = {
|
|
self.head_rule_dic = {
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
- "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
|
|
|
+ "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
"win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
"win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
@@ -6182,6 +6185,7 @@ class TablePremExtractor(object):
|
|
multi_same_package = False # 非连续的重复包号
|
|
multi_same_package = False # 非连续的重复包号
|
|
package_fix2raw = dict() # 处理后包号:处理前包号 字典
|
|
package_fix2raw = dict() # 处理后包号:处理前包号 字典
|
|
link_set = set()
|
|
link_set = set()
|
|
|
|
+ tenderer_list = [] # 保存所有中标人
|
|
not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
|
|
not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
|
|
'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
|
|
'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
|
|
|
|
|
|
@@ -6189,6 +6193,7 @@ class TablePremExtractor(object):
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
# print('没有包号及角色的不要')
|
|
# print('没有包号及角色的不要')
|
|
return {}
|
|
return {}
|
|
|
|
+
|
|
for i in df.index:
|
|
for i in df.index:
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
@@ -6260,9 +6265,9 @@ class TablePremExtractor(object):
|
|
|
|
|
|
if project_code != "":
|
|
if project_code != "":
|
|
uni_project_code= uniform_package_name(project_code)
|
|
uni_project_code= uniform_package_name(project_code)
|
|
- if uni_project_code != "" and package != "":
|
|
|
|
|
|
+ if uni_project_code != "" and package != "" and uni_project_code!=package:
|
|
# print('重组包号:', '%s_%s'%(uni_project_code, package))
|
|
# print('重组包号:', '%s_%s'%(uni_project_code, package))
|
|
- package = '%s_%s'%(uni_project_code, package)
|
|
|
|
|
|
+ package = '%s_%s'%(uni_project_code, package.replace('自增', ''))
|
|
if package_code_raw!='':
|
|
if package_code_raw!='':
|
|
if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
package_fix2raw[package] = package_code_raw
|
|
package_fix2raw[package] = package_code_raw
|
|
@@ -6341,6 +6346,7 @@ class TablePremExtractor(object):
|
|
"role_text": tenderer,
|
|
"role_text": tenderer,
|
|
"serviceTime": ""
|
|
"serviceTime": ""
|
|
})
|
|
})
|
|
|
|
+ tenderer_list.append(tenderer)
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
# break # 注释掉避免 400084571 某些包废标 中断匹配
|
|
# break # 注释掉避免 400084571 某些包废标 中断匹配
|
|
@@ -6348,6 +6354,32 @@ class TablePremExtractor(object):
|
|
for k, v in package_fix2raw.items():
|
|
for k, v in package_fix2raw.items():
|
|
if k in prem_dic:
|
|
if k in prem_dic:
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
|
|
+ if len(tenderer_list)>2 and len(set(tenderer_list))==1 and "package_code" not in headers: # 没提取到包号且中标人一样应该是错误多包,需去掉多包 例 244355092 281854766
|
|
|
|
+ total_money = 0
|
|
|
|
+ for v in prem_dic.values():
|
|
|
|
+ for d in v['roleList']:
|
|
|
|
+ if d['role_name'] == "win_tenderer":
|
|
|
|
+ total_money += d['role_money']['money']
|
|
|
|
+ return {'自增1': {
|
|
|
|
+ 'code': '',
|
|
|
|
+ 'name': '',
|
|
|
|
+ 'roleList': [{
|
|
|
|
+ "address": "",
|
|
|
|
+ "linklist": [],
|
|
|
|
+ "role_money": {
|
|
|
|
+ "discount_ratio": "",
|
|
|
|
+ "downward_floating_ratio": "",
|
|
|
|
+ "floating_ratio": "",
|
|
|
|
+ "money": total_money,
|
|
|
|
+ "money_unit": ''
|
|
|
|
+ },
|
|
|
|
+ "role_name": "win_tenderer",
|
|
|
|
+ "role_text": tenderer_list[0],
|
|
|
|
+ "serviceTime": ""
|
|
|
|
+ }],
|
|
|
|
+ 'tendereeMoney': 0,
|
|
|
|
+ 'tendereeMoneyUnit': ""
|
|
|
|
+ }}
|
|
return prem_dic
|
|
return prem_dic
|
|
|
|
|
|
def update_prem(self, rs_dic, tmp_dic):
|
|
def update_prem(self, rs_dic, tmp_dic):
|
|
@@ -6417,11 +6449,11 @@ class TablePremExtractor(object):
|
|
self.update_prem(table_prem, prem_)
|
|
self.update_prem(table_prem, prem_)
|
|
i = j - 1
|
|
i = j - 1
|
|
i += 1
|
|
i += 1
|
|
- if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
|
|
|
|
+ if table_prem and len(trs) == 2 and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
sib = table.find_previous_sibling()
|
|
sib = table.find_previous_sibling()
|
|
sib_text = sib.get_text()
|
|
sib_text = sib.get_text()
|
|
ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
|
|
ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
|
|
- if sib.name in ['p', 'div'] and len(sib_text)<30 and ser_sib:
|
|
|
|
|
|
+ if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
|
|
package_sib = ser_sib.group(0)
|
|
package_sib = ser_sib.group(0)
|
|
package_sib = uniform_package_name(package_sib)
|
|
package_sib = uniform_package_name(package_sib)
|
|
table_prem[package_sib] = table_prem.pop('自增1')
|
|
table_prem[package_sib] = table_prem.pop('自增1')
|
|
@@ -6437,16 +6469,18 @@ class TablePremExtractor(object):
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
self.nlp_enterprise = nlp_enterprise
|
|
self.nlp_enterprise = nlp_enterprise
|
|
|
|
+ in_attachment = False
|
|
if richText:
|
|
if richText:
|
|
richText = richText.extract() # 过滤掉附件
|
|
richText = richText.extract() # 过滤掉附件
|
|
prem = self.get_prem(soup, web_source_name)
|
|
prem = self.get_prem(soup, web_source_name)
|
|
if prem == {} and richText:
|
|
if prem == {} and richText:
|
|
prem = self.get_prem(richText, web_source_name)
|
|
prem = self.get_prem(richText, web_source_name)
|
|
|
|
+ in_attachment = True
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
k = list(prem)[0]
|
|
k = list(prem)[0]
|
|
- if k == '1' or len(k) > 2:
|
|
|
|
|
|
+ if k.startswith('自增'):
|
|
prem['Project'] = prem.pop(k)
|
|
prem['Project'] = prem.pop(k)
|
|
- return prem
|
|
|
|
|
|
+ return prem, in_attachment
|
|
|
|
|
|
class CandidateExtractor(object):
|
|
class CandidateExtractor(object):
|
|
def __init__(self):
|
|
def __init__(self):
|
|
@@ -6719,7 +6753,6 @@ class CandidateExtractor(object):
|
|
candidate_set = set()
|
|
candidate_set = set()
|
|
for table in tables:
|
|
for table in tables:
|
|
trs = self.tb.table2list(table)
|
|
trs = self.tb.table2list(table)
|
|
- table.extract()
|
|
|
|
i = 0
|
|
i = 0
|
|
headers = ""
|
|
headers = ""
|
|
while i < len(trs) - 1:
|
|
while i < len(trs) - 1:
|
|
@@ -6745,6 +6778,15 @@ class CandidateExtractor(object):
|
|
candidate_set.update(candidate_set_)
|
|
candidate_set.update(candidate_set_)
|
|
i = j - 1
|
|
i = j - 1
|
|
i += 1
|
|
i += 1
|
|
|
|
+ if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # table has no package-code header and the result is keyed 'Project': look for the package number in the previous sibling tag
|
|
|
|
+ sib = table.find_previous_sibling()
|
|
|
|
+ sib_text = sib.get_text()
|
|
|
|
+ ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
|
|
|
|
+ if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
|
|
|
|
+ package_sib = ser_sib.group(0)
|
|
|
|
+ package_sib = uniform_package_name(package_sib)
|
|
|
|
+ rs_dic[package_sib] = rs_dic.pop('Project')
|
|
|
|
+ table.extract()
|
|
return rs_dic, candidate_set
|
|
return rs_dic, candidate_set
|
|
|
|
|
|
def get_candidates_from_text(self, list_sentences, list_entitys):
|
|
def get_candidates_from_text(self, list_sentences, list_entitys):
|
|
@@ -6772,14 +6814,16 @@ class CandidateExtractor(object):
|
|
html = re.sub("##attachment##","",html)
|
|
html = re.sub("##attachment##","",html)
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
|
|
+ in_attachment = False
|
|
if richText:
|
|
if richText:
|
|
richText = richText.extract() # 过滤掉附件
|
|
richText = richText.extract() # 过滤掉附件
|
|
prem, candidate_set = self.get_prem(soup)
|
|
prem, candidate_set = self.get_prem(soup)
|
|
if prem == {} and richText:
|
|
if prem == {} and richText:
|
|
prem, candidate_set = self.get_prem(richText)
|
|
prem, candidate_set = self.get_prem(richText)
|
|
|
|
+ in_attachment = True
|
|
if prem == {} and candidate_set == set():
|
|
if prem == {} and candidate_set == set():
|
|
candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
|
|
candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
|
|
- return prem, {'candidate': ','.join(candidate_set)}
|
|
|
|
|
|
+ return prem, {'candidate': ','.join(candidate_set)}, in_attachment
|
|
|
|
|
|
def role_special_predictor(web_source_name, content, nlp_enterprise):
|
|
def role_special_predictor(web_source_name, content, nlp_enterprise):
|
|
if web_source_name == '中国电子科技集团有限公司电子采购平台':
|
|
if web_source_name == '中国电子科技集团有限公司电子采购平台':
|