|
@@ -1417,7 +1417,7 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金") # |建安费用 不作为招标金额
|
|
|
|
|
|
+ self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|服务金额|采购成本价") # |建安费用 不作为招标金额
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
@@ -1476,6 +1476,8 @@ class RoleRulePredictor():
|
|
_label = 5
|
|
_label = 5
|
|
elif _label == 2 and re.search('为$', before) and re.match('\w', after): # 排除错误 前文为结尾,后文不是标点符号结尾的,如 353824459 供应商为社会团体的, 供应商为玉田县中医医院提供安保服务
|
|
elif _label == 2 and re.search('为$', before) and re.match('\w', after): # 排除错误 前文为结尾,后文不是标点符号结尾的,如 353824459 供应商为社会团体的, 供应商为玉田县中医医院提供安保服务
|
|
_label = 5
|
|
_label = 5
|
|
|
|
+ elif _label == 2 and re.search('评委|未中标', after[:5]): # 397194341 过滤掉错误召回中标人
|
|
|
|
+ _label = 5
|
|
if _label == 5:
|
|
if _label == 5:
|
|
_label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text) # 前后文匹配
|
|
_label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text) # 前后文匹配
|
|
keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
|
|
keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
|
|
@@ -1720,7 +1722,7 @@ class RoleRulePredictor():
|
|
if re.search('(含|在|包括)(\d+)?$', _span[0]):
|
|
if re.search('(含|在|包括)(\d+)?$', _span[0]):
|
|
continue
|
|
continue
|
|
if re.search(',\w{2,}', _span[0]):
|
|
if re.search(',\w{2,}', _span[0]):
|
|
- _span[0] = _span[0].split(',')[-1] #避免多个价格在一起造成误判
|
|
|
|
|
|
+ _span[0] = _span[0].split(',')[-1] if len(_span[0].split(',')[-1])>4 else _span[0][-8:] #避免多个价格在一起造成误判
|
|
if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
|
|
if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
|
|
self.pattern_money_other, _span[0]) is None:
|
|
self.pattern_money_other, _span[0]) is None:
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
@@ -1822,7 +1824,7 @@ class RoleRuleFinalAdd():
|
|
text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
|
|
text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
|
|
text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
- sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
|
|
|
+ sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
@@ -1858,7 +1860,7 @@ class RoleRuleFinalAdd():
|
|
ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
|
|
|
|
if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
|
|
if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
|
|
- or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None) \
|
|
|
|
|
|
+ or re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) == None) \
|
|
and ent_re not in agency_list and ent_re not in agency_set:
|
|
and ent_re not in agency_list and ent_re not in agency_set:
|
|
n = 0
|
|
n = 0
|
|
for i in range(len(ents) - 1, -1, -1):
|
|
for i in range(len(ents) - 1, -1, -1):
|
|
@@ -5572,8 +5574,8 @@ class TablePremExtractor(object):
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
|
|
|
- tenderee = self.get_role(tenderee, self.nlp_enterprise)
|
|
|
|
- tenderer = self.get_role(tenderer, self.nlp_enterprise)
|
|
|
|
|
|
+ tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
|
|
|
|
+ tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
|
|
|
|
|
|
if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
break
|
|
break
|
|
@@ -5608,7 +5610,7 @@ class TablePremExtractor(object):
|
|
prem_dic[package]['name'] = project_name
|
|
prem_dic[package]['name'] = project_name
|
|
|
|
|
|
if budget_ != "":
|
|
if budget_ != "":
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
@@ -5638,7 +5640,7 @@ class TablePremExtractor(object):
|
|
"serviceTime": ""
|
|
"serviceTime": ""
|
|
})
|
|
})
|
|
if tenderer and not same_package:
|
|
if tenderer and not same_package:
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
|
|
|
|
@@ -5914,7 +5916,7 @@ class CandidateExtractor(object):
|
|
header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
|
|
header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
money, money_unit = money_process(text, header)
|
|
money, money_unit = money_process(text, header)
|
|
@@ -5949,7 +5951,7 @@ class CandidateExtractor(object):
|
|
'tendereeMoney': 0,
|
|
'tendereeMoney': 0,
|
|
'tendereeMoneyUnit': ""
|
|
'tendereeMoneyUnit': ""
|
|
}
|
|
}
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
|
|
|
|
@@ -6499,24 +6501,17 @@ if __name__=="__main__":
|
|
rs = product_attr.predict(docid='', html=html, page_time="")
|
|
rs = product_attr.predict(docid='', html=html, page_time="")
|
|
print(rs)
|
|
print(rs)
|
|
|
|
|
|
- # docid = ""
|
|
|
|
- # title = ''
|
|
|
|
- # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
|
- # html = f.read()
|
|
|
|
- # tb_extract = TablePremExtractor()
|
|
|
|
- # rs = tb_extract.predict(html, [
|
|
|
|
- # "河钢集团供应链管理有限公司邯郸分公司",
|
|
|
|
- # "石家庄中达科技有限公司",
|
|
|
|
- # "河北骥驰耐磨材料有限公司",
|
|
|
|
- # "衡水奥诺工矿机械设备有限公司",
|
|
|
|
- # "河北勤鹏机械设备科技有限公司",
|
|
|
|
- # "邯郸市华北不锈钢厂有限公司",
|
|
|
|
- # "邯郸市芳林机械备件制造有限公司",
|
|
|
|
- # "济南宏鲁新型材料有限公司",
|
|
|
|
- # "邯郸海博机械设备有限公司",
|
|
|
|
- # "河北万革新能源科技有限公司"
|
|
|
|
- # ])
|
|
|
|
- # print(rs)
|
|
|
|
|
|
+ docid = ""
|
|
|
|
+ title = ''
|
|
|
|
+ with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
|
+ html = f.read()
|
|
|
|
+ tb_extract = TablePremExtractor()
|
|
|
|
+ rs = tb_extract.predict(html, [
|
|
|
|
+ "广东省广裕集团嘉顺实业有限责任公司",
|
|
|
|
+ "广州顺为招标采购有限公司",
|
|
|
|
+ "中华人民共和国"
|
|
|
|
+ ])
|
|
|
|
+ print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|