|
@@ -1404,10 +1404,10 @@ class RoleRulePredictor():
|
|
|
def __init__(self):
|
|
|
# (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
|
|
|
self.pattern_tenderee_left_55 = "(?P<tenderee_left_55>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \
|
|
|
- "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
|
|
|
+ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
|
|
|
"[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
|
self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
|
|
|
- "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\
|
|
|
+ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行))"\
|
|
|
"[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
|
|
|
self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
|
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
|
|
@@ -1419,19 +1419,19 @@ class RoleRulePredictor():
|
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
|
self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
|
|
|
- "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
|
+ "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
|
"|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
|
|
|
- "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \
|
|
|
+ "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
|
|
|
"|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
|
|
|
"|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)" # 承办单位:不作为中标 83914772
|
|
|
self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
|
|
|
- "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
|
|
|
+ "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)|第[一1]名,?投标(人|单位|银行|公司):$" \
|
|
|
"(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$)" # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
|
|
|
- self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
|
|
|
+ self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)" \
|
|
|
"(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
|
|
|
"|结果公示如下:摇出球号:\d+号,中介机构:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标 # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
|
|
|
|
|
|
- self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
|
|
|
+ self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
|
|
|
"^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
|
"|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
|
|
|
"|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
|
|
@@ -1440,13 +1440,13 @@ class RoleRulePredictor():
|
|
|
"|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
|
|
|
"|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
- self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
- self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
+ self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
+ self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
|
|
|
|
|
|
- self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
- self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
+ self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
+ self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
|
|
|
|
|
|
- self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
|
|
|
+ self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
|
|
|
|
|
|
self.pattern_left = [
|
|
|
self.pattern_tenderee_left_60,
|
|
@@ -1557,7 +1557,7 @@ class RoleRulePredictor():
|
|
|
return (_label, _prob, _flag, keyword)
|
|
|
|
|
|
|
|
|
- def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
|
|
|
+ def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5, all_winner=False):
|
|
|
|
|
|
for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
|
|
|
list_codenames):
|
|
@@ -1689,6 +1689,25 @@ class RoleRulePredictor():
|
|
|
entity_text = p_entity.entity_text
|
|
|
_label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
|
|
|
|
|
|
+ if _label == 5 and re.search(':(1[.、])?$', before) and re.search('^[、;,&/。]', after) and re.search(
|
|
|
+ '(中标|成交|中选))?(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取)结果', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]): # 补充召回 例:514053647 标段1:中国建设银行西安南大街支行,标段2:中国农业银行股份有限公司西安分行,
|
|
|
+ _flag = True
|
|
|
+ _label = 2
|
|
|
+ _prob = 0.5
|
|
|
+ elif _label == 5 and all_winner==1 or (all_winner==2 and re.search('(排[名序]|名次|顺序|第):?[0-9一二三四五六七八九十]+', before)==None):
|
|
|
+ if re.search('(中标|中选|成交|入围|入选)(人|单位|供应商|银行)(名称)?:', before) and re.search('未(中标|中选|成交|入围|入选)', before)==None:
|
|
|
+ _flag = True
|
|
|
+ _label = 2
|
|
|
+ _prob = 0.55
|
|
|
+ elif re.search('(:|[::,]\d{1,2}[.、])$', before) and re.search('^[、;,&/。]', after) and re.search('(入围|合格)(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]):
|
|
|
+ _flag = True
|
|
|
+ _label = 2
|
|
|
+ _prob = 0.51
|
|
|
+ elif re.search('(候选|投标|应答|响应)(人|单位|供应商|银行)(名称)?:', before):
|
|
|
+ _flag = True
|
|
|
+ _label = 2
|
|
|
+ _prob = 0.5
|
|
|
+
|
|
|
# if _label in [0, 1, 2, 3, 4]:
|
|
|
# self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
|
|
|
# entity.entity_text,
|
|
@@ -2593,15 +2612,15 @@ class ProductPredictor():
|
|
|
p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
|
|
|
"(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
|
|
|
"|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
|
|
|
- sentence_range = []
|
|
|
- if len(out_lines) >= 3: # 三个以上大纲
|
|
|
- for i in range(len(out_lines)-1):
|
|
|
- text, s1, b1 = out_lines[i]
|
|
|
- _, s2, b2 = out_lines[i+1]
|
|
|
- if 3<text.find(':')<20:
|
|
|
- text = text.split(':')[0]
|
|
|
- if re.search(p, text[:15]):
|
|
|
- sentence_range.append((s1, s2))
|
|
|
+ # sentence_range = [] #20240827 取消,修复线上接口产品耗时长问题
|
|
|
+ # if len(out_lines) >= 3: # 三个以上大纲
|
|
|
+ # for i in range(len(out_lines)-1):
|
|
|
+ # text, s1, b1 = out_lines[i]
|
|
|
+ # _, s2, b2 = out_lines[i+1]
|
|
|
+ # if 3<text.find(':')<20:
|
|
|
+ # text = text.split(':')[0]
|
|
|
+ # if re.search(p, text[:15]):
|
|
|
+ # sentence_range.append((s1, s2))
|
|
|
|
|
|
with self.sess.as_default() as sess:
|
|
|
with self.sess.graph.as_default():
|
|
@@ -2669,24 +2688,24 @@ class ProductPredictor():
|
|
|
if len(list_sentence)==0:
|
|
|
result.append({"product":[]})
|
|
|
continue
|
|
|
-
|
|
|
- if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
|
|
|
- new_list = []
|
|
|
- word_num = 0
|
|
|
- for sentence in list_sentence:
|
|
|
- if sentence.sentence_index<2:
|
|
|
- new_list.append(sentence)
|
|
|
- continue
|
|
|
- for s1, s2 in sentence_range:
|
|
|
- if sentence.sentence_index < s1:
|
|
|
- continue
|
|
|
- elif s1<=sentence.sentence_index <=s2:
|
|
|
- new_list.append(sentence)
|
|
|
- word_num += len(sentence.sentence_text)
|
|
|
- elif sentence.sentence_index >= s2:
|
|
|
- break
|
|
|
- if word_num > 100:
|
|
|
- list_sentence = new_list
|
|
|
+ # 20240827 取消,修复线上接口产品耗时长问题
|
|
|
+ # if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
|
|
|
+ # new_list = []
|
|
|
+ # word_num = 0
|
|
|
+ # for sentence in list_sentence:
|
|
|
+ # if sentence.sentence_index<2:
|
|
|
+ # new_list.append(sentence)
|
|
|
+ # continue
|
|
|
+ # for s1, s2 in sentence_range:
|
|
|
+ # if sentence.sentence_index < s1:
|
|
|
+ # continue
|
|
|
+ # elif s1<=sentence.sentence_index <=s2:
|
|
|
+ # new_list.append(sentence)
|
|
|
+ # word_num += len(sentence.sentence_text)
|
|
|
+ # elif sentence.sentence_index >= s2:
|
|
|
+ # break
|
|
|
+ # if word_num > 100:
|
|
|
+ # list_sentence = new_list
|
|
|
|
|
|
list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
|
|
|
_begin_index = 0
|
|
@@ -6389,6 +6408,24 @@ class TableTag2List():
|
|
|
if self._output[i][j] == "":
|
|
|
self._output[i][j] = val
|
|
|
|
|
|
def is_head_line(list_item):
    '''
    Decide whether a table row is a header row using the "form" header model.

    Every cell text is encoded with the "form" predictor and the whole row is
    scored in one ``predict`` call. The row is treated as a header row when
    more than 60% of its cells score above 0.6.

    :param list_item: cell texts of one row, e.g. ['技术参数、要求', '变更项']
    :return: True if the row is judged to be a header row, otherwise False;
             an empty row is never a header row
    '''
    if not list_item:
        # Nothing to score; also avoids dividing by zero in the ratio below.
        return False

    # Look the predictor up once instead of once per cell.
    form_predictor = getPredictor("form")
    x = [form_predictor.encode(item) for item in list_item]
    predict_y = form_predictor.predict(np.array(x), type="item")

    # values[1] is presumably the "is header" score of a cell -- TODO confirm
    # against the form model's output layout.
    # zip() keeps the original pairing of cells with predictions.
    count = sum(1 for _, values in zip(list_item, list(predict_y)) if values[1] > 0.6)
    return count / len(list_item) > 0.6
|
|
|
|
|
|
class TablePremExtractor(object):
|
|
|
def __init__(self):
|
|
@@ -6399,10 +6436,10 @@ class TablePremExtractor(object):
|
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
|
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
+ "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|存放金额",
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|中标存款|存放金额|分配额度",
|
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -6411,23 +6448,29 @@ class TablePremExtractor(object):
|
|
|
self.tb = TableTag2List()
|
|
|
|
|
|
|
|
|
- def find_header(self, td_list):
|
|
|
+ def find_header(self, td_list, all_winner=False, first_line=False):
|
|
|
fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
contain_header = False
|
|
|
- if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
|
|
|
+ not_sure_winner = False # 是否 不确定中标的中标人表达方式
|
|
|
+ for text in set(fix_td_list) - self.headerset:
|
|
|
+ if len(text)<10 and re.search(self.head_rule_dic['bid_amount'], text):
|
|
|
+ self.headerset.add(text)
|
|
|
+ if len(set(fix_td_list))>0 and (first_line or len(set(fix_td_list) & self.headerset)>=2) and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
|
|
|
+ other_tenderer = ""
|
|
|
+ other_tenderer2 = ""
|
|
|
flag = True
|
|
|
- need_replace = 0 # 是否需要替换表头名称
|
|
|
for i in range(len(td_list)) :
|
|
|
text = td_list[i]
|
|
|
- text = re.sub('\s', '', text)
|
|
|
+ text = re.sub('\s|[((]排名不分先后[))]', '', text)
|
|
|
+ text = re.sub('^人选', '入选', text)
|
|
|
if text == '备选中标人':
|
|
|
text = '第二候选人'
|
|
|
if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
|
|
|
continue
|
|
|
- if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
|
- return flag, contain_header, dict()
|
|
|
+ if re.search('未(中标|成交|中选|入围)原因', text): # 不提取此种表格
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
num = 0
|
|
|
for k, v in self.head_rule_dic.items():
|
|
|
if re.search('评分|得分|分数|分值', text):
|
|
@@ -6437,6 +6480,8 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
if k == 'budget' and re.search('量', text): # 预算工作量 预算采购量 等不作为预算
|
|
|
continue
|
|
|
+ elif k == 'bid_amount' and re.search('分配方案|基准利率|BP值', text): # 517987084 中标资金分配方案
|
|
|
+ continue
|
|
|
elif k in header_dic:
|
|
|
if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text): # 总价替换单价
|
|
|
header_dic[k] = (i, text)
|
|
@@ -6447,9 +6492,13 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
header_dic[k] = (i, text)
|
|
|
num += 1
|
|
|
+ elif re.search('^((中标|成交|中选|入围|入选)(候选)?)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)(名称)?$', text) and re.search('未', text)==None:
|
|
|
+ other_tenderer = (i, text)
|
|
|
+ elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
|
|
|
+ other_tenderer2 = (i, text)
|
|
|
if num>1:
|
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
- return flag, contain_header, dict()
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
|
for i in range(len(td_list)):
|
|
@@ -6463,21 +6512,30 @@ class TablePremExtractor(object):
|
|
|
if re.search('^金额((万?元))?$', text):
|
|
|
header_dic['budget'] = (i, text)
|
|
|
break
|
|
|
+ if all_winner and 'tenderer' not in header_dic: # 标题有存款、入库、入围等公告补充其他表达做中标人
|
|
|
+ if other_tenderer!="":
|
|
|
+ header_dic['tenderer'] = other_tenderer
|
|
|
+ elif other_tenderer2!="":
|
|
|
+ header_dic['tenderer'] = other_tenderer2
|
|
|
+ if 'win_sort' not in header_dic:
|
|
|
+ not_sure_winner = True
|
|
|
+ if all_winner == 1 and 'win_sort' in header_dic: # 标题有存管类公告不分排名
|
|
|
+ header_dic.pop('win_sort')
|
|
|
if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
|
- return flag, contain_header, header_dic
|
|
|
+ return flag, contain_header, header_dic, not_sure_winner
|
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
|
if 'win_sort' in header_dic: # 有排名的 用候选人提取类
|
|
|
- return flag, contain_header, dict()
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
|
# print('只有供应商名称 没排名和包号的去掉')
|
|
|
- return flag, contain_header, dict()
|
|
|
- return flag,contain_header, header_dic
|
|
|
- elif 'tenderer' in header_dic and re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]): # 有中标人,且有明确中标关键词的进行提取
|
|
|
- return flag, contain_header, header_dic
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
+ return flag,contain_header, header_dic, not_sure_winner
|
|
|
+ elif 'tenderer' in header_dic and (re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]) or all_winner): # 有中标人,且有明确中标关键词的进行提取
|
|
|
+ return flag, contain_header, header_dic, not_sure_winner
|
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
contain_header = True
|
|
|
- return flag, contain_header, dict()
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
|
|
|
def get_role(self, text, nlp_enterprise):
|
|
|
'''
|
|
@@ -6508,7 +6566,7 @@ class TablePremExtractor(object):
|
|
|
else:
|
|
|
return ''
|
|
|
|
|
|
- def extract_from_df(self, df, headers, web_source_name):
|
|
|
+ def extract_from_df(self, df, headers, web_source_name, all_winner=False):
|
|
|
prem_dic = {}
|
|
|
previous_package = "" # 上一行包号
|
|
|
multi_same_package = False # 非连续的重复包号
|
|
@@ -6563,7 +6621,7 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
|
continue
|
|
|
- if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None:
|
|
|
+ if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
|
|
|
tenderer = ""
|
|
|
|
|
|
if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
|
|
@@ -6625,11 +6683,11 @@ class TablePremExtractor(object):
|
|
|
prem_dic[package]['name'] = project_name
|
|
|
|
|
|
if budget_ != "":
|
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
prem_dic.pop(package)
|
|
|
break
|
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
|
- budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
|
+ budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
|
|
|
|
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
|
budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
@@ -6656,12 +6714,12 @@ class TablePremExtractor(object):
|
|
|
"serviceTime": ""
|
|
|
})
|
|
|
if tenderer:
|
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '',
|
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
prem_dic.pop(package)
|
|
|
break
|
|
|
|
|
|
- bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
+ bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
|
prem_dic.pop(package)
|
|
@@ -6761,7 +6819,7 @@ class TablePremExtractor(object):
|
|
|
else:
|
|
|
rs_dic[pack] = tmp_dic[pack]
|
|
|
|
|
|
- def get_prem(self, soup, web_source_name=''):
|
|
|
+ def get_prem(self, soup, web_source_name='', all_winner=False):
|
|
|
tables = soup.find_all('table')
|
|
|
tables.reverse()
|
|
|
|
|
@@ -6769,10 +6827,15 @@ class TablePremExtractor(object):
|
|
|
for table in tables:
|
|
|
|
|
|
text = table.text.strip()
|
|
|
- previous = table.findPreviousSibling()
|
|
|
- text2 = previous.text.strip() if previous else ""
|
|
|
- # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
|
|
|
- if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
|
|
|
+ pre_text = ""
|
|
|
+ previous = None
|
|
|
+ if table.findPreviousSibling() != None:
|
|
|
+ previous = table.findPreviousSibling()
|
|
|
+ pre_text = previous.text.strip()
|
|
|
+ if pre_text == "" and table.findPreviousSibling().findPreviousSibling() != None: # 修复表格前一标签没内容,再前一个才有内容情况
|
|
|
+ previous = table.findPreviousSibling().findPreviousSibling()
|
|
|
+ pre_text = previous.text.strip()
|
|
|
+ if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+pre_text): # 包含业绩的表格过滤掉,不进行处理
|
|
|
tb_ex = table.extract()
|
|
|
if previous:
|
|
|
sib = previous.extract()
|
|
@@ -6784,13 +6847,19 @@ class TablePremExtractor(object):
|
|
|
headers = ""
|
|
|
table_prem = {}
|
|
|
while i < len(trs) - 1:
|
|
|
- flag_, contain_header_, headers_ = self.find_header(trs[i])
|
|
|
+ flag_, contain_header_, headers_, not_sure_winner = self.find_header(trs[i], all_winner, first_line=i==0)
|
|
|
+
|
|
|
+ if flag_ and 'tenderer' in headers_ and not_sure_winner and re.search('中标|成交|中选|入围|入选', pre_text)==None:
|
|
|
+ # print('过滤:',headers_)
|
|
|
+ flag_ = False
|
|
|
+ headers_ = {}
|
|
|
+
|
|
|
if flag_ and headers_ != dict():
|
|
|
table_items = []
|
|
|
headers = headers_
|
|
|
for j in range(i + 1, len(trs)):
|
|
|
if len(trs[j]) == len(trs[i]):
|
|
|
- flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
|
|
|
+ flag_2, contain_header_2, headers_2, not_sure_winner = self.find_header(trs[j], all_winner)
|
|
|
if flag_2 or contain_header_2:
|
|
|
if j == i+1 and flag_2:
|
|
|
if len(headers_)<=len(headers_2):
|
|
@@ -6808,7 +6877,7 @@ class TablePremExtractor(object):
|
|
|
break
|
|
|
if len(table_items) > 0:
|
|
|
df = pd.DataFrame(table_items)
|
|
|
- prem_ = self.extract_from_df(df, headers, web_source_name)
|
|
|
+ prem_ = self.extract_from_df(df, headers, web_source_name, all_winner)
|
|
|
# rs_dic.update(prem_)
|
|
|
# table_prem.update(prem_)
|
|
|
self.update_prem(table_prem, prem_)
|
|
@@ -6828,7 +6897,7 @@ class TablePremExtractor(object):
|
|
|
table.extract()
|
|
|
return rs_dic
|
|
|
|
|
|
- def predict(self, html, nlp_enterprise, web_source_name=""):
|
|
|
+ def predict(self, html, nlp_enterprise, web_source_name="", all_winner=False):
|
|
|
html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
|
html = re.sub("##attachment##","",html)
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
@@ -6838,10 +6907,10 @@ class TablePremExtractor(object):
|
|
|
if richText:
|
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
|
|
|
- prem = self.get_prem(soup, web_source_name)
|
|
|
+ prem = self.get_prem(soup, web_source_name, all_winner)
|
|
|
if prem == {} and richText:
|
|
|
del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
|
- prem = self.get_prem(richText, web_source_name)
|
|
|
+ prem = self.get_prem(richText, web_source_name, all_winner)
|
|
|
in_attachment = True
|
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
|
k = list(prem)[0]
|
|
@@ -6858,7 +6927,7 @@ class CandidateExtractor(object):
|
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
|
|
|
- "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
|
+ "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
@@ -6866,7 +6935,7 @@ class CandidateExtractor(object):
|
|
|
}
|
|
|
'''非表格候选人正则'''
|
|
|
# self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
|
|
|
- self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
|
|
|
+ self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答|响应)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
|
|
|
self.tb = TableTag2List()
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
|
self.headerset = pickle.load(f)
|
|
@@ -7841,14 +7910,14 @@ if __name__=="__main__":
|
|
|
# print(rs)
|
|
|
|
|
|
docid = ""
|
|
|
- title = ''
|
|
|
+ title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
|
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
html = f.read()
|
|
|
tb_extract = TablePremExtractor()
|
|
|
rs = tb_extract.predict(html, [
|
|
|
"江苏中联铸本混凝土有限公司",
|
|
|
"鼓楼区协荣机械设备经销部"
|
|
|
- ], web_source_name = '河钢供应链管理平台')
|
|
|
+ ], web_source_name = '', all_winner=True)
|
|
|
print('标段数:',len(rs[0]))
|
|
|
print(rs)
|
|
|
|