|
@@ -642,9 +642,11 @@ class PREMPredict():
|
|
|
|
|
|
def __init__(self,config=None):
|
|
|
#self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
|
|
|
- self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
|
|
|
+ # self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
|
|
|
self.model_role = Model_role_classify_word(config=config)
|
|
|
self.model_money = Model_money_classify(config=config)
|
|
|
+ # self.role_file = open('/data/python/lsm/role_model_predict.txt', 'a', encoding='utf-8')
|
|
|
+ # self.money_file = open('/data/python/lsm/money_model_predict.txt', 'a', encoding='utf-8')
|
|
|
|
|
|
return
|
|
|
|
|
@@ -774,10 +776,16 @@ class PREMPredict():
|
|
|
front, middle, behind = text_tup
|
|
|
whole = "".join(text_tup)
|
|
|
# print('模型预测角色:', front, entity.entity_text, behind,label, values)
|
|
|
+ # if label in [0, 1, 2, 3, 4]:
|
|
|
+ # self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(front, entity.entity_text, behind,label, entity.doc_id))
|
|
|
if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
|
label = 5
|
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
|
label = 5
|
|
|
+ elif label == 0:
|
|
|
+ if re.search('拟邀请$', front):
|
|
|
+ label = 2
|
|
|
+ values[label] = 0.501
|
|
|
elif label == 2:
|
|
|
if re.search('中标单位和.{,25}签订合同', whole):
|
|
|
label = 0
|
|
@@ -851,11 +859,13 @@ class PREMPredict():
|
|
|
front, middle, behind = text_tup
|
|
|
whole = "".join(text_tup)
|
|
|
# print('金额: ', entity.entity_text, label, values, front, middle, behind)
|
|
|
+ # if label in [0, 1]:
|
|
|
+ # self.money_file.write("{0} {1} {2} {3}\n".format(front, entity.entity_text, behind, label))
|
|
|
if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
|
|
|
# print('模型预测金额: ', entity.entity_text, label, values, front, middle, behind)
|
|
|
label = 2
|
|
|
elif label == 1: # 错误中标金额处理
|
|
|
- if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
|
|
|
+ if re.search('[::,。](总金额|总价|单价|合价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
|
|
|
values[label] = 0.49
|
|
|
elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
|
|
|
values[label] = 0.49
|
|
@@ -1283,23 +1293,23 @@ class RoleRulePredictor():
|
|
|
|
|
|
def __init__(self):
|
|
|
# (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
|
|
|
- self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
|
|
|
+ self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置)" \
|
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)"\
|
|
|
"[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
|
- self.pattern_tenderee_left_w0 = "(?P<tenderee_left>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
|
|
|
+ self.pattern_tenderee_left_w0 = "(?P<tenderee_left>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置)" \
|
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)"\
|
|
|
"[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
|
|
|
self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
|
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
"(是|为|:|:|\s*)+$)"
|
|
|
- self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
|
|
|
+ self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
|
|
|
self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
|
|
|
self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
|
self.pattern_winTenderer_left = "(?P<winTenderer_left>" \
|
|
|
- "(乙|竞得|受让|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
|
+ "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
|
"|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致)[::是为]+$" \
|
|
|
"|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$" \
|
|
|
"|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
|
|
@@ -1314,44 +1324,71 @@ class RoleRulePredictor():
|
|
|
"^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
|
"|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格" \
|
|
|
"|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))"
|
|
|
- self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标" \
|
|
|
+ self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \
|
|
|
+ "|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \
|
|
|
+ "|拟邀请[\w()]{5,20}(进行)?单一来源谈判" \
|
|
|
"|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
- self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
|
|
|
+ self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
- self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
|
|
|
+ self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
- self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)[::是为]+$)"
|
|
|
-
|
|
|
- self.pattern_whole = [self.pattern_tenderee_left_w1,
|
|
|
- self.pattern_tenderee_left,
|
|
|
- self.pattern_tenderee_left_w0,
|
|
|
- self.pattern_tenderee_center,
|
|
|
- self.pattern_tenderee_right,
|
|
|
- self.pattern_tendereeORagency_right,
|
|
|
- self.pattern_agency_left,
|
|
|
- self.pattern_agency_right,
|
|
|
- self.pattern_winTenderer_left_w1,
|
|
|
- self.pattern_winTenderer_left,
|
|
|
- self.pattern_winTenderer_left_w0,
|
|
|
- self.pattern_winTenderer_whole,
|
|
|
- self.pattern_winTenderer_right,
|
|
|
- self.pattern_secondTenderer_left,
|
|
|
- self.pattern_secondTenderer_right,
|
|
|
- self.pattern_thirdTenderer_left,
|
|
|
- self.pattern_thirdTenderer_right
|
|
|
- ] # 需按顺序排列, 第二、三中标要在中标正则后面
|
|
|
+ self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|盖章)?[::是为]+$)"
|
|
|
+
|
|
|
+ # self.pattern_whole = [self.pattern_tenderee_left_w1,
|
|
|
+ # self.pattern_tenderee_left,
|
|
|
+ # self.pattern_tenderee_left_w0,
|
|
|
+ # self.pattern_tenderee_center,
|
|
|
+ # self.pattern_tenderee_right,
|
|
|
+ # self.pattern_tendereeORagency_right,
|
|
|
+ # self.pattern_agency_left,
|
|
|
+ # self.pattern_agency_right,
|
|
|
+ # self.pattern_winTenderer_left_w1,
|
|
|
+ # self.pattern_winTenderer_left,
|
|
|
+ # self.pattern_winTenderer_left_w0,
|
|
|
+ # self.pattern_winTenderer_whole,
|
|
|
+ # self.pattern_winTenderer_right,
|
|
|
+ # self.pattern_secondTenderer_left,
|
|
|
+ # self.pattern_secondTenderer_right,
|
|
|
+ # self.pattern_thirdTenderer_left,
|
|
|
+ # self.pattern_thirdTenderer_right
|
|
|
+ # ] # 需按顺序排列, 第二、三中标要在中标正则后面
|
|
|
+ self.pattern_left = [
|
|
|
+ self.pattern_tenderee_left_w1,
|
|
|
+ self.pattern_tenderee_left,
|
|
|
+ self.pattern_tenderee_left_w0,
|
|
|
+ self.pattern_agency_left,
|
|
|
+ self.pattern_secondTenderer_left,
|
|
|
+ self.pattern_thirdTenderer_left,
|
|
|
+ self.pattern_winTenderer_left_w1,
|
|
|
+ self.pattern_winTenderer_left,
|
|
|
+ self.pattern_winTenderer_left_w0,
|
|
|
+ ]
|
|
|
+
|
|
|
+ self.pattern_whole = [
|
|
|
+ self.pattern_winTenderer_whole,
|
|
|
+ self.pattern_tenderee_center,
|
|
|
+ ]
|
|
|
+ self.pattern_right = [
|
|
|
+ self.pattern_thirdTenderer_right,
|
|
|
+ self.pattern_secondTenderer_right,
|
|
|
+ self.pattern_agency_right,
|
|
|
+ self.pattern_tendereeORagency_right,
|
|
|
+ self.pattern_tenderee_right,
|
|
|
+ self.pattern_winTenderer_right,
|
|
|
+ ]
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
|
|
|
+ self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金") # |建安费用 不作为招标金额
|
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
|
-
|
|
|
+ # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
|
|
|
+
|
|
|
def _check_input(self,text, ignore=False):
|
|
|
if not text:
|
|
|
return []
|
|
@@ -1365,6 +1402,52 @@ class RoleRulePredictor():
|
|
|
|
|
|
return text
|
|
|
|
|
|
+ def ser_role(self, pattern_list, text, entity_text):
|
|
|
+ for _pattern in pattern_list:
|
|
|
+ for _iter in re.finditer(_pattern, text):
|
|
|
+ for _group, _v_group in _iter.groupdict().items():
|
|
|
+ if _v_group is not None and _v_group != "":
|
|
|
+ _role = _group.split("_")[0]
|
|
|
+ if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
|
|
|
+ # print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
|
+
|
|
|
+ if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
|
|
|
+ or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:
|
|
|
+ _role = 'tenderee'
|
|
|
+ else:
|
|
|
+ _role = "agency"
|
|
|
+ _direct = _group.split("_")[1]
|
|
|
+ _weight = _group.split("_")[2] if len(_group.split("_")) == 3 else ""
|
|
|
+
|
|
|
+ _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
+ "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
+ return (_label, _iter.group(0))
|
|
|
+ return (5, '')
|
|
|
+
|
|
|
+
|
|
|
+ def rule_predict(self, before, center, after, entity_text):
|
|
|
+ # before = before if isinstance(before, str) else ""
|
|
|
+ # center = center if isinstance(center, str) else ""
|
|
|
+ # after = after if isinstance(after, str) else ""
|
|
|
+
|
|
|
+ _label, keyword = self.ser_role(self.pattern_left, before, entity_text) # 前文匹配
|
|
|
+ if _label == 2 and re.search(
|
|
|
+ '各.{,5}供应商|尊敬的供应商|[^\w]候选供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})',
|
|
|
+ # 135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
+ before) != None:
|
|
|
+ _label = 5
|
|
|
+ if _label == 5:
|
|
|
+ _label, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text) # 前后文匹配
|
|
|
+ if _label == 2 and re.search('以[^,。;]{10,30}为准', before + center + after)!=None:
|
|
|
+ _label = 5
|
|
|
+ if _label != 5 and self.ser_role(self.pattern_whole, before, entity_text)[0] != 5 or \
|
|
|
+ self.ser_role(self.pattern_whole, after, entity_text)[0] != 5:
|
|
|
+ _label = 5
|
|
|
+ if _label == 5:
|
|
|
+ _label, keyword = self.ser_role(self.pattern_right, after, entity_text) # 后文匹配
|
|
|
+ _flag = False if _label==5 else True
|
|
|
+ return (_label, _flag, keyword)
|
|
|
+
|
|
|
|
|
|
def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
|
|
|
|
|
@@ -1373,11 +1456,13 @@ class RoleRulePredictor():
|
|
|
list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
|
|
|
# list_name = list_codename["name"]
|
|
|
list_name = [] # 2022/1/5 改为实体列表内所有项目名称
|
|
|
+ name_entitys = [] # 2023/6/30 保存项目名称实体,直接通过位置判断角色是否在项目名称里面
|
|
|
candidates = [] # 保存不能确定为第几的候选人 2023/04/14
|
|
|
notfound_tenderer = True # 未找到前三候选人
|
|
|
for entity in list_entity:
|
|
|
if entity.entity_type == 'name':
|
|
|
list_name.append(entity.entity_text)
|
|
|
+ name_entitys.append(entity)
|
|
|
list_name = self._check_input(list_name) + [article.title]
|
|
|
for p_entity in list_entity:
|
|
|
|
|
@@ -1398,13 +1483,21 @@ class RoleRulePredictor():
|
|
|
find_flag = True
|
|
|
break
|
|
|
|
|
|
- for _name in list_name:
|
|
|
- if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: #加上前面一些信息,修复公司不在项目名称开头的,检测不到
|
|
|
+ for _name in name_entitys:
|
|
|
+ if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
|
|
|
find_flag = True
|
|
|
if p_entity.values[0] > on_value:
|
|
|
p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
|
|
|
else:
|
|
|
p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
|
|
|
+
|
|
|
+ # for _name in list_name:
|
|
|
+ # if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: #加上前面一些信息,修复公司不在项目名称开头的,检测不到
|
|
|
+ # find_flag = True
|
|
|
+ # if p_entity.values[0] > on_value:
|
|
|
+ # p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
|
|
|
+ # else:
|
|
|
+ # p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
|
|
|
if find_flag:
|
|
|
continue
|
|
|
|
|
@@ -1452,7 +1545,7 @@ class RoleRulePredictor():
|
|
|
tokens = list_sentence[s_index].tokens
|
|
|
begin_index = p_entity.begin_index
|
|
|
end_index = p_entity.end_index
|
|
|
- size = 15
|
|
|
+ size = 40 #15
|
|
|
spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
|
|
|
word_flag=True, use_text=False)
|
|
|
# _flag = False
|
|
@@ -1469,64 +1562,86 @@ class RoleRulePredictor():
|
|
|
except Exception as e:
|
|
|
print('正则报错:', e)
|
|
|
|
|
|
- # 使用正则+距离解决冲突
|
|
|
- # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
|
|
|
- list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]] # 实体左、中、右 信息
|
|
|
- for _i_span in range(len(list_spans)):
|
|
|
- _flag = False
|
|
|
- _prob_weight = 1
|
|
|
-
|
|
|
- # print(list_spans[_i_span],p_entity.entity_text)
|
|
|
- for _pattern in self.pattern_whole:
|
|
|
- for _iter in re.finditer(_pattern, list_spans[_i_span]):
|
|
|
- for _group, _v_group in _iter.groupdict().items():
|
|
|
- if _v_group is not None and _v_group != "":
|
|
|
- _role = _group.split("_")[0]
|
|
|
- if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
|
|
|
- # print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
|
- if p_entity.sentence_index>=1: # 只在第一句进行这种模糊匹配
|
|
|
- continue
|
|
|
- if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
|
|
|
- or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
|
|
|
- _role = 'tenderee'
|
|
|
- else:
|
|
|
- _role = "agency"
|
|
|
- _direct = _group.split("_")[1]
|
|
|
- _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
|
- # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
- # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
- list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
- _flag = True
|
|
|
- _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
- "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- _prob_weight = 1.2 if _weight=='w1' else 1
|
|
|
- # print('_v_group:',_group, _v_group, p_entity.entity_text)
|
|
|
-
|
|
|
- if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
|
|
|
- _flag = True
|
|
|
- _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
- "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- _prob_weight = 1.2 if _weight == 'w1' else 1
|
|
|
- # print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
-
|
|
|
- if _i_span == 2 and _direct == "right":
|
|
|
- _flag = True
|
|
|
- _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
- "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- _prob_weight = 1.2 if _weight == 'w1' else 1
|
|
|
- # print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
-
|
|
|
- # 得到结果
|
|
|
- if _flag:
|
|
|
- if _label in [2, 3, 4]:
|
|
|
- notfound_tenderer = False
|
|
|
- p_entity.label = _label
|
|
|
- p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
|
|
|
- # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
|
- break
|
|
|
- if _i_span == 0 and re.search(self.condadate_left, list_spans[_i_span]):
|
|
|
- candidates.append(p_entity)
|
|
|
+ before, center, after = spans[0], spans[1], spans[2]
|
|
|
+ entity_text = p_entity.entity_text
|
|
|
+ _label, _flag, kw = self.rule_predict(before, center, after, entity_text)
|
|
|
+
|
|
|
+ # if _label in [0, 1, 2, 3, 4]:
|
|
|
+ # self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
|
|
|
+ # entity.entity_text,
|
|
|
+ # after,
|
|
|
+ # _label,
|
|
|
+ # entity.doc_id))
|
|
|
+ # 得到结果
|
|
|
+ if _flag:
|
|
|
+ if _label in [2, 3, 4]:
|
|
|
+ notfound_tenderer = False
|
|
|
+ p_entity.label = _label
|
|
|
+ p_entity.values[int(_label)] = on_value + p_entity.values[
|
|
|
+ int(_label)] / 10
|
|
|
+ # log('正则召回实体: %s, %s, %d, %.4f, %s'%(kw, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], before+" "+after))
|
|
|
+ break
|
|
|
+ if re.search(self.condadate_left, before):
|
|
|
+ candidates.append(p_entity)
|
|
|
+
|
|
|
+ # # 使用正则+距离解决冲突
|
|
|
+ # # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
|
|
|
+ # list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]] # 实体左、中、右 信息
|
|
|
+ # for _i_span in range(len(list_spans)):
|
|
|
+ # _flag = False
|
|
|
+ # _prob_weight = 1
|
|
|
+ #
|
|
|
+ # # print(list_spans[_i_span],p_entity.entity_text)
|
|
|
+ # for _pattern in self.pattern_whole:
|
|
|
+ # for _iter in re.finditer(_pattern, list_spans[_i_span]):
|
|
|
+ # for _group, _v_group in _iter.groupdict().items():
|
|
|
+ # if _v_group is not None and _v_group != "":
|
|
|
+ # _role = _group.split("_")[0]
|
|
|
+ # if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
|
|
|
+ # # print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
|
+ # if p_entity.sentence_index>=1: # 只在第一句进行这种模糊匹配
|
|
|
+ # continue
|
|
|
+ # if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
|
|
|
+ # or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
|
|
|
+ # _role = 'tenderee'
|
|
|
+ # else:
|
|
|
+ # _role = "agency"
|
|
|
+ # _direct = _group.split("_")[1]
|
|
|
+ # _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
|
+ # # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
+ # # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
+ # if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
+ # list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
+ # _flag = True
|
|
|
+ # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
+ # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
+ # _prob_weight = 1.2 if _weight=='w1' else 1
|
|
|
+ # # print('_v_group:',_group, _v_group, p_entity.entity_text)
|
|
|
+ #
|
|
|
+ # if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
|
|
|
+ # _flag = True
|
|
|
+ # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
+ # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
+ # _prob_weight = 1.2 if _weight == 'w1' else 1
|
|
|
+ # # print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
+ #
|
|
|
+ # if _i_span == 2 and _direct == "right":
|
|
|
+ # _flag = True
|
|
|
+ # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
+ # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
+ # _prob_weight = 1.2 if _weight == 'w1' else 1
|
|
|
+ # # print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
+
|
|
|
+ # # 得到结果
|
|
|
+ # if _flag:
|
|
|
+ # if _label in [2, 3, 4]:
|
|
|
+ # notfound_tenderer = False
|
|
|
+ # p_entity.label = _label
|
|
|
+ # p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
|
|
|
+ # # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
|
+ # break
|
|
|
+ # if _i_span == 0 and re.search(self.condadate_left, list_spans[_i_span]):
|
|
|
+ # candidates.append(p_entity)
|
|
|
|
|
|
elif str(p_entity.label) in ['2', '3', '4']:
|
|
|
notfound_tenderer = False
|
|
@@ -1567,8 +1682,7 @@ class RoleRulePredictor():
|
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
|
p_entity.label = 0
|
|
|
# print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
|
|
|
-
|
|
|
- if notfound_tenderer and len(candidates) == 1 and re.search(
|
|
|
+ if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
|
|
|
'(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
|
|
|
article.content[:100]):
|
|
|
for p_entity in candidates:
|
|
@@ -1760,7 +1874,7 @@ class TendereeRuleRecall():
|
|
|
self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
|
|
|
"(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
"(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
|
|
|
- self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
|
|
|
+ self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|选取|抽取|抽选|出售|标卖|比价|处置)" \
|
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
|
"[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
|
|
|
# 未识别实体尾部判断
|
|
@@ -2109,7 +2223,7 @@ class MoneyGrade():
|
|
|
self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
|
|
|
self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
|
|
|
|
|
|
- self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]
|
|
|
+ self.pattern_list = [self.tenderee_money_left_8, self.tenderer_money_left_8, self.tenderee_money_left_9, self.tenderer_money_left_9]
|
|
|
|
|
|
def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
|
|
|
sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
|
|
@@ -2127,6 +2241,8 @@ class MoneyGrade():
|
|
|
if ser:
|
|
|
groupdict = pattern.split('>')[0].replace('(?P<', '')
|
|
|
_role, _direct, _prob = groupdict.split('_')
|
|
|
+ if re.search('单价', context[-4:]) or float(entity.entity_text):
|
|
|
+ _prob = 6
|
|
|
_label = role2id.get(_role)
|
|
|
if _label != entity.label:
|
|
|
continue
|
|
@@ -2139,7 +2255,13 @@ class MoneyGrade():
|
|
|
# print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values)
|
|
|
break
|
|
|
if not_found and entity.values[entity.label] > min_prob:
|
|
|
- _prob = min_prob - 0.1 if in_att else min_prob
|
|
|
+ if re.search('单价', context[-4:]) or float(entity.entity_text)<100:
|
|
|
+ _prob = 0.6
|
|
|
+ elif in_att:
|
|
|
+ _prob = min_prob - 0.1
|
|
|
+ else:
|
|
|
+ _prob = min_prob
|
|
|
+ # _prob = min_prob - 0.1 if in_att else min_prob
|
|
|
entity.values[entity.label] = _prob + entity.values[entity.label] / 20
|
|
|
# print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
|
|
|
|
|
@@ -2671,9 +2793,9 @@ class ProductAttributesPredictor():
|
|
|
:return: 返回数量及单位
|
|
|
'''
|
|
|
quantity = quantity_text
|
|
|
- quantity = re.sub('[()(),,约]', '', quantity)
|
|
|
quantity = re.sub('[一壹]', '1', quantity)
|
|
|
- ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
|
|
|
+ quantity = re.sub('[,,约]|(\d+)', '', quantity)
|
|
|
+ ser = re.search('^(\d+\.?\d*)(?([㎡\w/]{,5})', quantity)
|
|
|
if ser:
|
|
|
quantity = str(ser.group(1))
|
|
|
quantity_unit = ser.group(2)
|
|
@@ -3302,7 +3424,7 @@ class DocChannel():
|
|
|
self.type_dic = {
|
|
|
'土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
|
|
|
'拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
|
|
|
- '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
|
|
|
+ '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
|
|
|
'采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
|
|
|
}
|
|
|
|
|
@@ -4929,6 +5051,8 @@ class TablePremExtractor(object):
|
|
|
return flag, contain_header, dict()
|
|
|
num = 0
|
|
|
for k, v in self.head_rule_dic.items():
|
|
|
+ if re.search('评分|得分|分数|分值', text):
|
|
|
+ continue
|
|
|
if re.search(v, text):
|
|
|
if k in ['tenderer'] and re.search('是否', text):
|
|
|
continue
|
|
@@ -4954,6 +5078,9 @@ class TablePremExtractor(object):
|
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
|
return flag, contain_header, header_dic
|
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
|
+ if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
|
+ # print('只有供应商名称 没排名和包号的去掉')
|
|
|
+ return flag, contain_header, dict()
|
|
|
return flag,contain_header, header_dic
|
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
contain_header = True
|
|
@@ -4979,9 +5106,11 @@ class TablePremExtractor(object):
|
|
|
:param nlp_enterprise: 公告中的角色实体列表
|
|
|
:return:
|
|
|
'''
|
|
|
+ text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
+ , ',', text)
|
|
|
if text in nlp_enterprise:
|
|
|
return text
|
|
|
- if len(text) > 25 or len(text)<4:
|
|
|
+ if len(text) > 50 or len(text)<4:
|
|
|
return ''
|
|
|
ners = getNers([text], useselffool=True)
|
|
|
roles = []
|
|
@@ -5033,8 +5162,8 @@ class TablePremExtractor(object):
|
|
|
project_name = ''
|
|
|
previous_package = package_code
|
|
|
|
|
|
- if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
|
|
|
- continue
|
|
|
+ if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包
|
|
|
+ break
|
|
|
if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
|
|
|
continue
|
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
@@ -5084,7 +5213,11 @@ class TablePremExtractor(object):
|
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
break
|
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
|
- budget, money_unit = money_process(budget_, budget_header)
|
|
|
+ budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
|
+
|
|
|
+ if (re.search('费率|下浮率|[%%‰折]',
|
|
|
+ budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
|
+ budget = 0
|
|
|
if budget > 0:
|
|
|
if same_package and prem_dic[package]['tendereeMoney'] != budget: #
|
|
|
prem_dic[package]['tendereeMoney'] += budget
|
|
@@ -5110,7 +5243,13 @@ class TablePremExtractor(object):
|
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
|
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
break
|
|
|
- bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and 'bid_amount' in headers else (0, '')
|
|
|
+
|
|
|
+ bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
+
|
|
|
+ bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
|
+ if (re.search('费率|下浮率|[%%‰折]',
|
|
|
+ bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
|
+ bid_amount = 0
|
|
|
prem_dic[package]['roleList'].append({
|
|
|
"address": "",
|
|
|
"linklist": [],
|
|
@@ -5192,6 +5331,8 @@ class TablePremExtractor(object):
|
|
|
return rs_dic
|
|
|
|
|
|
def predict(self, html, nlp_enterprise):
|
|
|
+ html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
|
+ html = re.sub("##attachment##","",html)
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
|
self.nlp_enterprise = nlp_enterprise
|
|
@@ -5213,7 +5354,7 @@ class CandidateExtractor(object):
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否中标|是否入围|是否入库|入围结论',
|
|
|
- "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
|
+ "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
@@ -5240,14 +5381,20 @@ class CandidateExtractor(object):
|
|
|
return flag, contain_header, dict()
|
|
|
num = 0
|
|
|
for k, v in self.head_rule_dic.items():
|
|
|
+ if k == 'candidate' and re.search('第[一二三]名|第[一二三](中标|成交)?候选人', text):
|
|
|
+ continue
|
|
|
+ if re.search('评分|得分|分数|分值', text):
|
|
|
+ continue
|
|
|
if re.search(v, text):
|
|
|
if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer'] and re.search('是否', text):
|
|
|
continue
|
|
|
header_dic[k] = (i, text)
|
|
|
- if k != 'candidate': # candidate 可与前三候选重复
|
|
|
- num += 1
|
|
|
+ # if k != 'candidate': # candidate 可与前三候选重复
|
|
|
+ num += 1
|
|
|
+ if 'win_tenderer'in header_dic and 'second_tenderer' in header_dic and 'candidate' in header_dic:
|
|
|
+ header_dic.pop('candidate')
|
|
|
if num>1:
|
|
|
- print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
+ # print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
return flag, contain_header, dict()
|
|
|
if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
|
|
|
return flag, contain_header, header_dic
|
|
@@ -5275,9 +5422,11 @@ class CandidateExtractor(object):
|
|
|
:param nlp_enterprise: 公告中的角色实体列表
|
|
|
:return:
|
|
|
'''
|
|
|
+ text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
+ , ',', text)
|
|
|
if text in nlp_enterprise:
|
|
|
return text
|
|
|
- if len(text) > 25 or len(text)<4:
|
|
|
+ if len(text) > 50 or len(text)<4:
|
|
|
return ''
|
|
|
ners = getNers([text], useselffool=True)
|
|
|
roles = []
|
|
@@ -5295,6 +5444,9 @@ class CandidateExtractor(object):
|
|
|
link_set = set()
|
|
|
candidate_set = set()
|
|
|
role_dic = dict() # 保存一二三候选人并排的情况
|
|
|
+ findtop3 = False
|
|
|
+ findmoney = False
|
|
|
+ line_num = 0
|
|
|
for i in df.index:
|
|
|
package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
|
|
|
candidate_ = df.loc[i, headers['candidate'][0]] if "candidate" in headers else ""
|
|
@@ -5306,9 +5458,11 @@ class CandidateExtractor(object):
|
|
|
second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
|
|
|
third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
|
|
|
|
|
|
- if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
|
|
|
+ if set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
|
|
|
+ # print('包含表头, 停止匹配')
|
|
|
break
|
|
|
- if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # 全部为空或内容一样 停止匹配
|
|
|
+ if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # 全部为空或内容一样 停止匹配
|
|
|
+ # print('全部为空或内容一样 停止匹配')
|
|
|
break
|
|
|
|
|
|
if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
|
|
@@ -5338,8 +5492,9 @@ class CandidateExtractor(object):
|
|
|
else:
|
|
|
candidate_set.add(candidate)
|
|
|
|
|
|
- if win_tenderer and second_tenderer and third_tenderer:
|
|
|
- if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
|
|
|
+ if win_tenderer and second_tenderer: # and third_tenderer 128778062 这篇只有 第一二候选人
|
|
|
+ if re.search("(候选人|投标人|单位|公司)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人|单位|公司)名?称?", df.loc[i, 1]):
|
|
|
+ findtop3 = True
|
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
|
text = self.get_role(text, self.nlp_enterprise)
|
|
@@ -5352,6 +5507,7 @@ class CandidateExtractor(object):
|
|
|
candidate_set.add(text)
|
|
|
|
|
|
elif re.search('投标报价|报价$', df.loc[i, 0]) or re.search('投标报价|报价$', df.loc[i, 1]):
|
|
|
+ findmoney = True
|
|
|
header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
|
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
@@ -5359,13 +5515,20 @@ class CandidateExtractor(object):
|
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
break
|
|
|
money, money_unit = money_process(text, header)
|
|
|
+
|
|
|
+ if (re.search('费率|下浮率|[%%‰折]', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
|
+ money = 0
|
|
|
if money > 0:
|
|
|
if type not in role_dic:
|
|
|
role_dic[type] = dict()
|
|
|
role_dic[type]['money'] = money
|
|
|
role_dic[type]['money_unit'] = money_unit
|
|
|
else:
|
|
|
- break
|
|
|
+ line_num += 1
|
|
|
+ if findtop3 and findmoney:
|
|
|
+ break
|
|
|
+ if line_num > 3:
|
|
|
+ break
|
|
|
elif candidate and win_sort:
|
|
|
role_type = ""
|
|
|
if re.search('第[一1]|^[一1]$', win_sort):
|
|
@@ -5386,6 +5549,11 @@ class CandidateExtractor(object):
|
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
break
|
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
|
+
|
|
|
+ header = headers['bid_amount'][1] if "bid_amount" in headers else ''
|
|
|
+ if (re.search('费率|下浮率|[%%‰折]',
|
|
|
+ header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
|
+ bid_amount = 0
|
|
|
prem_dic[package]['roleList'].append({
|
|
|
"address": "",
|
|
|
"linklist": [],
|
|
@@ -5433,7 +5601,6 @@ class CandidateExtractor(object):
|
|
|
})
|
|
|
if len(prem_dic[package]['roleList']) == 0: # 只有项目编号和名称的 丢弃
|
|
|
prem_dic.pop(package)
|
|
|
-
|
|
|
return prem_dic, candidate_set
|
|
|
|
|
|
def get_prem(self, soup):
|
|
@@ -5461,9 +5628,10 @@ class CandidateExtractor(object):
|
|
|
else:
|
|
|
# print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
|
|
|
break
|
|
|
- if len(table_items) > 1:
|
|
|
+ if len(table_items) >= 1:
|
|
|
df = pd.DataFrame(table_items)
|
|
|
prem_, candidate_set_ = self.extract_from_df(df, headers)
|
|
|
+ # print('prem_: ', prem_)
|
|
|
rs_dic.update(prem_)
|
|
|
candidate_set.update(candidate_set_)
|
|
|
i = j - 1
|
|
@@ -5491,6 +5659,8 @@ class CandidateExtractor(object):
|
|
|
def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
|
|
|
self.nlp_enterprise = nlp_enterprise
|
|
|
html = html.replace('比选申请单位', '中标候选人') # 82347769
|
|
|
+ html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
|
+ html = re.sub("##attachment##","",html)
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
|
if richText:
|