|
@@ -17,6 +17,7 @@ sys.path.append(os.path.abspath("../.."))
|
|
|
from BiddingKG.dl.common.Utils import *
|
|
|
from BiddingKG.dl.interface.modelFactory import *
|
|
|
import tensorflow as tf
|
|
|
+import pandas as pd
|
|
|
from BiddingKG.dl.product.data_util import decode, process_data
|
|
|
from BiddingKG.dl.interface.Entitys import Entity
|
|
|
from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
|
|
@@ -49,7 +50,10 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
"channel": {"predictor": None, "Lock": RLock()},
|
|
|
"deposit_payment_way": {"predictor": None, "Lock": RLock()},
|
|
|
"total_unit_money": {"predictor": None, "Lock": RLock()},
|
|
|
- "industry": {"predictor": None, "Lock": RLock()}
|
|
|
+ "industry": {"predictor": None, "Lock": RLock()},
|
|
|
+ "rolegrade": {"predictor": None, "Lock": RLock()},
|
|
|
+ "moneygrade": {"predictor": None, "Lock": RLock()},
|
|
|
+ "district": {"predictor": None, "Lock": RLock()}
|
|
|
}
|
|
|
|
|
|
|
|
@@ -87,6 +91,12 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]["predictor"] = TotalUnitMoney()
|
|
|
if _type == 'industry':
|
|
|
dict_predictor[_type]["predictor"] = IndustryPredictor()
|
|
|
+ if _type == 'rolegrade':
|
|
|
+ dict_predictor[_type]["predictor"] = RoleGrade()
|
|
|
+ if _type == 'moneygrade':
|
|
|
+ dict_predictor[_type]["predictor"] = MoneyGrade()
|
|
|
+ if _type == 'district':
|
|
|
+ dict_predictor[_type]["predictor"] = DistrictPredictor()
|
|
|
return dict_predictor[_type]["predictor"]
|
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
@@ -1130,7 +1140,7 @@ class RoleRulePredictor():
|
|
|
self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
|
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
"(是|为|:|:|\s*)+$)"
|
|
|
- self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向))"
|
|
|
+ self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
|
|
|
self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
|
|
|
self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
@@ -1138,14 +1148,14 @@ class RoleRulePredictor():
|
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
|
self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
|
|
|
"(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
|
|
|
- "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
+ "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
|
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
|
- "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((]中标人名称[))]))"
|
|
|
- self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货)|中标通知书.{,15}你方" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
+ "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名?称?[))]))"
|
|
|
+ self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
# self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
|
|
|
|
|
@@ -1176,8 +1186,8 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
|
|
|
- self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
|
|
|
+ self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|限价|拦标价|预算金额|标底|总计|限额")
|
|
|
+ self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
@@ -1226,7 +1236,7 @@ class RoleRulePredictor():
|
|
|
if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: #加上前面一些信息,修复公司不在项目名称开头的,检测不到
|
|
|
find_flag = True
|
|
|
if p_entity.values[0] > on_value:
|
|
|
- p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
|
|
|
+ p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
|
|
|
else:
|
|
|
p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
|
|
|
if find_flag:
|
|
@@ -1352,8 +1362,10 @@ class RoleRulePredictor():
|
|
|
for _sentence in list_sentence:
|
|
|
if _sentence.sentence_index == p_entity.sentence_index:
|
|
|
_span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
- end_index=p_entity.end_index, size=20, center_include=True,
|
|
|
+ end_index=p_entity.end_index, size=10, center_include=True,
|
|
|
word_flag=True, text=p_entity.entity_text)
|
|
|
+ if re.search(',\w{2,}', _span[0]):
|
|
|
+ _span[0] = _span[0].split(',')[-1] #避免多个价格在一起造成误判
|
|
|
if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
|
|
|
self.pattern_money_other, _span[0]) is None:
|
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
@@ -1444,7 +1456,7 @@ class RoleRuleFinalAdd():
|
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
|
sear_ent2 = re.search('(户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
- sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|尊敬的供应商|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
|
sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
|
|
|
|
|
|
tenderee_notfound = True
|
|
@@ -1697,6 +1709,108 @@ class TendereeRuleRecall():
|
|
|
list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
break
|
|
|
|
|
|
class RoleGrade():
    """Re-grade the role probability of organisation/company entities with context rules.

    Every rule is a regular expression whose first named group is spelled
    ``<role>_<direction>_<level>``:

    * ``role`` is one of the keys of ``role2id`` in :meth:`predict`;
    * ``direction`` selects the text window the rule is searched in
      (``left`` of the entity, ``right`` of it, or ``center`` = both sides
      including the entity itself);
    * ``level`` is a digit; ``level * 0.1`` becomes the base probability.
    """

    def __init__(self):
        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|甲)(人|方|单位))"
        self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
        self.agency_left_9 = "(?P<agency_left_9>代理)"
        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一])"
        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(供应商|供货商|候选人))"
        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名))"
        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名))"
        # Order matters: the first rule that matches AND agrees with the
        # entity's current label wins.
        self.pattern_list = [self.tenderee_left_9, self.tenderee_center_9, self.tenderee_left_8, self.agency_left_9,
                             self.winTenderer_left_9, self.winTenderer_left_8, self.secondTenderer_left_9,
                             self.thirdTenderer_left_9]

    def _parse_rules(self):
        """Return ``[(compiled_regex, role, direction, base_prob), ...]`` for ``self.pattern_list``.

        The metadata is read from the text of the leading named group, the
        same way the rule strings have always been interpreted. A rule whose
        group name is malformed is reported and skipped instead of being
        evaluated against an undefined context window.
        """
        rules = []
        for pattern in self.pattern_list:
            group_name = pattern.split('>')[0].replace('(?P<', '')
            parts = group_name.split('_')
            if len(parts) != 3 or parts[1] not in ('left', 'right', 'center') or not parts[2].isdigit():
                print('规则错误', pattern)
                continue
            role, direction, level = parts
            rules.append((re.compile(pattern), role, direction, int(level) * 0.1))
        return rules

    def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
        """Overwrite ``entity.values[entity.label]`` in place according to the rules.

        Only entities of type ``org``/``company`` with a label in 0..4 and a
        current probability above 0.6 are touched.

        * A rule of the entity's own role matches: the new value is
          ``level * 0.1 + old / 20`` (0.2 lower when the sentence is flagged
          ``in_attachment``).
        * No rule matches and the old value exceeds ``min_prob``: the new
          value is ``min_prob + old / 20`` (0.1 lower in attachments).
          NOTE(review): the original docstring described the attachment tiers
          as uniformly 0.2 lower; the fallback only subtracts 0.1 - confirm
          which one is intended.

        :param list_sentences: ``list_sentences[0]`` is the iterable of sentence
            objects; ``sentence_index``, ``sentence_text`` and ``in_attachment``
            are read from each.
        :param list_entitys: ``list_entitys[0]`` is the iterable of entities;
            ``entity_type``, ``label``, ``values``, ``sentence_index``,
            ``wordOffset_begin`` and ``wordOffset_end`` are read.
        :param span: number of characters of context taken on each side.
        :param min_prob: base probability for entities no rule matched.
        :return: None, entities are modified in place.
        """
        # Look sentences up by their own index rather than by list position so
        # that gaps in the numbering cannot select the wrong sentence.
        sentence_by_index = {s.sentence_index: s for s in list_sentences[0]}
        role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
        rules = self._parse_rules()

        for entity in list_entitys[0]:
            if entity.entity_type not in ('org', 'company') or entity.label not in (0, 1, 2, 3, 4):
                continue
            if not entity.values[entity.label] > 0.6:
                continue
            sentence = sentence_by_index.get(entity.sentence_index)
            if sentence is None:
                # Without its sentence there is no context to grade against.
                continue

            text = sentence.sentence_text
            in_att = sentence.in_attachment
            b = entity.wordOffset_begin
            e = entity.wordOffset_end
            windows = {
                'left': text[max(0, b - span):b],
                'right': text[e:e + span],
                'center': text[max(0, b - span):e + span],
            }

            matched = False
            for regex, role, direction, base_prob in rules:
                # A rule for another role never changes this entity.
                if role2id.get(role) != entity.label:
                    continue
                if regex.search(windows[direction]) is None:
                    continue
                prob = base_prob - 0.2 if in_att else base_prob
                entity.values[entity.label] = prob + entity.values[entity.label] / 20
                matched = True
                break

            if not matched and entity.values[entity.label] > min_prob:
                prob = min_prob - 0.1 if in_att else min_prob
                entity.values[entity.label] = prob + entity.values[entity.label] / 20
|
|
|
+
|
|
|
+
|
|
|
class MoneyGrade():
    """Re-grade the role probability of money entities from the text to their left.

    Every rule is a regular expression whose first named group is spelled
    ``<role>_<direction>_<level>``; ``role`` is ``tenderee`` or ``tenderer``
    and ``level * 0.1`` becomes the base probability. All rules are searched
    in the window left of the entity.
    """

    def __init__(self):
        self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
        self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
        self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同))"
        self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"

        # NOTE(review): tenderer_money_left_8 is defined but was never part of
        # the active rule list; it is kept inactive here because enabling it
        # would change scores - confirm whether the omission is intentional.
        self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]

    def _parse_rules(self):
        """Return ``[(compiled_regex, role, base_prob), ...]`` for ``self.pattern_list``.

        Role and level are read from the text of the leading named group.
        A rule whose group name is malformed is reported and skipped.
        """
        rules = []
        for pattern in self.pattern_list:
            group_name = pattern.split('>')[0].replace('(?P<', '')
            parts = group_name.split('_')
            if len(parts) != 3 or not parts[2].isdigit():
                print('规则错误', pattern)
                continue
            rules.append((re.compile(pattern), parts[0], int(parts[2]) * 0.1))
        return rules

    def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
        """Overwrite ``entity.values[entity.label]`` in place for money entities.

        Only entities of type ``money`` with label 0 or 1 and a current
        probability above 0.6 are touched.

        * A rule of the entity's own role matches the ``span`` characters left
          of the entity: the new value is ``level * 0.1 + old / 20`` (0.2 lower
          when the sentence is flagged ``in_attachment``).
        * No rule matches and the old value exceeds ``min_prob``: the new value
          is ``min_prob + old / 20`` (0.1 lower in attachments).

        :param list_sentences: ``list_sentences[0]`` is the iterable of sentence
            objects; ``sentence_index``, ``sentence_text`` and ``in_attachment``
            are read from each.
        :param list_entitys: ``list_entitys[0]`` is the iterable of entities;
            ``entity_type``, ``label``, ``values``, ``sentence_index`` and
            ``wordOffset_begin`` are read.
        :param span: number of characters of left context.
        :param min_prob: base probability for entities no rule matched.
        :return: None, entities are modified in place.
        """
        # Look sentences up by their own index rather than by list position so
        # that gaps in the numbering cannot select the wrong sentence.
        sentence_by_index = {s.sentence_index: s for s in list_sentences[0]}
        role2id = {"tenderee": 0, "tenderer": 1}
        rules = self._parse_rules()

        for entity in list_entitys[0]:
            if entity.entity_type not in ('money',) or entity.label not in (0, 1):
                continue
            if not entity.values[entity.label] > 0.6:
                continue
            sentence = sentence_by_index.get(entity.sentence_index)
            if sentence is None:
                # Without its sentence there is no context to grade against.
                continue

            text = sentence.sentence_text
            in_att = sentence.in_attachment
            b = entity.wordOffset_begin
            context = text[max(0, b - span):b]

            matched = False
            for regex, role, base_prob in rules:
                # A rule for the other role never changes this entity.
                if role2id.get(role) != entity.label:
                    continue
                if regex.search(context) is None:
                    continue
                prob = base_prob - 0.2 if in_att else base_prob
                entity.values[entity.label] = prob + entity.values[entity.label] / 20
                matched = True
                break

            if not matched and entity.values[entity.label] > min_prob:
                prob = min_prob - 0.1 if in_att else min_prob
                entity.values[entity.label] = prob + entity.values[entity.label] / 20
|
|
|
+
|
|
|
+
|
|
|
# 时间类别
|
|
|
class TimePredictor():
|
|
|
def __init__(self,config=None):
|
|
@@ -2566,13 +2680,13 @@ class DocChannel():
|
|
|
'公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
|
|
|
'候选人公示': '候选人公示|评标结果公示',
|
|
|
'中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
|
- '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购',
|
|
|
+ '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
|
|
|
'中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
|
|
|
'中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示',
|
|
|
# |确定成交供应商[:,\s]
|
|
|
'合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|签订日期)|(供应商乙方|乙方供应商):|合同总?金额',
|
|
|
- '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
|
|
|
- '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?原因|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人数|供应商|单位)不足|已终止'
|
|
|
+ '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
|
|
|
+ '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)'
|
|
|
}
|
|
|
self.title_life_dic = {
|
|
|
'采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
|
|
@@ -2843,9 +2957,9 @@ class DocChannel():
|
|
|
else:
|
|
|
html = html[:ser.start() + 500]
|
|
|
text = re.sub('<[^<]*?>', '', html).replace(' ', ' ')
|
|
|
- text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
|
|
|
+ # text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
|
|
|
text = re.sub('\s+', ' ', text)
|
|
|
- text = re.sub('[/|[()()]', '', text)
|
|
|
+ # text = re.sub('[/|[()()]', '', text)
|
|
|
text = cut_single_cn_space(text)
|
|
|
return text[:20000]
|
|
|
|
|
@@ -2948,7 +3062,6 @@ class DocChannel():
|
|
|
life_list = [k]
|
|
|
elif life_score[k] == max_score and life_score[k] > 0:
|
|
|
life_list.append(k)
|
|
|
-
|
|
|
if '采购意向' in life_kw_title or '采购意向' in life_list:
|
|
|
return '采购意向', msc
|
|
|
elif '招标预告' in life_kw_title or '招标预告' in life_list:
|
|
@@ -2976,18 +3089,23 @@ class DocChannel():
|
|
|
elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
|
|
|
if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
|
|
|
return '招标公告', msc
|
|
|
+ elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
|
|
|
+ return '废标公告', msc
|
|
|
return '候选人公示', msc
|
|
|
elif '合同公告' in life_kw_title or '合同公告' in life_list:
|
|
|
if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
|
|
|
return '招标公告', msc
|
|
|
+ elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
|
|
|
+ return '废标公告', msc
|
|
|
return '合同公告', msc
|
|
|
+
|
|
|
elif '中标信息' in life_kw_title or '中标信息' in life_list:
|
|
|
if '招标公告' in life_kw_title and life_score.get('招标公告',
|
|
|
0) > 2: # (life_score.get('招标公告', 0)>2 or life_score.get('中标信息', 0)<4) 0.7886409793924245
|
|
|
return '招标公告', msc
|
|
|
- elif '废标公告' in life_kw_title:
|
|
|
+ elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
|
|
|
return '废标公告', msc
|
|
|
- elif life_score.get('候选人公示', 0) >= 3:
|
|
|
+ elif life_score.get('候选人公示', 0) > 3:
|
|
|
return '候选人公示', msc
|
|
|
elif life_score.get('合同公告', 0) > 5:
|
|
|
return '合同公告', msc
|
|
@@ -3050,7 +3168,7 @@ class DocChannel():
|
|
|
2、废标公告有中标人且标题无废标关键词,返回中标信息
|
|
|
3、答疑公告标题无答疑关键且原始为招标,返回原始类别
|
|
|
4、招标公告有中标人且原始为中标,返回中标信息
|
|
|
- 5、预测及原始均在招标、预告、意向,返回原始类别
|
|
|
+ 5、预测为招标,原始为预告、意向,返回原始类别
|
|
|
6、预测及原始均在变更、答疑,返回原始类别
|
|
|
7、预测为采招数据,原始为产权且有关键词,返回原始类别
|
|
|
8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别
|
|
@@ -3073,8 +3191,8 @@ class DocChannel():
|
|
|
original_docchannel, '') == '中标信息':
|
|
|
result['docchannel']['docchannel'] = '中标信息'
|
|
|
msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
|
|
|
- elif result['docchannel']['docchannel'] in ['招标公告', '采购意向', '招标预告'] and origin_dic.get(
|
|
|
- original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
|
|
|
+ elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
|
|
|
+ original_docchannel, '') in ['采购意向', '招标预告']:
|
|
|
result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
msc += '最终规则修改:预测及原始均在招标、预告、意向,返回原始类别'
|
|
|
elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
|
|
@@ -3774,6 +3892,120 @@ class IndustryPredictor():
|
|
|
}
|
|
|
}
|
|
|
|
|
|
class DistrictPredictor():
    """Guess province / city / district of a notice from names found in its key fields.

    The lookup table ``district_dic.pkl`` maps a region id to a record; the
    fields read here are ``简称`` (short name), ``全称`` (full name), ``area``
    (list of ids from province downwards, at most three entries are used),
    ``权重`` (weight), ``类型`` (type code), ``地区`` (display name) and ``大区``
    (macro region).

    NOTE(review): this class was reconstructed from a source whose indentation
    had been flattened. Two nesting decisions could not be recovered with
    certainty and are marked with NOTE(review) in :meth:`predict`.
    """

    # Characters/words that, directly after a short name, indicate a street,
    # village, river ... rather than the administrative region itself.
    _NON_REGION_SUFFIX = '^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)'

    def __init__(self):
        # os.path.join keeps the path relative when dirname is empty instead
        # of silently pointing at the filesystem root.
        dict_path = os.path.join(os.path.dirname(__file__), 'district_dic.pkl')
        # NOTE: pickle executes code on load; this is acceptable only because
        # the file ships with the package. Never load a user-supplied file here.
        with open(dict_path, 'rb') as f:
            dist_dic = pickle.load(f)

        short2id = {}
        full2id = {}
        for k, v in dist_dic.items():
            short2id.setdefault(v['简称'], []).append(k)
            full2id.setdefault(v['全称'], []).append(k)

        self.dist_dic = dist_dic
        self.short_name = self._alternation(short2id)
        self.full_name = self._alternation(full2id)
        self.short2id = short2id
        self.full2id = full2id

    @staticmethod
    def _alternation(names):
        """Join names into one regex alternation, longest first.

        Names are escaped so a special character cannot alter the pattern,
        and empty names are dropped because an empty alternative would match
        at every position of the text.
        """
        usable = sorted((n for n in set(names) if n), key=len, reverse=True)
        return '|'.join(re.escape(n) for n in usable)

    @staticmethod
    def _get_ree_addr(prem):
        """Return ``(tenderee, tenderee_address)`` of the first tenderee in ``prem``.

        Reads ``prem[0]['prem'][*]['roleList'][*]`` entries with the keys
        ``role_name``, ``role_text`` and ``address``. Extraction is
        deliberately best effort: on any structural problem the values found
        so far are returned.
        """
        tenderee = ""
        tenderee_address = ""
        try:
            for v in prem[0]['prem'].values():
                for link in v['roleList']:
                    if link['role_name'] == 'tenderee' and tenderee == "":
                        tenderee = link['role_text']
                        tenderee_address = link['address']
        except Exception:
            print('解析prem 获取招标人、及地址出错')
        return tenderee, tenderee_address

    def _padded_area(self, _id):
        """Return the ``area`` id chain of ``_id`` padded with '' to three entries."""
        area = self.dist_dic[_id]['area']
        return area + [''] * (3 - len(area))

    @staticmethod
    def _top_scoring(df, column):
        """Return the value of ``column`` whose rows have the largest summed score."""
        # Only the score column is aggregated; summing the id/name columns is
        # meaningless and behaves differently across pandas versions.
        return df.groupby(column)['score'].sum().sort_values(ascending=False).index[0]

    def predict(self, project_name, prem, web_source_name = ""):
        """Predict the region of a notice.

        :param project_name: project name; the tenderee name is removed from it
            before matching.
        :param prem: extraction result from which tenderee name and address are
            read (see :meth:`_get_ree_addr`).
        :param web_source_name: name of the publishing site; used as weak
            evidence unless it contains '公司'. ``None`` is treated as "".
        :return: ``{'district': {'area', 'province', 'city', 'district'}}``;
            unknown levels keep the defaults '全国' / '未知'.
        """
        web_source_name = web_source_name or ""
        tenderee, tenderee_address = self._get_ree_addr(prem)
        project_name = str(project_name).replace(str(tenderee), '')
        text = "{} {} {}".format(project_name, tenderee, tenderee_address)
        # These words contain characters that collide with region short names.
        text = re.sub('复合肥|铁路|公路', ' ', text)
        score_l = []
        id_set = set()

        if self.short_name and re.search(self.short_name, text):
            if self.full_name:
                for it in re.finditer(self.full_name, text):
                    name = it.group(0)
                    score = len(name) / len(text)
                    for _id in self.full2id[name]:
                        w = self.dist_dic[_id]['权重']
                        score_l.append([_id, score + w] + self._padded_area(_id))

            for it in re.finditer(self.short_name, text):
                # A short name at the very end of the text cannot be followed
                # by a street/village suffix, so it is a valid hit as well.
                if re.search(self._NON_REGION_SUFFIX, text[it.end():]) is not None:
                    continue
                name = it.group(0)
                # Later positions in the text score higher.
                score = (it.start() + len(name)) / len(text)
                for _id in self.short2id[name]:
                    score2 = 0
                    w = self.dist_dic[_id]['权重']
                    _type = self.dist_dic[_id]['类型']
                    area = self._padded_area(_id)
                    if area[0] in ['2', '16', '20', '30']:
                        # presumably the ids of the province-level
                        # municipalities - TODO confirm against the table.
                        _type += 10
                        # NOTE(review): indentation was lost in the source;
                        # this bonus is assumed to belong to this branch.
                        score2 += w
                    if _id not in id_set:
                        # First occurrence of a region gets a type bonus.
                        if _type == 20:
                            type_w = 3
                        elif _type == 30:
                            type_w = 2
                        else:
                            type_w = 1
                        id_set.add(_id)
                        score2 += w * type_w
                    score_l.append([_id, score * w + score2] + area)

        # NOTE(review): assumed to run regardless of whether the text itself
        # contained a region name (nesting not recoverable from the source).
        if self.short_name and re.search('公司', web_source_name) is None:
            for it in re.finditer(self.short_name, web_source_name):
                name = it.group(0)
                for _id in self.short2id[name]:
                    w = self.dist_dic[_id]['权重']
                    score_l.append([_id, w * 0.2] + self._padded_area(_id))

        area_dic = {'area': '全国', 'province': '未知', 'city': '未知', 'district': '未知'}
        if len(score_l) == 0:
            return {'district': area_dic}

        df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
        pro_id = self._top_scoring(df, 'province')
        area_dic['province'] = self.dist_dic[pro_id]['地区']
        area_dic['area'] = self.dist_dic[pro_id]['大区']

        # Narrow down level by level: city within the winning province, then
        # district within the winning city.
        df = df[df['city'] != ""]
        df = df[df['province'] == pro_id]
        if len(df) > 0:
            city_id = self._top_scoring(df, 'city')
            area_dic['city'] = self.dist_dic[city_id]['地区']
            df = df[df['district'] != ""]
            df = df[df['city'] == city_id]
            if len(df) > 0:
                dist_id = self._top_scoring(df, 'district')
                area_dic['district'] = self.dist_dic[dist_id]['地区']
        return {'district': area_dic}
|
|
|
|
|
|
|
|
|
def getSavedModel():
|