|
@@ -28,6 +28,7 @@ import calendar
|
|
|
import datetime
|
|
|
from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
|
from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
|
|
|
+from BiddingKG.dl.interface.getAttributes import turnMoneySource
|
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
|
|
|
|
cpu_num = int(os.environ.get("CPU_NUM",0))
|
|
@@ -70,7 +71,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
'candidate': {"predictor": None, "Lock": RLock()},
|
|
|
'websource_tenderee': {"predictor": None, "Lock": RLock()},
|
|
|
'project_label': {"predictor": None, "Lock": RLock()},
|
|
|
- 'pb_extract': {"predictor": None, "Lock": RLock()}
|
|
|
+ 'pb_extract': {"predictor": None, "Lock": RLock()},
|
|
|
+ 'approval': {"predictor": None, "Lock": RLock()} # 审批项目预测
|
|
|
}
|
|
|
|
|
|
|
|
@@ -124,6 +126,8 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]['predictor'] = ProjectLabel()
|
|
|
if _type == 'pb_extract':
|
|
|
dict_predictor[_type]['predictor'] = PBPredictor()
|
|
|
+ if _type == 'approval':
|
|
|
+ dict_predictor[_type]['predictor'] = ApprovalPredictor()
|
|
|
return dict_predictor[_type]["predictor"]
|
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
@@ -1596,15 +1600,12 @@ class RoleRulePredictor():
|
|
|
_list_name = self._check_input(list_name, ignore=True)
|
|
|
find_flag = False
|
|
|
for _name in _list_name: # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
|
|
|
- if str(_name).find(re.sub(")", ")", re.sub("(", "(",
|
|
|
- p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
|
|
|
+ if str(_name).find(p_entity.entity_text) >= 0 and p_entity.sentence_index < 4:
|
|
|
for _sentence in list_sentence:
|
|
|
if _sentence.sentence_index == p_entity.sentence_index:
|
|
|
_span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
end_index=p_entity.end_index, size=20, center_include=True,
|
|
|
- word_flag=True, use_text=True, text=re.sub(")", ")",
|
|
|
- re.sub("(", "(",
|
|
|
- p_entity.entity_text)))
|
|
|
+ word_flag=True, use_text=True, text=p_entity.entity_text)
|
|
|
if _span[2].startswith(":"): # 实体后面为冒号的不作为招标人,避免项目名称出错中标变招标 368122675 陇西兴恒建建筑有限责任公司:线路安全保护区内环境治理专项整改(第二标段)项目
|
|
|
break
|
|
|
if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
|
|
@@ -6102,7 +6103,7 @@ class TablePremExtractor(object):
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
"win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
|
+ "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
@@ -6141,7 +6142,6 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
elif k in header_dic:
|
|
|
if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text): # 总价替换单价
|
|
|
- print('总价替换单价')
|
|
|
header_dic[k] = (i, text)
|
|
|
num += 1
|
|
|
elif k == 'project_code' and text != header_dic[k][1] and 'package_code' not in header_dic\
|
|
@@ -6151,7 +6151,7 @@ class TablePremExtractor(object):
|
|
|
header_dic[k] = (i, text)
|
|
|
num += 1
|
|
|
if num>1:
|
|
|
- print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
+ # print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
return flag, contain_header, dict()
|
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
@@ -6272,7 +6272,7 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
|
continue
|
|
|
- if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐的?(中标|成交)候选人', headers['tenderer'][1])==None:
|
|
|
+ if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None:
|
|
|
tenderer = ""
|
|
|
|
|
|
if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
|
|
@@ -6958,6 +6958,134 @@ class WebsourceTenderee():
|
|
|
]}
|
|
|
return prem
|
|
|
|
|
|
class ApprovalPredictor():
    """Rule-based extractor for approval-project (审批项目) announcements.

    Two kinds of rules are used:

    * ``other_part`` patterns are searched directly in each sentence and the
      named group ``main`` is taken as the field value.
    * ``role_type`` / ``person_type`` / ``date_type`` / ``addr_type`` /
      ``money_type`` patterns are searched in the few characters that
      precede an already-recognised entity; on a hit the entity text becomes
      the field value.
    """

    def __init__(self):
        # Sentence-level patterns; the value is the ``main`` named group.
        # Raw strings are used so regex escapes such as ``\d`` and ``\w`` are
        # not treated as (invalid) Python string escapes.
        self.other_part = {
            "project_name": r"(项目|工程|采购|招标)名称:(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?",
            "project_code": r"(立案号|项目(统一)?代码|(项目|工程|采购|招标)编[号码]):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?",
            "doc_num": r"((审[批查]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案)文号|综合受理号):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)[,。]?(\w{2,10}:|$)?",
            "pro_type": r"(申报类型|项目所属行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?",
            "year_limit": r"((建设|工程|服务)年限):(?P<main>[\d个年月日.-]{2,20})[,。](\w{2,10}:|$)?",
            "construction_scale": r"(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|建设规模(如下)?):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
            "approval_items": r"((审[批查]|批[复准])事项|事项名称):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
            "properties": r"((建设|工程)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
            "approval_result": r"((审[批查]|批[复准])(结果|决定)):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
            "phone": r"联系电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|"
                     r'\+86.?1[3-9]\d{9}|'
                     r'0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[2-9]\d{6}\d?)|'
                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=[2-9]\d{6,7})|'
                     r'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
                     r'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
                     r'400\d{7}转\d{1,4}|'
                     r'[2-9]\d{6,7})[,。](\w{2,10}:|$)?'
        }

        # Context patterns looked up in front of org/company entities.
        self.role_type = {
            "declare_company": "申报(部门|机关|单位|企业|公司)",  # declaring unit
            # The stray unescaped ")" made this pattern fail to compile
            # (re.error: unbalanced parenthesis). It is now an optional
            # literal closing bracket (half- or full-width).
            "construct_company": "(业主|建设|用地)[))]?(部门|机关|单位|企业|公司)|主送机关|法人单位",  # construction unit
            "approver": "(审批|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司)",  # approving department
            "evaluation_agency": "环境影响评价机构|环评机构|评价机构|环评单位"  # environmental assessment agency
        }
        # Context patterns looked up in front of person entities.
        self.person_type = {
            "legal_person": "项目法人|法定代表人"  # project legal person
        }
        # Context patterns looked up in front of time entities.
        self.date_type = {
            "time_declare": "申报时间",
            "time_commencement": "开工时间",
            "time_completion": "竣工时间"
        }
        # Context patterns looked up in front of location entities.
        self.addr_type = {
            "project_addr": "(建设|工程|项目)(地址|地点|位置)"
        }
        # Context patterns looked up in front of money entities.
        self.money_type = {
            "total_tendereeMoney": "项目金额|项目投资|总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额",
        }

    def _empty_record(self):
        """Return a dict holding every extractable field set to ``""``."""
        record = {}
        for patterns in (self.other_part, self.role_type, self.person_type,
                         self.date_type, self.addr_type, self.money_type):
            for key in patterns:
                record[key] = ""
        record['moneysource'] = ""
        return record

    def _fill_district(self, record, text):
        """Run the 'district' predictor on *text* and copy its result into *record*.

        The area/province/city/district values are read from the
        ``'district'`` entry of the dict returned by ``get_area``.
        """
        district = getPredictor('district').get_area(text, '')['district']
        for key in ('area', 'province', 'city', 'district'):
            record[key] = district[key]

    def predict(self, list_sentences, list_entitys, span=12):
        """Extract approval-project fields from one document.

        :param list_sentences: sequence whose first element is an iterable of
            sentence objects exposing ``sentence_index`` and ``sentence_text``.
        :param list_entitys: sequence whose first element is an iterable of
            entity objects exposing ``sentence_index``, ``entity_type``,
            ``entity_text`` and ``wordOffset_begin``.
        :param span: number of characters before an entity that are searched
            for a context keyword.
        :return: a list of dicts containing only non-empty fields. When more
            than one distinct project (by code + name) is found, one dict per
            project is returned; otherwise a single dict merged over the whole
            document, or ``[]`` when nothing matched or the input is empty.
        """
        if not list_sentences:
            return []

        ordered_sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)

        # Group by the entity's own sentence_index instead of using it as a
        # list position, so gaps in the numbering cannot raise IndexError.
        entities_by_sentence = {}
        for entity in (list_entitys[0] if list_entitys else ()):
            entities_by_sentence.setdefault(entity.sentence_index, []).append(entity)

        # Which context patterns apply to which entity type.
        context_patterns = {
            'org': self.role_type,
            'company': self.role_type,
            'person': self.person_type,
            'time': self.date_type,
            'location': self.addr_type,
            'money': self.money_type,
        }

        rs_dic = self._empty_record()  # merged over the whole document
        rs_l = []                      # one entry per distinct project
        found_key = False
        code_name_set = set()          # project code + name already emitted

        for sentence in ordered_sentences:
            text = sentence.sentence_text
            multi_project = self._empty_record()  # fields found in this sentence

            for key, pattern in self.other_part.items():
                match = re.search(pattern, text)
                if match:
                    rs_dic[key] = multi_project[key] = match.group('main')
                    found_key = True

            for entity in entities_by_sentence.get(sentence.sentence_index, ()):
                if entity.entity_type == 'moneysource':
                    # Does not count as a found key on its own.
                    money_source = turnMoneySource(entity.entity_text)
                    rs_dic['moneysource'] = multi_project['moneysource'] = money_source
                    continue

                patterns = context_patterns.get(entity.entity_type)
                if patterns is None:
                    continue

                begin = entity.wordOffset_begin
                context = text[max(0, begin - span):begin]
                for key, pattern in patterns.items():
                    if re.search(pattern, context):
                        rs_dic[key] = multi_project[key] = entity.entity_text
                        found_key = True
                        # Person entities stop at the first matching key; the
                        # other entity types try every key.
                        if entity.entity_type == 'person':
                            break

            identity = multi_project['project_code'] + multi_project['project_name']
            if identity and identity not in code_name_set:
                code_name_set.add(identity)
                self._fill_district(
                    multi_project,
                    multi_project['project_name'] + multi_project['project_addr'])
                rs_l.append({k: v for k, v in multi_project.items() if v != ''})

        if len(rs_l) > 1:
            return rs_l
        if found_key:
            self._fill_district(
                rs_dic,
                rs_dic['construct_company'] + rs_dic['project_name'] + rs_dic['project_addr'])
            return [{k: v for k, v in rs_dic.items() if v != ''}]
        return []
|
|
|
|
|
|
def getSavedModel():
|
|
|
#predictor = FormPredictor()
|