|
@@ -55,6 +55,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
"moneygrade": {"predictor": None, "Lock": RLock()},
|
|
|
"district": {"predictor": None, "Lock": RLock()},
|
|
|
'tableprem': {"predictor": None, "Lock": RLock()},
|
|
|
+ 'candidate': {"predictor": None, "Lock": RLock()},
|
|
|
}
|
|
|
|
|
|
|
|
@@ -100,6 +101,8 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]["predictor"] = DistrictPredictor()
|
|
|
if _type == 'tableprem':
|
|
|
dict_predictor[_type]["predictor"] = TablePremExtractor()
|
|
|
+ if _type == 'candidate':
|
|
|
+ dict_predictor[_type]["predictor"] = CandidateExtractor()
|
|
|
return dict_predictor[_type]["predictor"]
|
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
@@ -3499,7 +3502,7 @@ class DocChannel():
|
|
|
self.title_life_dic['废标公告'], title) == None:
|
|
|
result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
|
|
|
- elif result['docchannel']['doctype'] != '采招数据' and origin_dic.get(
|
|
|
+ elif result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get(
|
|
|
original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨', title)==None:
|
|
|
result['docchannel']['doctype'] = '采招数据'
|
|
|
msc += '最终规则修改:预测为非采招数据,原始为采招数据且无关键词,返回采招数据'
|
|
@@ -4320,23 +4323,8 @@ class DistrictPredictor():
|
|
|
rs = rs2
|
|
|
return rs
|
|
|
|
|
|
-class TablePremExtractor(object):
|
|
|
- def __init__(self):
|
|
|
- '''各要素表头规则'''
|
|
|
- self.head_rule_dic = {
|
|
|
- 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
|
|
|
- 'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
- "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
|
|
|
- "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
|
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
|
|
|
- "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
- "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
|
|
|
- }
|
|
|
-
|
|
|
- with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
|
- self.headerset = pickle.load(f)
|
|
|
-
|
|
|
+class TableTag2List():
|
|
|
+ '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
|
|
|
def table2list(self, table):
|
|
|
self._output = []
|
|
|
row_ind = 0
|
|
@@ -4415,6 +4403,27 @@ class TablePremExtractor(object):
|
|
|
if self._output[i][j] == "":
|
|
|
self._output[i][j] = val
|
|
|
|
|
|
+
|
|
|
+class TablePremExtractor(object):
|
|
|
def __init__(self):
    """Set up header rules, the known-header vocabulary and the table parser.

    Attributes created:
        head_rule_dic: maps an element name (project code, package code,
            project name, ranking, tenderer, tenderee, budget, bid amount)
            to the regex that recognises a table header cell for it.
        headerset: object unpickled from ``header_set.pkl`` located next to
            this module (used elsewhere in the class for header detection).
        tb: a ``TableTag2List`` helper used to turn a soup ``<table>`` into
            a list of rows.
    """
    # Header rules for each element that can be read from a table.
    self.head_rule_dic = {
        'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
        'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
        "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
        "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
        "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
        "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
        "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
        "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
    }

    # os.path.join instead of string concatenation: with an empty dirname the
    # old "dirname + '/header_set.pkl'" pointed at the filesystem root.
    header_path = os.path.join(os.path.dirname(__file__), 'header_set.pkl')
    # NOTE: pickle must only ever be loaded from this trusted, packaged file.
    with open(header_path, 'rb') as f:
        self.headerset = pickle.load(f)

    self.tb = TableTag2List()
|
|
|
+
|
|
|
+
|
|
|
def find_header(self, td_list):
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
@@ -4440,7 +4449,7 @@ class TablePremExtractor(object):
|
|
|
return flag, dict()
|
|
|
|
|
|
def is_role(self, text):
|
|
|
- if len(text) > 25 or len(text)<5:
|
|
|
+ if len(text) > 25 or len(text)<4:
|
|
|
return False
|
|
|
elif len(re.findall('有限责?任?公司', text)) > 1:
|
|
|
return False
|
|
@@ -4469,6 +4478,10 @@ class TablePremExtractor(object):
|
|
|
bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
|
|
|
win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
|
|
|
|
|
|
+ if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
|
|
|
+ package_code_raw = project_name
|
|
|
+ project_name = ""
|
|
|
+
|
|
|
package_code = package_code_raw
|
|
|
if re.search('合计|总计', package_code+project_code):
|
|
|
continue
|
|
@@ -4481,7 +4494,7 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort) == None:
|
|
|
continue
|
|
|
- if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and 'bid_amount' in headers and re.search('(中标|成交)价', headers['bid_amount'][1])==None:
|
|
|
+ if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
|
|
|
tenderer = ""
|
|
|
|
|
|
tenderee = tenderee if self.is_role(tenderee) else ""
|
|
@@ -4580,7 +4593,7 @@ class TablePremExtractor(object):
|
|
|
tables.reverse()
|
|
|
rs_dic = {}
|
|
|
for table in tables:
|
|
|
- trs = self.table2list(table)
|
|
|
+ trs = self.tb.table2list(table)
|
|
|
table.extract()
|
|
|
i = 0
|
|
|
headers = ""
|
|
@@ -4617,6 +4630,270 @@ class TablePremExtractor(object):
|
|
|
prem = self.get_prem(richText)
|
|
|
return prem
|
|
|
|
|
|
class CandidateExtractor(object):
    """Extract bid candidates (and ranked tenderers) from an announcement.

    Candidates are read from HTML tables first; when no table yields anything
    they are looked up in the running text right after a "candidate:" style
    label.
    """

    # Role names for the first / second / third ranked candidate, in order.
    _ROLE_KEYS = ('win_tenderer', 'second_tenderer', 'third_tenderer')

    def __init__(self):
        """Build the header rules, the free-text pattern and load resources.

        Attributes created:
            head_rule_dic: element name -> regex recognising its header cell.
            p: regex matched against the text just before an entity to decide
                whether the entity is introduced as a candidate.
            tb: ``TableTag2List`` helper converting a soup table to row lists.
            headerset: object unpickled from ``header_set.pkl`` next to this
                module; ``find_header`` intersects it with a row's cell set.
        """
        # Header rules for each element that can be read from a table.
        self.head_rule_dic = {
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            "win_sort": "排名|排序|名次",
            'win_or_not': '是否中标|是否入围|是否入库|入围结论',
            "candidate": r"((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
            "win_tenderer": "第一名|第一(中标|成交)?候选人",
            "second_tenderer": "第二名|第二(中标|成交)?候选人",
            "third_tenderer": "第三名|第三(中标|成交)?候选人",
        }
        # Pattern for candidates that appear outside tables (label before name).
        self.p = r'((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?:$'
        self.tb = TableTag2List()
        # os.path.join instead of string concatenation: with an empty dirname
        # the old "dirname + '/header_set.pkl'" pointed at the filesystem root.
        header_path = os.path.join(os.path.dirname(__file__), 'header_set.pkl')
        # NOTE: pickle must only ever be loaded from this trusted, packaged file.
        with open(header_path, 'rb') as f:
            self.headerset = pickle.load(f)

    def find_header(self, td_list):
        """Decide whether a table row is a header row and map its columns.

        :param td_list: list of cell texts of one table row.
        :return: ``(flag, header_dic)``. ``flag`` is True when the row has at
            least two distinct cells and at least 60% of them are known header
            texts. ``header_dic`` maps element name -> ``(column index, cell
            text)``; it is empty unless the row contains a candidate column or
            both a first- and second-candidate column.
        """
        header_dic = dict()
        distinct = set(td_list)
        flag = len(distinct) >= 2 and len(distinct & self.headerset) / len(distinct) >= 0.6
        if not flag:
            return flag, dict()

        for col, text in enumerate(td_list):
            if len(text) > 15:  # cells longer than 15 chars are never headers
                continue
            if re.search('未(中标|成交)原因', text):  # such tables are not extracted
                return flag, dict()
            matched = 0
            for key, rule in self.head_rule_dic.items():
                if re.search(rule, text):
                    header_dic[key] = (col, text)
                    if key != 'candidate':  # 'candidate' may overlap the top-three rules
                        matched += 1
            if matched > 1:
                print('表头错误,一个td匹配到两个表头:', header_dic)
                return flag, dict()

        if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
            return flag, header_dic
        return flag, dict()

    def is_role(self, text):
        """Return True when ``text`` looks like a single organisation name.

        Texts shorter than 4 or longer than 25 characters, or mentioning more
        than one "...有限公司", are rejected. Otherwise a suffix regex is tried
        first and the ``selffool`` NER model is used as a fallback.
        """
        if len(text) > 25 or len(text) < 4:
            return False
        if len(re.findall('有限责?任?公司', text)) > 1:
            return False
        if re.search(r'[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
            return True
        ners = selffool.ner(text)
        # Accept only when exactly one entity is found and it is a company/org.
        return len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0])

    def money_process(self, money_text, header):
        """Normalise an amount cell into a number and its unit.

        :param money_text: raw text of the amount cell.
        :param header: header text of the amount column/row; when it contains
            "万元" and the amount itself carries no "万", the unit is appended
            before conversion.
        :return: ``(money, money_unit)``; ``(0, "")`` when no amount is found.
        """
        money = 0
        money_unit = ""
        re_price = re.search(r"[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
        if re_price:
            money_text = re_price.group(0)
            if '万元' in header and '万' not in money_text:
                money_text += '万元'
            money = float(str(getUnifyMoney(money_text)))
            if money > 10000000000000:  # implausibly large amounts are dropped
                money = 0
            money_unit = '万元' if '万' in money_text else '元'
        return (money, money_unit)

    @staticmethod
    def _cell(df, row, headers, key):
        """Return the cell of ``row`` in the column mapped to ``key``, or ""."""
        return df.loc[row, headers[key][0]] if key in headers else ""

    @staticmethod
    def _new_package():
        """Return an empty package record."""
        return {
            'code': '',
            'name': '',
            'roleList': [],
            'tendereeMoney': 0,
            'tendereeMoneyUnit': ""
        }

    @staticmethod
    def _role_entry(role_name, role_text, money, money_unit):
        """Return one ``roleList`` record for a ranked candidate."""
        return {
            "address": "",
            "linklist": [],
            "role_money": {
                "discount_ratio": "",
                "downward_floating_ratio": "",
                "floating_ratio": "",
                "money": money,
                "money_unit": money_unit
            },
            "role_name": role_name,
            "role_text": role_text,
            "serviceTime": ""
        }

    @staticmethod
    def _rank_to_role(win_sort):
        """Map a ranking cell (e.g. "第一", "2") to a role name, or ""."""
        if re.search('第[一1]|^[一1]$', win_sort):
            return "win_tenderer"
        if re.search('第[二2]|^[二2]$', win_sort):
            return "second_tenderer"
        if re.search('第[三3]|^[三3]$', win_sort):
            return "third_tenderer"
        return ""

    def extract_from_df(self, df, headers):
        """Extract ranked tenderers and the candidate set from one table body.

        :param df: DataFrame built from the data rows (integer column labels).
        :param headers: mapping produced by ``find_header``.
        :return: ``(prem_dic, candidate_set)`` where ``prem_dic`` maps package
            name -> package record and ``candidate_set`` holds candidate names.
        """
        prem_dic = {}
        link_set = set()
        candidate_set = set()
        role_dic = dict()  # first/second/third candidates laid out side by side
        for i in df.index:
            package_code = self._cell(df, i, headers, 'package_code')
            candidate_ = self._cell(df, i, headers, 'candidate')
            win_or_not = self._cell(df, i, headers, 'win_or_not')
            bid_amount_ = self._cell(df, i, headers, 'bid_amount')
            win_sort = self._cell(df, i, headers, 'win_sort')
            win_tenderer = self._cell(df, i, headers, 'win_tenderer')
            second_tenderer = self._cell(df, i, headers, 'second_tenderer')
            third_tenderer = self._cell(df, i, headers, 'third_tenderer')

            candidate = candidate_ if self.is_role(candidate_) else ""

            # Skip rows already seen. The same (raw) key is used for the test
            # and for the insert so duplicates are detected consistently.
            row_key = (candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_)
            if row_key in link_set:
                continue
            link_set.add(row_key)

            package = uniform_package_name(package_code) if package_code != "" else "Project"

            # Only validated names are collected; rows explicitly marked as
            # not selected are left out.
            if candidate and not (win_or_not and re.search('否|未入围', win_or_not)):
                candidate_set.add(candidate)

            if win_tenderer and second_tenderer and third_tenderer:
                ranked = list(zip(self._ROLE_KEYS, (win_tenderer, second_tenderer, third_tenderer)))
                first_cell, second_cell = df.loc[i, 0], df.loc[i, 1]
                if re.search("(候选人|投标人)名?称?$", first_cell) or re.search("(候选人|投标人)名?称?", second_cell):
                    # Row holding the three candidate names.
                    for role_key, text in ranked:
                        if not self.is_role(text):
                            continue
                        role_dic.setdefault(role_key, dict())['role_text'] = text
                        # NOTE(review): nesting under the is_role check is
                        # assumed; confirm against the original indentation.
                        if role_key in ['second_tenderer', 'third_tenderer']:
                            candidate_set.add(text)
                elif re.search('投标报价|报价$', first_cell) or re.search('投标报价|报价$', second_cell):
                    # Row holding the three bid prices; the label cell acts as header.
                    header = first_cell if re.search('投标报价|报价$', first_cell) else second_cell
                    for role_key, text in ranked:
                        money, money_unit = self.money_process(text, header)
                        # NOTE(review): the break is assumed to pair with the
                        # amount check (stop at the first unusable price).
                        if not money > 0:
                            break
                        slot = role_dic.setdefault(role_key, dict())
                        slot['money'] = money
                        slot['money_unit'] = money_unit
            elif candidate and win_sort:
                role_type = self._rank_to_role(win_sort)
                if role_type != "":
                    if "bid_amount" in headers:
                        # Pass the column header text so a "万元" unit stated
                        # in the header is honoured.
                        bid_amount, money_unit = self.money_process(bid_amount_, headers['bid_amount'][1])
                    else:
                        bid_amount, money_unit = 0, ""
                    if package not in prem_dic:
                        prem_dic[package] = self._new_package()
                    prem_dic[package]['roleList'].append(
                        self._role_entry(role_type, candidate, bid_amount, money_unit))

        # Fall back to the side-by-side layout when no ranked rows were found.
        # `package` is the package of the last processed row.
        if role_dic and prem_dic == dict():
            roles = [
                self._role_entry(role_type, v['role_text'], v.get('money', 0), v.get('money_unit', ''))
                for role_type, v in role_dic.items()
                if v.get('role_text', '') != ""
            ]
            if roles:  # a package without any role is not reported
                prem_dic[package] = self._new_package()
                prem_dic[package]['roleList'] = roles

        return prem_dic, candidate_set

    def get_prem(self, soup):
        """Scan every table of ``soup`` and merge what each one yields.

        Tables are processed last-to-first and removed from ``soup`` as they
        are handled.

        :return: ``(rs_dic, candidate_set)`` merged over all tables.
        """
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        candidate_set = set()
        for table in tables:
            trs = self.tb.table2list(table)
            table.extract()
            i = 0
            while i < len(trs) - 1:
                flag_, headers_ = self.find_header(trs[i])
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    # Collect data rows until the next header row or a row
                    # whose column count differs from the header's.
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) != len(trs[i]):
                            print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                        flag_, headers_ = self.find_header(trs[j])
                        if flag_:
                            break
                        table_items.append(trs[j])
                    if len(table_items) > 1:
                        df = pd.DataFrame(table_items)
                        prem_, candidate_set_ = self.extract_from_df(df, headers)
                        rs_dic.update(prem_)
                        candidate_set.update(candidate_set_)
                    i = j - 1  # resume at the row that ended the scan
                i += 1
        return rs_dic, candidate_set

    def get_candidates_from_text(self, list_sentences, list_entitys):
        """Collect org/company entities that directly follow a candidate label.

        :param list_sentences: its first element is the list of sentence
            objects (``sentence_index``, ``sentence_text``).
        :param list_entitys: its first element is the list of entity objects
            (``entity_type``, ``sentence_index``, ``wordOffset_begin``,
            ``wordOffset_end``, ``entity_text``).
        :return: set of candidate names.
        """
        candidates = set()
        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
        for ent in list_entitys[0]:
            if ent.entity_type not in ['org', 'company']:
                continue
            text = sentences[ent.sentence_index].sentence_text
            b = ent.wordOffset_begin
            e = ent.wordOffset_end
            if isinstance(b, int) and isinstance(e, int):
                # Up to 10 characters before the entity must end with the label.
                foreword = text[max(0, b - 10):b]
                if re.search(self.p, foreword):
                    candidates.add(ent.entity_text)
        return candidates

    def predict(self, html, list_sentences, list_entitys):
        """Extract ranked tenderers and candidates from an announcement.

        The main body is tried first, then the attachment block
        (``div.richTextFetch``), and finally the running text.

        :return: ``(prem, {'candidate': 'name1,name2,...'})``; names are
            joined in sorted order so the result is reproducible.
        """
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        if richText:
            richText = richText.extract()  # keep attachments out of the main pass
        prem, candidate_set = self.get_prem(soup)
        if prem == {} and richText:
            # NOTE(review): this replaces candidates found in the main body
            # even when the attachment yields none - confirm this is intended.
            prem, candidate_set = self.get_prem(richText)
        if prem == {} and candidate_set == set():
            candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
        return prem, {'candidate': ','.join(sorted(candidate_set))}
|
|
|
+
|
|
|
|
|
|
def getSavedModel():
|
|
|
#predictor = FormPredictor()
|