|
@@ -45,12 +45,44 @@ file = os.path.dirname(__file__) + '/agency_set.pkl'
|
|
with open(file, 'rb') as f:
|
|
with open(file, 'rb') as f:
|
|
agency_set = pickle.load(f)
|
|
agency_set = pickle.load(f)
|
|
|
|
|
|
|
|
# Load ``header_set`` from the pickle shipped next to this module; it is
# consulted by get_header_line to force matching cells to the header label.
# The path is resolved from the absolute module location so that it does not
# collapse to '/header_set.pkl' when __file__ has no directory component.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe while header_set.pkl is a trusted, repository-controlled artifact.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'header_set.pkl'), 'rb') as f:
    header_set = pickle.load(f)
|
|
|
|
+
|
|
def is_agency(entity_text):
    """Tell whether ``entity_text`` names a tendering/procurement agency.

    The text counts as an agency when it matches the keyword pattern below
    (bidding / procurement / agency / consulting ... companies, centres or
    service offices, or a name ending in "法院") or when it is a member of
    the module-level ``agency_set``.

    :param entity_text: entity name to classify
    :return: True if the name is considered an agency, otherwise False
    """
    # Raw strings keep ``\w`` from being parsed as a (invalid) string escape.
    agency_pattern = (
        r'(招投?标|采购|代理|咨询|管理|物资|事务所?|顾问|监理|拍卖)[()\w]{,4}(有限)?(责任)?公司'
        r'|(采购|招投?标|交易|代理|咨询)[()\w]{,4}(中心|服务所)|法院$'
    )
    # The set lookup is only evaluated when the pattern does not match.
    return bool(re.search(agency_pattern, entity_text)) or entity_text in agency_set
|
|
|
|
|
|
|
|
def get_role(text, nlp_enterprise):
    """Extract the role entity (organisation name) contained in ``text``.

    The text is cleaned first (consortium markers become a comma; whitespace,
    lot / ranking prefixes and phone numbers are removed).  If the cleaned
    text is itself one of the known entities it is returned directly;
    otherwise named-entity recognition (``getNers``) is run on it and the
    first recognised organisation is returned, provided the recognised
    entities cover more than 80% of the cleaned text.

    :param text: string that may contain a role entity
    :param nlp_enterprise: role entities already recognised in the announcement
    :return: the entity name, or '' when none can be extracted
    """
    # Consortium / membership markers are turned into a separator.
    text = re.sub(
        r'联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]',
        ',', text)
    # Fix for 370835008: table cells may contain "\n" inside an entity name.
    text = re.sub(r'\s', '', text)
    # 2024/4/22, fix for 372839375: strip prefixes such as "三标段:".
    text = re.sub(r'[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '', text)
    # 2024/4/23: remove phone numbers.
    text = re.sub(r'1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text)

    if text in nlp_enterprise:
        return text
    # Too long or too short to be a single organisation name.
    if len(text) > 50 or len(text) < 4:
        return ''

    ners = getNers([text], useselffool=True)
    roles = []
    if ners:
        for ner in ners[0]:
            if ner[2] in ['org', 'company']:
                roles.append(ner[3])
            # Locations ending in these suffixes are accepted as organisations.
            elif ner[2] in ['location'] and re.search(
                    r'^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
                roles.append(ner[3])

    # Only trust the NER result when it covers most of the cleaned text.
    if roles and len(''.join(roles)) > len(text) * 0.8:
        return roles[0]
    return ''
|
|
|
|
+
|
|
from threading import RLock
|
|
from threading import RLock
|
|
dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
"prem":{"predictor":None,"Lock":RLock()},
|
|
"prem":{"predictor":None,"Lock":RLock()},
|
|
@@ -76,7 +108,9 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
'project_label': {"predictor": None, "Lock": RLock()},
|
|
'project_label': {"predictor": None, "Lock": RLock()},
|
|
'pb_extract': {"predictor": None, "Lock": RLock()},
|
|
'pb_extract': {"predictor": None, "Lock": RLock()},
|
|
'property_label': {"predictor": None, "Lock": RLock()},
|
|
'property_label': {"predictor": None, "Lock": RLock()},
|
|
- 'approval': {"predictor": None, "Lock": RLock()} # 审批项目预测
|
|
|
|
|
|
+ 'approval': {"predictor": None, "Lock": RLock()}, # 审批项目预测
|
|
|
|
+ 'bid_score': {"predictor": None, "Lock": RLock()}, # 评标评分
|
|
|
|
+ 'entity_type_rule': {"predictor": None, "Lock": RLock()}, # 地址、时间分类
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -134,6 +168,10 @@ def getPredictor(_type):
|
|
dict_predictor[_type]['predictor'] = PropertyLabel()
|
|
dict_predictor[_type]['predictor'] = PropertyLabel()
|
|
if _type == 'approval':
|
|
if _type == 'approval':
|
|
dict_predictor[_type]['predictor'] = ApprovalPredictor()
|
|
dict_predictor[_type]['predictor'] = ApprovalPredictor()
|
|
|
|
+ if _type == 'bid_score':
|
|
|
|
+ dict_predictor[_type]['predictor'] = BiddingScore()
|
|
|
|
+ if _type == 'entity_type_rule':
|
|
|
|
+ dict_predictor[_type]['predictor'] = EntityTypeRulePredictor()
|
|
return dict_predictor[_type]["predictor"]
|
|
return dict_predictor[_type]["predictor"]
|
|
raise NameError("no this type of predictor")
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
|
@@ -988,9 +1026,9 @@ class PREMPredict():
|
|
# elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
|
|
# elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
|
|
# label = 2
|
|
# label = 2
|
|
elif label ==0: # 错误招标金额处理
|
|
elif label ==0: # 错误招标金额处理
|
|
- if re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
|
|
|
|
|
|
+ if entity.notes in ["投资", "总投资","工程造价"] or re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
|
|
values[label] = 0.51
|
|
values[label] = 0.51
|
|
- elif entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$|注册资本', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
|
|
|
|
|
|
+ elif re.search('最低限价:?$|注册资本', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
label = 2
|
|
label = 2
|
|
elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+|(含)', behind):
|
|
elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+|(含)', behind):
|
|
@@ -999,6 +1037,9 @@ class PREMPredict():
|
|
# values[label] = 0.49
|
|
# values[label] = 0.49
|
|
# elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
|
|
# elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
|
|
# label = 2
|
|
# label = 2
|
|
|
|
+ elif re.search('招标金额|限价|预算|控制价|拦标价', front) == None and re.search('预计约?为?$',
|
|
|
|
+ front): # 20241206纠正 565894149(预计约2500元)预测为预算
|
|
|
|
+ label = 2
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
label = 1
|
|
label = 1
|
|
values[label] = 0.8
|
|
values[label] = 0.8
|
|
@@ -6785,35 +6826,6 @@ class TablePremExtractor(object):
|
|
contain_header = True
|
|
contain_header = True
|
|
return flag, contain_header, dict(), not_sure_winner
|
|
return flag, contain_header, dict(), not_sure_winner
|
|
|
|
|
|
- def get_role(self, text, nlp_enterprise):
|
|
|
|
- '''
|
|
|
|
- 获取字符串text角色实体
|
|
|
|
- :param text: 待获取实体字符串
|
|
|
|
- :param nlp_enterprise: 公告中的角色实体列表
|
|
|
|
- :return:
|
|
|
|
- '''
|
|
|
|
- text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
|
- , ',', text)
|
|
|
|
- text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
|
- text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
|
- text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
|
|
- if text in nlp_enterprise:
|
|
|
|
- return text
|
|
|
|
- if len(text) > 50 or len(text)<4:
|
|
|
|
- return ''
|
|
|
|
- ners = getNers([text], useselffool=True)
|
|
|
|
- roles = []
|
|
|
|
- if ners:
|
|
|
|
- for ner in ners[0]:
|
|
|
|
- if ner[2] in ['org', 'company']:
|
|
|
|
- roles.append(ner[3])
|
|
|
|
- elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
|
|
|
|
- roles.append(ner[3])
|
|
|
|
- if roles and len(''.join(roles)) > len(text)*0.8:
|
|
|
|
- return roles[0]
|
|
|
|
- else:
|
|
|
|
- return ''
|
|
|
|
-
|
|
|
|
def extract_from_df(self, df, headers, web_source_name, all_winner=False):
|
|
def extract_from_df(self, df, headers, web_source_name, all_winner=False):
|
|
prem_dic = {}
|
|
prem_dic = {}
|
|
previous_package = "" # 上一行包号
|
|
previous_package = "" # 上一行包号
|
|
@@ -6890,8 +6902,8 @@ class TablePremExtractor(object):
|
|
if len(pk_l) == 1:
|
|
if len(pk_l) == 1:
|
|
package = uniform_package_name(pk_l[0].group(0))
|
|
package = uniform_package_name(pk_l[0].group(0))
|
|
|
|
|
|
- tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
|
|
|
|
- tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
|
|
|
|
|
|
+ tenderee = get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
|
|
|
|
+ tenderer = get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
|
|
tenderee = cut_repeat_name(tenderee)
|
|
tenderee = cut_repeat_name(tenderee)
|
|
tenderer = cut_repeat_name(tenderer)
|
|
tenderer = cut_repeat_name(tenderer)
|
|
|
|
|
|
@@ -7695,10 +7707,13 @@ def get_header_line(list_item):
|
|
x.append(getPredictor("form").encode(item))
|
|
x.append(getPredictor("form").encode(item))
|
|
predict_y = getPredictor("form").predict(np.array(x), type="item")
|
|
predict_y = getPredictor("form").predict(np.array(x), type="item")
|
|
for item, values in zip(list_item, list(predict_y)):
|
|
for item, values in zip(list_item, list(predict_y)):
|
|
|
|
+ item = str(item)
|
|
lb = 1 if values[1] > 0.5 else 0
|
|
lb = 1 if values[1] > 0.5 else 0
|
|
- if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准']:
|
|
|
|
|
|
+ if item in ['许可/同意', '办结(通过)', '办结(准予许可)','批准', '合格']:
|
|
lb = 0
|
|
lb = 0
|
|
- elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺']:
|
|
|
|
|
|
+ elif item in ['环境影响评价机构', '建设单位或地方政府作出的相关环保承诺'] or re.search('^比例\d{1,2}%$', item):
|
|
|
|
+ lb = 1
|
|
|
|
+ elif lb == 0 and item in header_set:
|
|
lb = 1
|
|
lb = 1
|
|
rs.append(lb)
|
|
rs.append(lb)
|
|
return rs
|
|
return rs
|
|
@@ -7976,6 +7991,267 @@ class ApprovalPredictor():
|
|
return [rs_dic]
|
|
return [rs_dic]
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
class BiddingScore():
    """Rule-based extractor of per-bidder evaluation scores from HTML tables.

    ``predict`` walks every ``<table>`` of an announcement, detects header
    rows / columns, maps header cells to field names with ``head_rule_dic``
    and returns one dict per bidder.
    """

    def __init__(self):
        # Field name -> regex recognising the header cell text of that field.
        self.head_rule_dic = {
            # Added for 368295593: also extract "投标个人/单位".
            "tenderer": r"((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位",
            "score_price": r"(价格|报价|单价|总价|经济)(部分|\w{,2})?([得评]分|评审)",
            "score_technical": r"技术(部分|\w{,2})?标?([得评]分|评审)",
            "score_commercial": r"商务(部分|\w{,2})?标?([得评]分|评审)",
            "score_integrity": r"诚信(部分|\w{,2})?([得评]分|评审)",
            "score_comprehensive": r"(综合(标|评估)?|总|最终)得?分$",
            "ranking": r"(得分)?排名",
            "qualification_review": r"资格性审查|是否通过资格",
            "compliance_review": r"符合性审查|是否通过符合",
        }
        self.tb = TableTag2List()

    @staticmethod
    def _looks_like_header(flags):
        """Decide whether one row/column of 0/1 header flags is a header line.

        A line is a header when more than 80% of its cells are flagged, or,
        for lines longer than 3 cells, when the flag string contains more
        "11" pairs than "10" pairs.
        """
        if sum(flags) / len(flags) > 0.8:
            return True
        if len(flags) > 3:
            bits = ''.join(str(it) for it in flags)
            return len(re.findall('11', bits)) > len(re.findall('10', bits))
        return False

    @staticmethod
    def _get_header_index(datas):
        """Find which rows and columns of a table are header lines.

        :param datas: per-cell header flags, e.g. [[1, 1, 1, 1], [0, 0, 0, 0]]
        :return: (header_row, header_col), each a list of
                 (index, flagged-cell ratio) tuples
        """
        header_row = []
        header_col = []
        df_h = pd.DataFrame(datas)
        for i in df_h.index:
            line = df_h.loc[i].values
            if BiddingScore._looks_like_header(line):
                header_row.append((i, sum(line) / len(line)))
        for i in df_h.columns:
            col = df_h[i].values
            # The "11 vs 10" test must look at this column; the previous code
            # inspected the last row left over from the loop above.
            if BiddingScore._looks_like_header(col):
                header_col.append((i, sum(col) / len(col)))
        return header_row, header_col

    @staticmethod
    def _get_header(l, head_rule_dic):
        """Map field names to the position of the first matching header cell.

        :param l: header cell texts
        :param head_rule_dic: field name -> regex
        :return: dict of field name -> index in ``l``
        """
        header_dic = {}
        for i, text in enumerate(l):
            for k, v in head_rule_dic.items():
                # The first cell matching a field wins.
                if k not in header_dic and re.search(v, text):
                    header_dic[k] = i
        return header_dic

    @staticmethod
    def _get_score(text):
        """Return the stripped cell text if it looks like a score, else ''."""
        text = text.strip()
        # One or two leading digits, optional two decimals, then any run of
        # digits / separators (this also covers the plain "dd.dd" form).
        if re.search(r'^\d{1,2}(\.\d{2})?[\d,,;\.]*$', text):
            return text
        return ''

    def _iter_header_groups(self, table, headers):
        """Yield (header_dic, range_from, range_to) for each header group.

        ``table`` is read row-wise.  Adjacent header rows are combined: when
        they share a cell value their field maps are merged, otherwise the
        later row replaces the earlier one.  The data range of a group runs
        up to the next non-adjacent header row or the end of the table.
        """
        total = len(table)
        pos = 0
        while pos < len(headers):
            idx = headers[pos][0]
            if idx + 1 >= total:
                break
            header_dic = self._get_header(table.loc[idx].values, self.head_rule_dic)
            pos += 1
            range_from = idx + 1
            range_to = total
            if pos < len(headers):
                next_pos = pos
                for j in range(pos, len(headers)):
                    idx2 = headers[j][0]
                    if idx2 - idx == 1:
                        header_dic2 = self._get_header(table.loc[idx2].values, self.head_rule_dic)
                        if set(table.loc[idx].values) & set(table.loc[idx2].values):
                            header_dic.update(header_dic2)
                        else:
                            header_dic = header_dic2
                        range_from = idx2 + 1
                        range_to = total
                        next_pos = j + 1
                        idx = idx2
                    else:
                        range_from = idx + 1
                        range_to = idx2
                        next_pos = j
                        break
                pos = next_pos
            yield header_dic, range_from, range_to

    def _collect_records(self, table, header_dic, range_from, range_to,
                         nlp_enterprise, min_fields, result_l):
        """Append one record per data row of ``table`` to ``result_l``.

        A record is kept when it has at least ``min_fields`` non-empty
        fields, contains a tenderer and is not already in ``result_l``.
        """
        for index in range(range_from, range_to):
            record = {}
            for k, v in header_dic.items():
                cell = table.loc[index, v]
                if k.startswith('score'):
                    content = self._get_score(cell)
                elif k == 'tenderer':
                    content = get_role(cell, nlp_enterprise)
                elif k == 'ranking':
                    content = cell if re.search(r'^第?[\d一二三四五六七八九十]+名?$', cell) else ''
                else:
                    content = cell
                if content != '':
                    record[k] = content
            if len(record) >= min_fields and 'tenderer' in record and record not in result_l:
                result_l.append(record)

    def get_table_info(self, df, nlp_enterprise):
        """Extract bidder score records from one table.

        :param df: table cells as a DataFrame; rows and columns are addressed
                   by position, so default integer labels are assumed
        :param nlp_enterprise: role entities already recognised in the announcement
        :return: list of dicts, each containing 'tenderer' plus other fields
        """
        result_l = []
        datas = [get_header_line(df.loc[i].values) for i in df.index]
        header_row, header_col = self._get_header_index(datas)
        # A lone header column located beyond index 1 is not a column header.
        if len(header_col) == 1 and header_col[0][0] > 1:
            header_col = []

        if len(header_row) >= 1 and len(header_col) == 0:  # row headers only
            for header_dic, range_from, range_to in self._iter_header_groups(df, header_row):
                if len(header_dic) >= 2 and 'tenderer' in header_dic:
                    self._collect_records(df, header_dic, range_from, range_to,
                                          nlp_enterprise, 2, result_l)
        elif len(header_row) == 0 and len(header_col) >= 1:  # column headers only
            # Same procedure on the transposed table, with stricter thresholds.
            table = df.T
            wanted = {'tenderer', 'score_technical', 'score_commercial',
                      'score_price', 'score_comprehensive'}
            for header_dic, range_from, range_to in self._iter_header_groups(table, header_col):
                if len(header_dic.keys() & wanted) >= 2 and 'tenderer' in header_dic:
                    self._collect_records(table, header_dic, range_from, range_to,
                                          nlp_enterprise, 3, result_l)
        # Tables with both a header row and a header column are not handled.
        return result_l

    def predict(self, html, nlp_enterprise=None):
        """Extract bidder score records from all tables of an announcement.

        :param html: announcement HTML
        :param nlp_enterprise: role entities already recognised in the
                               announcement (defaults to an empty list)
        :return: list of record dicts, one per tenderer; when a tenderer
                 appears several times the record with the most fields wins
        """
        if nlp_enterprise is None:
            nlp_enterprise = []
        html = re.sub("<html>|</html>|<body>|</body>", "", html)
        html = re.sub("##attachment##", "", html)
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        self.nlp_enterprise = nlp_enterprise
        if richText:
            richText = richText.extract()  # detach the attachment section
        tables = soup.find_all('table')
        # Fall back to the attachment tables only when the body has none.
        if len(tables) == 0 and richText:
            tables = richText.find_all('table')
        tables.reverse()
        rs_dic = {}
        for table in tables:
            trs = self.tb.table2list(table)
            # Only rectangular tables with at least 2 rows and 2 columns.
            if len(trs) > 1 and len(trs[0]) > 1 and len({len(tr) for tr in trs}) == 1:
                df = pd.DataFrame(trs)
                for d in self.get_table_info(df, nlp_enterprise):
                    name = d['tenderer']
                    if name not in rs_dic or len(d) > len(rs_dic[name]):
                        rs_dic[name] = d
            # Detach the processed table from the tree before the next one.
            table.extract()
        return list(rs_dic.values())
|
|
|
|
+
|
|
|
|
class EntityTypeRulePredictor():
    """Rule-based classifier for address, time and code entities.

    Each pattern describes the label text expected directly in front of an
    entity (e.g. "开标地点:"); ``predict`` uses them to assign entities to
    address / planned-time / investment-code slots.
    """

    def __init__(self):
        # The stray ")" after the keyword group used to be an unbalanced
        # parenthesis (re.error at first use); it is now an optional literal
        # close-paren so labels such as "...(开启)地点:" still match.
        # NOTE(review): confirm this is the intended meaning of that ")".
        self.pattern_addr_bidopen = r'([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)[))]?(会议)?地[点址]([((]网址[))])?[:为]'
        self.pattern_addr_bidsend = r'((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([((]网址[))])?[:为]'
        # Same unbalanced ")" fix as pattern_addr_bidopen.
        self.pattern_addr_delivery = r'(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?[))]?地[点址][:为]'
        self.pattern_addr_project = r'(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[:为]|项目位于'
        self.pattern_time_planned = r'(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
        self.pattern_code_investment = r'投资(审批)?项目[编代]码[:为]'

    def predict(self, list_entitys, list_sentences, list_articles):
        """Classify entities of the first document by the text preceding them.

        :param list_entitys: per-document entity lists; only ``[0]`` is read.
            Entities need ``entity_type``, ``wordOffset_begin``,
            ``sentence_index`` and ``entity_text``.
        :param list_sentences: per-document sentence lists, indexed by an
            entity's ``sentence_index``; each needs ``sentence_text``.
        :param list_articles: per-document articles; ``[0].content`` is
            searched as a fallback.
        :return: (addr_dic, time_dic, code_investment)
        """
        # (result key, label pattern, sanity check for the fallback match),
        # listed in the priority order used for location entities.
        addr_rules = (
            ('addr_bidopen', self.pattern_addr_bidopen, r'\w{2,5}[省市区]|\d号|采购网|http'),
            ('addr_bidsend', self.pattern_addr_bidsend, r'\w{2,5}[省市区]|\d号|采购网|http'),
            ('addr_delivery', self.pattern_addr_delivery, r'\w{2,5}[省市区]|\d号'),
            ('addr_project', self.pattern_addr_project, r'\w{2,5}[省市区]|\d号'),
        )
        addr_dic = {}
        time_dic = {}
        code_investment = ''

        for entity in list_entitys[0]:
            kind = entity.entity_type
            if kind not in ('location', 'time', 'code'):
                continue
            b = entity.wordOffset_begin
            sentence_text = list_sentences[0][entity.sentence_index].sentence_text
            if kind == 'location':
                # Look at up to 10 characters in front of the entity.
                front = sentence_text[max(0, b - 10): b]
                for key, pattern, _ in addr_rules:
                    if re.search(pattern, front):
                        addr_dic[key] = entity.entity_text
                        break
            elif kind == 'time':
                if re.search(self.pattern_time_planned, sentence_text[max(0, b - 12): b]):
                    time_dic['time_planned'] = entity.entity_text
            else:  # 'code'
                # Only the first matching investment code is kept.
                if code_investment == '' and re.search(self.pattern_code_investment,
                                                       sentence_text[max(0, b - 12): b]):
                    code_investment = entity.entity_text

        # Fallback on the raw article text: take the span following the label
        # when it looks like an address and contains the entity-based value
        # (or when no entity-based value exists).
        content = list_articles[0].content
        for key, pattern, sanity in addr_rules:
            ser = re.search(r'(%s)(?P<addr>[\w()-]{5,100})[,。]' % pattern, content)
            if ser and re.search(sanity, ser.group('addr')) and addr_dic.get(key, '') in ser.group('addr'):
                addr_dic[key] = ser.group('addr')
        if code_investment == '':
            ser = re.search(r'(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]' % self.pattern_code_investment, content)
            if ser:
                code_investment = ser.group('code')

        return addr_dic, time_dic, code_investment
|
|
|
|
+
|
|
def getSavedModel():
|
|
def getSavedModel():
|
|
#predictor = FormPredictor()
|
|
#predictor = FormPredictor()
|
|
graph = tf.Graph()
|
|
graph = tf.Graph()
|
|
@@ -8336,12 +8612,16 @@ if __name__=="__main__":
|
|
title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
|
|
title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
html = f.read()
|
|
html = f.read()
|
|
- tb_extract = TablePremExtractor()
|
|
|
|
- rs = tb_extract.predict(html, [
|
|
|
|
- "江苏中联铸本混凝土有限公司",
|
|
|
|
- "鼓楼区协荣机械设备经销部"
|
|
|
|
- ], web_source_name = '', all_winner=False)
|
|
|
|
- print('标段数:',len(rs[0]))
|
|
|
|
|
|
+ # tb_extract = TablePremExtractor()
|
|
|
|
+ # rs = tb_extract.predict(html, [
|
|
|
|
+ # "江苏中联铸本混凝土有限公司",
|
|
|
|
+ # "鼓楼区协荣机械设备经销部"
|
|
|
|
+ # ], web_source_name = '', all_winner=False)
|
|
|
|
+ # print('标段数:',len(rs[0]))
|
|
|
|
+ # print(rs)
|
|
|
|
+ bdscore = BiddingScore()
|
|
|
|
+ rs = bdscore.predict(html)
|
|
|
|
+ print(type(rs), len(rs))
|
|
print(rs)
|
|
print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|