|
@@ -681,7 +681,9 @@ class PREMPredict():
|
|
|
label = np.argmax(predict_y[i])
|
|
|
values = predict_y[i]
|
|
|
text = text_list[i]
|
|
|
- if label == 2:
|
|
|
+ if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
|
+ label = 5
|
|
|
+ elif label == 2:
|
|
|
if re.search('中标单位和.{,25}签订合同', text):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
@@ -738,12 +740,13 @@ class PREMPredict():
|
|
|
label = np.argmax(predict_y[i])
|
|
|
values = predict_y[i]
|
|
|
text = text_list[i]
|
|
|
- if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
|
|
|
+ if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
|
|
|
+ label = 2
|
|
|
+ elif label == 1 and re.search('[::,。](总金额|总价|单价)', text):
|
|
|
values[label] = 0.49
|
|
|
elif label ==0 and entity.notes in ["投资", "工程造价"]:
|
|
|
values[label] = 0.49
|
|
|
elif label == 0 and re.search('最低限价', text):
|
|
|
- label = 2
|
|
|
values[label] = 0.49
|
|
|
entity.set_Money(label, values)
|
|
|
|
|
@@ -1202,7 +1205,7 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额")
|
|
|
+ self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
|
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
@@ -1400,6 +1403,10 @@ class RoleRulePredictor():
|
|
|
_span[0]) is None:
|
|
|
p_entity.values[1] = 0.8 + p_entity.values[1] / 10
|
|
|
p_entity.label = 1
|
|
|
+ elif re.search('(预算金额|最高(投标)?上?限[价额]?格?|招标控制价))?:?([\d.,]+万?元[,(]其中)?(第?[一二三四五0-9](标[段|包]|[分子]包):?[\d.,]+万?元,)*第?[一二三四五0-9](标[段|包]|[分子]包):?$'
|
|
|
+ , _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
|
|
|
+ p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
|
+ p_entity.label = 0
|
|
|
|
|
|
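# Extend tenderee-amount detection: when a tenderee amount is followed by consecutive unrecognized amounts and all of them can be matched to package (标段) information, label those unrecognized amounts as tenderee amounts too.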
|
|
|
list_p = []
|
|
@@ -2976,7 +2983,8 @@ class DocChannel():
|
|
|
'公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
|
|
|
'公告变更neg': '履约变更内容',
|
|
|
'候选人公示': '候选人公示|评标结果公示|中标候选人名单公示',
|
|
|
- '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
|
+ '候选人公示neg': '中标候选人公示期',
|
|
|
+ '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
|
'中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
|
|
|
'中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
|
|
|
'中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家',
|
|
@@ -4248,7 +4256,7 @@ class DistrictPredictor():
|
|
|
except Exception as e:
|
|
|
print('解析prem 获取招标人、及地址出错')
|
|
|
return tenderee, tenderee_address
|
|
|
- def get_area(text, web_source_name):
|
|
|
+ def get_area(text, web_source_name, not_in_content=True):
|
|
|
score_l = []
|
|
|
id_set = set()
|
|
|
|
|
@@ -4297,14 +4305,17 @@ class DistrictPredictor():
|
|
|
w = self.dist_dic[_id]['权重']
|
|
|
score = w * 0.2
|
|
|
score_l.append([_id, score] + area)
|
|
|
- area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知'}
|
|
|
+ area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
|
if len(score_l) == 0:
|
|
|
return {'district': area_dic}
|
|
|
else:
|
|
|
df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
|
|
|
+ df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
|
|
|
+ # print('地区评分:')
|
|
|
+ # print(df)
|
|
|
df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
|
|
|
pro_id = df_pro.index[0]
|
|
|
- if df_pro.loc[pro_id, 'score'] < 0.1: # 省级评分小于0.1的不要
|
|
|
+ if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content: # 不是二次全文匹配的 省级评分小于0.1的不要
|
|
|
# print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
|
|
|
return {'district': area_dic}
|
|
|
area_dic['province'] = self.dist_dic[pro_id]['地区']
|
|
@@ -4325,15 +4336,32 @@ class DistrictPredictor():
|
|
|
return {'district': area_dic}
|
|
|
|
|
|
tenderee, tenderee_address = get_ree_addr(prem)
|
|
|
- project_name = str(project_name).replace(str(tenderee), '')
|
|
|
- text1 = "{} {} {}".format(project_name, tenderee, tenderee_address)
|
|
|
+ project_name = str(project_name)
|
|
|
+ tenderee = str(tenderee)
|
|
|
+
|
|
|
+ if '##attachment##' in list_articles[0].content:
|
|
|
+ content, attachment = list_articles[0].content.split('##attachment##')
|
|
|
+ if len(content) < 200:
|
|
|
+ content += attachment
|
|
|
+ else:
|
|
|
+ content = list_articles[0].content
|
|
|
+
|
|
|
+ project_name = project_name + title if project_name not in title else project_name
|
|
|
+ project_name = project_name.replace(tenderee, '')
|
|
|
+ text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
|
|
|
+ ser = re.search('项目所在地区?:(\w{2,8}[省市区县])+', content)
|
|
|
+ if ser:
|
|
|
+ text1 = ser.group(0)
|
|
|
+
|
|
|
web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
|
text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) #预防提取错 合肥 路南 新会 等地区
|
|
|
rs = get_area(text1, web_source_name)
|
|
|
+
|
|
|
if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
|
|
|
- text2 = title + list_articles[0].content if len(list_articles[0].content)<2000 else title + list_articles[0].content[:1000] + list_articles[0].content[-1000:]
|
|
|
+ text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
|
|
|
text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
|
|
|
- rs2 = get_area(text2, web_source_name)
|
|
|
+ rs2 = get_area(text2, web_source_name, not_in_content=False)
|
|
|
+ rs2['district']['is_in_text'] = True
|
|
|
if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
|
|
|
rs = rs2
|
|
|
elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
|
|
@@ -4342,7 +4370,7 @@ class DistrictPredictor():
|
|
|
|
|
|
class TableTag2List():
|
|
|
'''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
|
|
|
- def table2list(self, table):
|
|
|
+ def table2list(self, table, text_process=None):
|
|
|
self._output = []
|
|
|
row_ind = 0
|
|
|
col_ind = 0
|
|
@@ -4351,17 +4379,21 @@ class TableTag2List():
|
|
|
# we should skip
|
|
|
smallest_row_span = 1
|
|
|
|
|
|
+ if len(row.find_all(['td', 'th'], recursive=False)) > 20:
|
|
|
+ log('未补全前表格列数大于20的不做表格处理')
|
|
|
+ return []
|
|
|
+
|
|
|
for cell in row.children:
|
|
|
if cell.name in ('td', 'th'):
|
|
|
# check multiple rows
|
|
|
# pdb.set_trace()
|
|
|
- row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and re.search('[0-9]', cell.get('rowspan')) else 1
|
|
|
+ row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and cell.get('rowspan').isdigit() else 1
|
|
|
|
|
|
# try updating smallest_row_span
|
|
|
smallest_row_span = min(smallest_row_span, row_span)
|
|
|
|
|
|
# check multiple columns
|
|
|
- col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and re.search('[0-9]', cell.get('colspan')) else 1
|
|
|
+ col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and cell.get('colspan').isdigit() else 1
|
|
|
|
|
|
# find the right index
|
|
|
while True:
|
|
@@ -4371,15 +4403,19 @@ class TableTag2List():
|
|
|
|
|
|
# insert into self._output
|
|
|
try:
|
|
|
- text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
|
|
|
- text = re.sub('\s', '', text)[:200] # 只需取前200字即可
|
|
|
+ if text_process != None:
|
|
|
+ text = [re.sub('\xa0','',text_process(cell,final=False)),0]
|
|
|
+ else:
|
|
|
+ text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
|
|
|
+ text = re.sub('\s', '', text)[:200] # 只需取前200字即可
|
|
|
+ text = ' ' if text == "" else text
|
|
|
self._insert(row_ind, col_ind, row_span, col_span, text)
|
|
|
except UnicodeEncodeError:
|
|
|
raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )
|
|
|
|
|
|
# update col_ind
|
|
|
col_ind += col_span
|
|
|
- if col_ind > 50: # 表格列数大于50的去掉
|
|
|
+ if col_ind > 50 and text_process == None: # 表格要素提取及候选人提取的 表格列数大于50的去掉
|
|
|
return []
|
|
|
|
|
|
# update row_ind
|
|
@@ -4401,7 +4437,7 @@ class TableTag2List():
|
|
|
return True
|
|
|
if j >= len(self._output[i]):
|
|
|
return True
|
|
|
- if self._output[i][j] is None:
|
|
|
+ if self._output[i][j] == "":
|
|
|
return True
|
|
|
return False
|
|
|
|
|
@@ -4427,7 +4463,7 @@ class TablePremExtractor(object):
|
|
|
self.head_rule_dic = {
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
- "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
|
|
|
+ "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品)(名称?|内容)",
|
|
|
"win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
@@ -4444,7 +4480,7 @@ class TablePremExtractor(object):
|
|
|
def find_header(self, td_list):
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
- if len(set(td_list))>2 and len(set(td_list) & self.headerset)/len(set(td_list))>0.6:
|
|
|
+ if len(set(td_list))>2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
|
|
|
flag = True
|
|
|
for i in range(len(td_list)) :
|
|
|
text = td_list[i]
|
|
@@ -4462,6 +4498,19 @@ class TablePremExtractor(object):
|
|
|
if num>1:
|
|
|
print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
return flag, dict()
|
|
|
+ if re.search(';金额(万?元);', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
|
+ if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
|
+ for i in range(len(td_list)):
|
|
|
+ text = td_list[i]
|
|
|
+ if re.search('^金额(万?元)$',text):
|
|
|
+ header_dic['bid_amount'] = (i, text)
|
|
|
+ break
|
|
|
+ elif 'tenderee' in header_dic and 'budget' not in header_dic:
|
|
|
+ for i in range(len(td_list)):
|
|
|
+ text = td_list[i]
|
|
|
+ if re.search('^金额(万?元)$', text):
|
|
|
+ header_dic['budget'] = (i, text)
|
|
|
+ break
|
|
|
if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
|
'budget' in header_dic or 'tenderer' in header_dic):
|
|
|
return flag, header_dic
|
|
@@ -4610,6 +4659,7 @@ class TablePremExtractor(object):
|
|
|
def get_prem(self, soup):
|
|
|
tables = soup.find_all('table')
|
|
|
tables.reverse()
|
|
|
+
|
|
|
rs_dic = {}
|
|
|
for table in tables:
|
|
|
trs = self.tb.table2list(table)
|
|
@@ -4631,7 +4681,7 @@ class TablePremExtractor(object):
|
|
|
else:
|
|
|
print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
|
|
|
break
|
|
|
- if len(table_items) > 1:
|
|
|
+ if len(table_items) > 0:
|
|
|
df = pd.DataFrame(table_items)
|
|
|
prem_ = self.extract_from_df(df, headers)
|
|
|
rs_dic.update(prem_)
|
|
@@ -4647,6 +4697,10 @@ class TablePremExtractor(object):
|
|
|
prem = self.get_prem(soup)
|
|
|
if prem == {} and richText:
|
|
|
prem = self.get_prem(richText)
|
|
|
+ if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
|
+ k = list(prem)[0]
|
|
|
+ if k == '1' or len(k) > 2:
|
|
|
+ prem['Project'] = prem.pop(k)
|
|
|
return prem
|
|
|
|
|
|
class CandidateExtractor(object):
|