|
@@ -590,7 +590,11 @@ class PREMPredict():
|
|
|
while(p_sentences<len(list_sentence)):
|
|
|
sentence = list_sentence[p_sentences]
|
|
|
if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
|
|
|
- text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
|
|
|
+ # text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
|
|
|
+ text_sen = sentence.sentence_text
|
|
|
+ b = entity.wordOffset_begin
|
|
|
+ e = entity.wordOffset_end
|
|
|
+ text_list.append((text_sen[max(0, b-13):b], text_sen[b:e], text_sen[e:e+10]))
|
|
|
#item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
|
|
|
item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
|
|
|
data_x.append(item_x)
|
|
@@ -630,7 +634,7 @@ class PREMPredict():
|
|
|
while(p_sentences<len(list_sentence)):
|
|
|
sentence = list_sentence[p_sentences]
|
|
|
if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
|
|
|
- text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
|
|
|
+ text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 13):entity.wordOffset_begin])
|
|
|
#item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
|
|
|
#item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
|
|
|
item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
|
|
@@ -680,33 +684,45 @@ class PREMPredict():
|
|
|
entity = points_entitys[i]
|
|
|
label = np.argmax(predict_y[i])
|
|
|
values = predict_y[i]
|
|
|
- text = text_list[i]
|
|
|
+ # text = text_list[i]
|
|
|
+ text_tup = text_list[i]
|
|
|
+ front, middle, behind = text_tup
|
|
|
+ whole = "".join(text_tup)
|
|
|
if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
|
label = 5
|
|
|
- elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', text):
|
|
|
+ elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
|
label = 5
|
|
|
elif label == 2:
|
|
|
- if re.search('中标单位和.{,25}签订合同', text):
|
|
|
+ if re.search('中标单位和.{,25}签订合同', whole):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
- elif re.search('尊敬的供应商:.{,25}我公司', text):
|
|
|
+ elif re.search('尊敬的供应商:.{,25}我公司', whole):
|
|
|
label = 0
|
|
|
values[label] = 0.801
|
|
|
- elif re.search('尊敬的供应商:', text):
|
|
|
+ elif re.search('尊敬的供应商:$', front):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
- elif re.search('[^\w]中标候选人', text[:15]) and re.search('[1一]', text[:15]) == None: #修复第4以上的预测错为中标人
|
|
|
+ elif re.search('第[4-9四五六]中标候选人', front): #修复第4以上的预测错为中标人
|
|
|
label = 5
|
|
|
values[label] = 0.5
|
|
|
- elif re.search('是否中标:是,供应商', text) and label == 5:
|
|
|
+ elif re.search('是否中标:是,供应商', front) and label == 5:
|
|
|
label = 2
|
|
|
values[label] = 0.9
|
|
|
- elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
|
|
|
+ elif label == 1 and re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
- elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
|
|
|
+ elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', front):
|
|
|
label = 2
|
|
|
values[label] = 0.501
|
|
|
+ elif label in [3,4] and re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
|
|
|
+ label = 2
|
|
|
+ values[label] = 0.7
|
|
|
+ elif label == 3 and re.search('决定选择第二名', front) and re.search('^作为(中标|成交)(人|供应商|单位|公司)', behind):
|
|
|
+ label = 2
|
|
|
+ values[label] = 0.8
|
|
|
+ elif re.search('(中标|成交)通知书[,:]$', front) and re.search('^:', behind) and label != 2:
|
|
|
+ label = 2
|
|
|
+ values[label] = 0.8
|
|
|
entity.set_Role(label, values)
|
|
|
|
|
|
def predict_money(self,list_sentences,list_entitys):
|
|
@@ -742,16 +758,20 @@ class PREMPredict():
|
|
|
label = np.argmax(predict_y[i])
|
|
|
values = predict_y[i]
|
|
|
text = text_list[i]
|
|
|
+ # print('金额: ', entity.entity_text, label, values, text)
|
|
|
if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
|
|
|
label = 2
|
|
|
- elif label == 1 and re.search('[::,。](总金额|总价|单价)', text):
|
|
|
+ elif label == 1 and re.search('[::,。](总金额|总价|单价):?$', text) and re.search('(中标|投标|成交|中价)', text)==None:
|
|
|
values[label] = 0.49
|
|
|
elif label ==0 and entity.notes in ["投资", "工程造价"]:
|
|
|
values[label] = 0.49
|
|
|
- elif label == 0 and re.search('最低限价', text):
|
|
|
+ elif label == 0 and re.search('最低限价:?$', text):
|
|
|
values[label] = 0.49
|
|
|
elif re.search('金额在$', text):
|
|
|
values[label] = 0.49
|
|
|
+ elif re.search('报价:预估不?含税总价[为:]$', text) and (label != 1 or values[label]<0.5):
|
|
|
+ label = 1
|
|
|
+ values[label] = 0.8
|
|
|
entity.set_Money(label, values)
|
|
|
|
|
|
def correct_money_by_rule(self, title, list_entitys, list_articles):
|
|
@@ -1176,7 +1196,7 @@ class RoleRulePredictor():
|
|
|
self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标
|
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
|
- self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
|
+ self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
|
"^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名?称?[))]))"
|
|
|
self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
@@ -1210,7 +1230,7 @@ class RoleRulePredictor():
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
|
|
|
- self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
|
|
|
+ self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价|报酬(含税):")
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
@@ -4631,7 +4651,7 @@ class TablePremExtractor(object):
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
|
|
|
+ "bid_amount": "投标[报总]价|(中标|成交))?([金总]?额|[报均总]价|价[格款]?)|承包价",
|
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -4643,7 +4663,7 @@ class TablePremExtractor(object):
|
|
|
def find_header(self, td_list):
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
- if len(set(td_list))>2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
|
|
|
+ if len(set(td_list))>=2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
|
|
|
flag = True
|
|
|
for i in range(len(td_list)) :
|
|
|
text = td_list[i]
|
|
@@ -4674,8 +4694,8 @@ class TablePremExtractor(object):
|
|
|
if re.search('^金额(万?元)$', text):
|
|
|
header_dic['budget'] = (i, text)
|
|
|
break
|
|
|
- if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
|
- 'budget' in header_dic or 'tenderer' in header_dic):
|
|
|
+ if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
|
|
|
+ 'budget' in header_dic or 'bid_amount' in header_dic):
|
|
|
return flag, header_dic
|
|
|
return flag, dict()
|
|
|
|
|
@@ -4692,6 +4712,20 @@ class TablePremExtractor(object):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
def get_role(self, text):
    """Return the first organisation/company entity found in ``text``, or ''.

    The text is run through ``getNers`` and is accepted as a role only
    when the recognised ``org``/``company`` entities together cover more
    than 80% of its characters.

    :param text: candidate role string; expected to be a ``str``.
    :return: the text of the first recognised org/company entity, or an
        empty string when the input is not a string, is shorter than 4 or
        longer than 25 characters, or is not mostly made of such entities.
    """
    # Reject non-string input (e.g. None or a missing value) up front
    # instead of letting len() raise TypeError.
    if not isinstance(text, str):
        return ''
    # Strings outside the 4..25 character window are never treated as a
    # role, so the NER call is skipped for them.
    if len(text) > 25 or len(text) < 4:
        return ''
    ners = getNers([text], useselffool=True)
    if not ners:
        return ''
    # NOTE(review): each entity is indexed as ner[2] == type and
    # ner[3] == entity text; confirm against getNers' return format.
    roles = [ner[3] for ner in ners[0] if ner[2] in ('org', 'company')]
    # Coverage is measured over all matched entities combined, but only
    # the first one is returned.
    if roles and len(''.join(roles)) > len(text) * 0.8:
        return roles[0]
    return ''
|
|
|
+
|
|
|
def extract_from_df(self, df, headers):
|
|
|
prem_dic = {}
|
|
|
previous_package = "" # 上一行包号
|
|
@@ -4735,8 +4769,11 @@ class TablePremExtractor(object):
|
|
|
if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
|
|
|
tenderer = ""
|
|
|
|
|
|
- tenderee = tenderee if self.is_role(tenderee) else ""
|
|
|
- tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
+ # tenderee = tenderee if self.is_role(tenderee) else ""
|
|
|
+ # tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
+
|
|
|
+ tenderee = self.get_role(tenderee)
|
|
|
+ tenderer = self.get_role(tenderer)
|
|
|
|
|
|
if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
break
|
|
@@ -4833,9 +4870,10 @@ class TablePremExtractor(object):
|
|
|
rs_dic = {}
|
|
|
for table in tables:
|
|
|
trs = self.tb.table2list(table)
|
|
|
- table.extract()
|
|
|
+ # table.extract()
|
|
|
i = 0
|
|
|
headers = ""
|
|
|
+ table_prem = {}
|
|
|
while i < len(trs) - 1:
|
|
|
flag_, headers_ = self.find_header(trs[i])
|
|
|
if flag_ and headers_ != dict():
|
|
@@ -4854,9 +4892,21 @@ class TablePremExtractor(object):
|
|
|
if len(table_items) > 0:
|
|
|
df = pd.DataFrame(table_items)
|
|
|
prem_ = self.extract_from_df(df, headers)
|
|
|
- rs_dic.update(prem_)
|
|
|
+ # rs_dic.update(prem_)
|
|
|
+ table_prem.update(prem_)
|
|
|
i = j - 1
|
|
|
i += 1
|
|
|
+ if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
|
+ sib = table.find_previous_sibling()
|
|
|
+ sib_text = sib.get_text()
|
|
|
+ ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
|
|
|
+ if sib.name in ['p', 'div'] and len(sib_text)<30 and ser_sib:
|
|
|
+ package_sib = ser_sib.group(0)
|
|
|
+ package_sib = uniform_package_name(package_sib)
|
|
|
+ table_prem[package_sib] = table_prem.pop('1')
|
|
|
+ if table_prem:
|
|
|
+ rs_dic.update(table_prem)
|
|
|
+ table.extract()
|
|
|
return rs_dic
|
|
|
|
|
|
def predict(self, html):
|
|
@@ -4881,7 +4931,7 @@ class CandidateExtractor(object):
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否中标|是否入围|是否入库|入围结论',
|
|
|
"candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
|
- "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
|
|
|
+ "bid_amount": "投标[报总]价|(中标|成交))?([金总]额|[报均总]价|价[格款])|承包价",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
|
"third_tenderer": "第三名|第三(中标|成交)?候选人",
|
|
@@ -4931,6 +4981,20 @@ class CandidateExtractor(object):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
def get_role(self, text):
    """Extract the leading organisation/company name from ``text``.

    ``getNers`` is applied to the text; the result counts as a role only
    if the ``org``/``company`` entities it finds account for more than
    80% of the text length.

    :param text: candidate role string; expected to be a ``str``.
    :return: text of the first org/company entity, or '' when the input
        is not a string, its length is outside 4..25 characters, or the
        entity coverage is too low.
    """
    # A non-string value (None, a missing value, a number) cannot be a
    # role; return '' rather than failing in len().
    if not isinstance(text, str):
        return ''
    # Length window check happens before the NER call so out-of-range
    # strings never reach getNers.
    if len(text) > 25 or len(text) < 4:
        return ''
    ners = getNers([text], useselffool=True)
    if not ners:
        return ''
    # NOTE(review): ner[2] is read as the entity type and ner[3] as the
    # entity text; verify against getNers' output layout.
    roles = [ner[3] for ner in ners[0] if ner[2] in ('org', 'company')]
    # The 80% threshold uses the combined length of every match, while
    # the returned value is the first match only.
    if roles and len(''.join(roles)) > len(text) * 0.8:
        return roles[0]
    return ''
|
|
|
+
|
|
|
def money_process(self, money_text, header):
|
|
|
'''
|
|
|
输入金额文本及金额列表头,返回统一数字化金额及金额单位
|
|
@@ -4982,17 +5046,18 @@ class CandidateExtractor(object):
|
|
|
|
|
|
package_code = package_code_raw
|
|
|
|
|
|
- candidate = candidate_ if self.is_role(candidate_) else ""
|
|
|
+ # candidate = candidate_ if self.is_role(candidate_) else ""
|
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
+ candidate = self.get_role(candidate_)
|
|
|
|
|
|
# if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
# break
|
|
|
- if(candidate,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
|
|
|
+ if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
|
|
|
continue
|
|
|
link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_))
|
|
|
package = package_code
|
|
|
package = uniform_package_name(package) if package !="" else "Project"
|
|
|
- if candidate_:
|
|
|
+ if candidate:
|
|
|
if win_or_not and re.search('否|未入围', win_or_not):
|
|
|
pass
|
|
|
else:
|
|
@@ -5002,7 +5067,9 @@ class CandidateExtractor(object):
|
|
|
if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
|
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
|
- if self.is_role(text):
|
|
|
+ text = self.get_role(text)
|
|
|
+ if text:
|
|
|
+ # if self.is_role(text):
|
|
|
if type not in role_dic:
|
|
|
role_dic[type] = dict()
|
|
|
role_dic[type]['role_text'] = text
|
|
@@ -5134,7 +5201,9 @@ class CandidateExtractor(object):
|
|
|
text = sentences[sen_index].sentence_text
|
|
|
b = ent.wordOffset_begin
|
|
|
e = ent.wordOffset_end
|
|
|
- if isinstance(b, int) and isinstance(e, int):
|
|
|
+ if ent.label in [2,3,4]: # 直接加实体预测的候选人, 否则规则检查是否为候选人
|
|
|
+ candidates.add(ent.entity_text)
|
|
|
+ elif isinstance(b, int) and isinstance(e, int):
|
|
|
foreword = text[max(0, b - 10):b]
|
|
|
if re.search(self.p, foreword):
|
|
|
candidates.add(ent.entity_text)
|