|
@@ -32,6 +32,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
"epc":{"predictor":None,"Lock":RLock()},
|
|
|
"roleRule":{"predictor":None,"Lock":RLock()},
|
|
|
"roleRuleFinal":{"predictor":None,"Lock":RLock()},
|
|
|
+ "tendereeRuleRecall":{"predictor":None,"Lock":RLock()},
|
|
|
"form":{"predictor":None,"Lock":RLock()},
|
|
|
"time":{"predictor":None,"Lock":RLock()},
|
|
|
"punish":{"predictor":None,"Lock":RLock()},
|
|
@@ -57,6 +58,8 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]["predictor"] = RoleRulePredictor()
|
|
|
if _type == "roleRuleFinal":
|
|
|
dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
|
|
|
+ if _type == "tendereeRuleRecall":
|
|
|
+ dict_predictor[_type]["predictor"] = TendereeRuleRecall()
|
|
|
if _type == "form":
|
|
|
dict_predictor[_type]["predictor"] = FormPredictor()
|
|
|
if _type == "time":
|
|
@@ -332,9 +335,9 @@ class CodeNamePredict():
|
|
|
else:
|
|
|
begin = iter.span()[0]-get_len
|
|
|
end = iter.span()[1]+get_len
|
|
|
- code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
|
|
|
- code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
|
|
|
- _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
|
+ code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
|
|
|
+ code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""))
|
|
|
+ _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
|
temp_entitys.append(_entity)
|
|
|
#print("code",code_text)
|
|
|
if len(code_x)>0:
|
|
@@ -1100,11 +1103,11 @@ class RoleRulePredictor():
|
|
|
def __init__(self):
|
|
|
# (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
|
|
|
self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
|
|
|
- "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
|
- "[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
|
- self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
|
|
|
+ "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
|
+ "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
|
+ self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
|
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
- "(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
|
+ "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
|
self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
|
|
|
self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
|
|
@@ -1266,7 +1269,7 @@ class RoleRulePredictor():
|
|
|
if _v_group is not None and _v_group != "":
|
|
|
_role = _group.split("_")[0]
|
|
|
if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
|
|
|
- print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
|
+ # print('p_entity_sentenceindex:', p_entity.sentence_index)
|
|
|
if p_entity.sentence_index>=1: # 只在第一句进行这种模糊匹配
|
|
|
continue
|
|
|
if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
|
|
@@ -1279,8 +1282,7 @@ class RoleRulePredictor():
|
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
- list_spans[
|
|
|
- 0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
+ list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
_flag = True
|
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
"secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
@@ -1385,70 +1387,96 @@ class RoleRulePredictor():
|
|
|
|
|
|
'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
|
|
|
class RoleRuleFinalAdd():
|
|
|
- def predict(self, list_articles, list_entitys, list_codenames):
|
|
|
- text_end = list_articles[0].content[-40:]
|
|
|
+ def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
|
|
|
+ # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
|
|
|
+ main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
|
|
|
+ end_tokens = []
|
|
|
+ for sentence in main_sentences[-5:]:
|
|
|
+ end_tokens.extend(sentence.tokens)
|
|
|
+ text_end = "".join(end_tokens[-30:])
|
|
|
+ # print(text_end)
|
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
- sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
+ sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
- sear_ent3 = re.search('(报名咨询|收货地点|送货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
-
|
|
|
- if sear_ent or sear_ent2 or sear_ent3:
|
|
|
- if sear_ent3:
|
|
|
- ent_re = sear_ent3.group(2)
|
|
|
- elif sear_ent2:
|
|
|
- ent_re = sear_ent2.group(2)
|
|
|
- else:
|
|
|
- ent_re = sear_ent.group(1)
|
|
|
- ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
|
- tenderee_notfound = True
|
|
|
- agency_notfound = True
|
|
|
- ents = []
|
|
|
- for ent in list_entitys[0]:
|
|
|
- if ent.entity_type in ['org', 'company']:
|
|
|
- if ent.label == 0:
|
|
|
- tenderee_notfound = False
|
|
|
- elif ent.label == 1:
|
|
|
- agency_notfound = False
|
|
|
- elif ent.label == 5:
|
|
|
- ents.append(ent)
|
|
|
- if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
|
|
|
- or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
|
|
|
- n = 0
|
|
|
- for i in range(len(ents) - 1, -1, -1):
|
|
|
- n += 1
|
|
|
- if n > 3 and sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
|
- break
|
|
|
- if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
- ents[i].label = 0
|
|
|
- ents[i].values[0] = 0.5
|
|
|
- # log('正则最后补充实体: %s'%(ent_re))
|
|
|
- break
|
|
|
- elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
|
|
|
- n = 0
|
|
|
- for i in range(len(ents) - 1, -1, -1):
|
|
|
- n += 1
|
|
|
- if n > 3 and sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
|
- break
|
|
|
- if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
- ents[i].label = 1
|
|
|
- ents[i].values[1] = 0.5
|
|
|
- # log('正则最后补充实体: %s'%(ent_re))
|
|
|
- break
|
|
|
-
|
|
|
+ sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
|
|
|
+
|
|
|
+ tenderee_notfound = True
|
|
|
+ agency_notfound = True
|
|
|
+ ents = []
|
|
|
+ for ent in list_entitys[0]:
|
|
|
+ if ent.entity_type in ['org', 'company']:
|
|
|
+ if ent.label == 0:
|
|
|
+ tenderee_notfound = False
|
|
|
+ elif ent.label == 1:
|
|
|
+ agency_notfound = False
|
|
|
+ elif ent.label == 5:
|
|
|
+ ents.append(ent)
|
|
|
+ if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
|
|
|
+ for _sear_ent in [_sear for _sear in sear_list if _sear]:
|
|
|
+ # if sear_ent4:
|
|
|
+ # ent_re = sear_ent4.group(2)
|
|
|
+ # elif sear_ent3:
|
|
|
+ # ent_re = sear_ent3.group(2)
|
|
|
+ # elif sear_ent2:
|
|
|
+ # ent_re = sear_ent2.group(2)
|
|
|
+ # else:
|
|
|
+ # ent_re = sear_ent.group(1)
|
|
|
+ if _sear_ent==sear_ent4:
|
|
|
+ ent_re = _sear_ent.group(2)
|
|
|
+ elif _sear_ent==sear_ent3:
|
|
|
+ ent_re = _sear_ent.group(2)
|
|
|
+ elif _sear_ent==sear_ent2:
|
|
|
+ ent_re = _sear_ent.group(2)
|
|
|
+ else:
|
|
|
+ ent_re = _sear_ent.group(1)
|
|
|
+ # print('ent_re', ent_re)
|
|
|
+ ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
|
+
|
|
|
+ if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
|
|
|
+ or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
|
|
|
+ n = 0
|
|
|
+ for i in range(len(ents) - 1, -1, -1):
|
|
|
+ if not ents[i].in_attachment:
|
|
|
+ n += 1
|
|
|
+ if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
|
+ break
|
|
|
+ if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
+ ents[i].label = 0
|
|
|
+ ents[i].values[0] = 0.5
|
|
|
+ tenderee_notfound = False
|
|
|
+ # log('正则最后补充实体: %s'%(ent_re))
|
|
|
+ break
|
|
|
+ elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
|
|
|
+ n = 0
|
|
|
+ for i in range(len(ents) - 1, -1, -1):
|
|
|
+ if not ents[i].in_attachment:
|
|
|
+ n += 1
|
|
|
+ if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
|
|
|
+ break
|
|
|
+ if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
+ ents[i].label = 1
|
|
|
+ ents[i].values[1] = 0.5
|
|
|
+ agency_notfound = False
|
|
|
+ # log('正则最后补充实体: %s'%(ent_re))
|
|
|
+ break
|
|
|
+ if not tenderee_notfound:
|
|
|
+ break
|
|
|
|
|
|
elif list_codenames[0]['name'] != "": #把标题包含的公司实体作为招标人
|
|
|
- tenderee_notfound = True
|
|
|
- ents = []
|
|
|
- for ent in list_entitys[0]:
|
|
|
- if ent.entity_type in ['org', 'company']:
|
|
|
- if ent.label == 0:
|
|
|
- tenderee_notfound = False
|
|
|
- elif ent.label == 1:
|
|
|
- agency_notfound = False
|
|
|
- elif ent.label == 5:
|
|
|
- ents.append(ent)
|
|
|
+ # tenderee_notfound = True
|
|
|
+ # ents = []
|
|
|
+ # for ent in list_entitys[0]:
|
|
|
+ # if ent.entity_type in ['org', 'company']:
|
|
|
+ # if ent.label == 0:
|
|
|
+ # tenderee_notfound = False
|
|
|
+ # elif ent.label == 1:
|
|
|
+ # agency_notfound = False
|
|
|
+ # elif ent.label == 5:
|
|
|
+ # ents.append(ent)
|
|
|
if tenderee_notfound == True:
|
|
|
- print('list_codenames',list_codenames[0]['name'])
|
|
|
+ # print('list_codenames',list_codenames[0]['name'])
|
|
|
for ent in ents:
|
|
|
if ent.entity_text in list_codenames[0]['name']:
|
|
|
ent.label = 0
|
|
@@ -1456,7 +1484,179 @@ class RoleRuleFinalAdd():
|
|
|
# log('正则召回标题中包含的实体:%s'%ent.entity_text)
|
|
|
break
|
|
|
|
|
|
-
|
|
|
+# 招标人角色召回规则
|
|
|
+class TendereeRuleRecall():
|
|
|
+ def __init__(self):
|
|
|
+ self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
|
|
|
+ "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
|
|
|
+
|
|
|
+ self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
|
|
|
+ "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
|
|
|
+ "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
|
|
|
+ "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
|
|
|
+ "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
|
|
|
+ "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
|
|
|
+ self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
|
|
|
+ self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]+)")
|
|
|
+ # 公告主语判断规则
|
|
|
+ self.subject = re.compile("[我本][院校局]")
|
|
|
+ # 未识别实体召回正则
|
|
|
+ self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
|
|
|
+ "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
+ "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
|
|
|
+ self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
|
|
|
+ "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
|
+ "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
|
|
|
+ # 未识别实体尾部判断
|
|
|
+ self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓)")
|
|
|
+ self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
|
|
|
+
|
|
|
+ def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
|
|
|
+ # tenderee_notfound = True
|
|
|
+ # agency_notfound = True
|
|
|
+ self.get_tenderee = False
|
|
|
+ ents = []
|
|
|
+ list_name = []
|
|
|
+ for ent in list_entitys[0]:
|
|
|
+ if ent.entity_type == 'name':
|
|
|
+ list_name.append(ent.entity_text)
|
|
|
+ if ent.entity_type in ['org', 'company']:
|
|
|
+ if ent.label == 0:
|
|
|
+ # tenderee_notfound = False
|
|
|
+ self.get_tenderee = True
|
|
|
+ # elif ent.label == 1:
|
|
|
+ # agency_notfound = False
|
|
|
+ elif ent.label == 5:
|
|
|
+ ents.append(ent)
|
|
|
+ if not self.get_tenderee:
|
|
|
+ self.entity_context_rule(ents,list_name,list_sentences)
|
|
|
+ if not self.get_tenderee:
|
|
|
+ self.subject_rule(ents,list_articles,list_sentences)
|
|
|
+ if not self.get_tenderee:
|
|
|
+ self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
|
|
|
+ if not self.get_tenderee:
|
|
|
+ self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
|
|
|
+
|
|
|
+ #entity上下文正则判断
|
|
|
+ def entity_context_rule(self,entitys,list_name,list_sentences):
|
|
|
+ for ent in entitys:
|
|
|
+ _sentence = list_sentences[0][ent.sentence_index]
|
|
|
+ _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
|
|
|
+ end_index=ent.end_index, size=40, center_include=True,
|
|
|
+ word_flag=True, use_text=True,
|
|
|
+ text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
|
|
|
+ if re.search(self.tenderee_left,_span[0]):
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
+ self.get_tenderee = True
|
|
|
+ elif re.search(self.tenderee_right,_span[2]):
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
+ self.get_tenderee = True
|
|
|
+ elif re.search(self.tenderee_right2, _span[2]):
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
+ self.get_tenderee = True
|
|
|
+ elif list_name:
|
|
|
+ pj_name = re.search(self.tenderee_right3, _span[2])
|
|
|
+ if pj_name:
|
|
|
+ pj_name = pj_name.groupdict()["project"]
|
|
|
+ for _name in list_name:
|
|
|
+ if _name in pj_name:
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5
|
|
|
+ self.get_tenderee = True
|
|
|
+ break
|
|
|
+ # 公告主语判断
|
|
|
+ def subject_rule(self, entitys,list_articles,list_sentences):
|
|
|
+ content = list_articles[0].content.split('##attachment##')[0]
|
|
|
+ if re.search(self.subject,content):
|
|
|
+ _subject = re.search(self.subject,content).group()
|
|
|
+ for ent in entitys:
|
|
|
+ if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
+ self.get_tenderee = True
|
|
|
+ elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
+ self.get_tenderee = True
|
|
|
+ elif re.search("局", _subject) and re.search("局", ent.entity_text):
|
|
|
+ _sentence = list_sentences[0][ent.sentence_index]
|
|
|
+ _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
|
|
|
+ end_index=ent.end_index, size=20, center_include=True,
|
|
|
+ word_flag=True, use_text=True,
|
|
|
+ text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
|
|
|
+ if not re.search("监督|投诉",_span[0][-10:]):
|
|
|
+ ent.label = 0
|
|
|
+ ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
+ self.get_tenderee = True
|
|
|
+
|
|
|
+ # 正则召回未识别实体
|
|
|
+ def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
|
|
|
+ list_sentence = list_sentences[0]
|
|
|
+ for in_attachment in [False,True]:
|
|
|
+ for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
|
|
|
+ sentence_text = sentence.sentence_text
|
|
|
+ tokens = sentence.tokens
|
|
|
+ doc_id = sentence.doc_id
|
|
|
+ in_attachment = sentence.in_attachment
|
|
|
+ list_tokenbegin = []
|
|
|
+ begin = 0
|
|
|
+ for i in range(0, len(tokens)):
|
|
|
+ list_tokenbegin.append(begin)
|
|
|
+ begin += len(str(tokens[i]))
|
|
|
+ list_tokenbegin.append(begin + 1)
|
|
|
+ for _match in re.finditer(pattern,sentence_text):
|
|
|
+ _groupdict = _match.groupdict()
|
|
|
+ _match_text = _match.group()
|
|
|
+ _unrecognized_text = _groupdict["unrecognized"]
|
|
|
+ # print(_unrecognized_text)
|
|
|
+ # if _match_text[-1] in [':',':']:
|
|
|
+ # _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
|
|
|
+ # if not _unrecognized:
|
|
|
+ # _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
|
|
|
+ # if _unrecognized:
|
|
|
+ # _unrecognized = _unrecognized.group()
|
|
|
+ # else:
|
|
|
+ # continue
|
|
|
+ # else:
|
|
|
+ # _unrecognized = _unrecognized_text
|
|
|
+ _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
|
|
|
+ if not _unrecognized:
|
|
|
+ _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
|
|
|
+ if _unrecognized:
|
|
|
+ _unrecognized = _unrecognized.group()
|
|
|
+ else:
|
|
|
+ continue
|
|
|
+ # print(_unrecognized)
|
|
|
+ if re.search("某",_unrecognized):
|
|
|
+ continue
|
|
|
+ begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
|
|
|
+ for j in range(len(list_tokenbegin)):
|
|
|
+ if list_tokenbegin[j] == begin_index_temp:
|
|
|
+ begin_index = j
|
|
|
+ break
|
|
|
+ elif list_tokenbegin[j] > begin_index_temp:
|
|
|
+ begin_index = j - 1
|
|
|
+ break
|
|
|
+ index = begin_index_temp + len(_unrecognized)
|
|
|
+ end_index_temp = index
|
|
|
+ for j in range(begin_index, len(list_tokenbegin)):
|
|
|
+ if list_tokenbegin[j] >= index:
|
|
|
+ end_index = j - 1
|
|
|
+ break
|
|
|
+ entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
|
|
|
+ entity_text = _unrecognized
|
|
|
+ new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
|
|
|
+ begin_index_temp, end_index_temp, in_attachment=in_attachment)
|
|
|
+ new_entity.label = 0
|
|
|
+ new_entity.values = [on_value,0,0,0,0,0]
|
|
|
+ list_entitys[0].append(new_entity)
|
|
|
+ self.get_tenderee = True
|
|
|
+ if self.get_tenderee:
|
|
|
+ list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
+ break
|
|
|
|
|
|
# 时间类别
|
|
|
class TimePredictor():
|