|
@@ -1094,21 +1094,23 @@ class FormPredictor():
|
|
class RoleRulePredictor():
|
|
class RoleRulePredictor():
|
|
|
|
|
|
def __init__(self):
|
|
def __init__(self):
|
|
- # self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)"
|
|
|
|
- # self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价|评选|挂牌|出租|出让|谈判|邀标|邀请|洽谈|约谈|买受|选取|抽取|抽选|出售|标卖|比价)(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人)[))]?(名称|信息)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
|
|
|
|
- self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价|评选|挂牌|出租|出让|谈判|邀标|邀请|洽谈|约谈|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
|
|
|
|
|
|
+ # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
|
|
|
|
+ self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
"[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
|
|
"[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
|
|
|
|
+ self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
|
|
|
|
+ "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
|
+ "(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
|
|
self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
|
|
self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
|
|
self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^拟对|^现就|^现委托)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^拟对|^现就|^现委托)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
|
|
|
- self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
|
|
|
|
- self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|受.{,15}委托|^受托)"
|
|
|
|
|
|
+ self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
|
|
+ self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托)" # |^受托 会与 受托生产等冲突,暂时为发现受托表达代理方式
|
|
# 2020/11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
# 2020/11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
- # self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
|
|
|
|
- self.pattern_winTenderer_left = "(?P<winTenderer_left>(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为,]+$|" \
|
|
|
|
|
|
+ self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为,]+$|" \
|
|
"(选定单位|指定的中介服务机构|实施主体|承制单位)[::是为,]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::,]*$|" \
|
|
"(选定单位|指定的中介服务机构|实施主体|承制单位)[::是为,]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::,]*$|" \
|
|
"单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))(是|为|:|:)$|(供应|供货|供|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为,]$)"
|
|
"单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))(是|为|:|:)$|(供应|供货|供|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为,]$)"
|
|
|
|
+ self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为,]+$)"
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
@@ -1122,26 +1124,23 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
|
|
self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
|
|
self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
-
|
|
|
|
- self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left],
|
|
|
|
- ["C",self.pattern_tenderee_center],
|
|
|
|
- ["R",self.pattern_tenderee_right]],
|
|
|
|
- "1":[["L",self.pattern_agency_left],
|
|
|
|
- ["R",self.pattern_agency_right]],
|
|
|
|
- "2":[["L",self.pattern_winTenderer_left],
|
|
|
|
- # ["C",self.pattern_winTenderer_center],
|
|
|
|
- ["R",self.pattern_winTenderer_right],
|
|
|
|
- ["W",self.pattern_winTenderer_whole]],
|
|
|
|
- "3":[["L",self.pattern_secondTenderer_left],
|
|
|
|
- ["R",self.pattern_secondTenderer_right]],
|
|
|
|
- "4":[["L",self.pattern_thirdTenderer_left],
|
|
|
|
- ["R",self.pattern_thirdTenderer_right]]}
|
|
|
|
- self.pattern_whole = []
|
|
|
|
- for _k,_v in self.dict_list_pattern.items():
|
|
|
|
- for _d,_p in _v:
|
|
|
|
- self.pattern_whole.append(_p)
|
|
|
|
- # self.pattern_whole = "|".join(list_pattern)
|
|
|
|
-
|
|
|
|
|
|
+
|
|
|
|
+ self.pattern_whole = [self.pattern_tenderee_left,
|
|
|
|
+ self.pattern_tenderee_left_w1,
|
|
|
|
+ self.pattern_tenderee_center,
|
|
|
|
+ self.pattern_tenderee_right,
|
|
|
|
+ self.pattern_agency_left,
|
|
|
|
+ self.pattern_agency_right,
|
|
|
|
+ self.pattern_winTenderer_left,
|
|
|
|
+ self.pattern_winTenderer_left_w1,
|
|
|
|
+ self.pattern_winTenderer_whole,
|
|
|
|
+ self.pattern_winTenderer_right,
|
|
|
|
+ self.pattern_secondTenderer_left,
|
|
|
|
+ self.pattern_secondTenderer_right,
|
|
|
|
+ self.pattern_thirdTenderer_left,
|
|
|
|
+ self.pattern_thirdTenderer_right
|
|
|
|
+ ] # 需按顺序排列, 第二、三中标要在中标正则后面
|
|
|
|
+
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
|
|
self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
|
|
@@ -1162,59 +1161,66 @@ class RoleRulePredictor():
|
|
raise Exception("null text in input ")
|
|
raise Exception("null text in input ")
|
|
|
|
|
|
return text
|
|
return text
|
|
-
|
|
|
|
- def predict(self,list_articles,list_sentences,list_entitys,list_codenames,on_value = 0.5):
|
|
|
|
|
|
|
|
- for article,list_entity,list_sentence,list_codename in zip(list_articles,list_entitys,list_sentences,list_codenames):
|
|
|
|
|
|
+
|
|
|
|
+ def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
|
|
|
|
+
|
|
|
|
+ for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
|
|
|
|
+ list_codenames):
|
|
list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
|
|
list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
|
|
# list_name = list_codename["name"]
|
|
# list_name = list_codename["name"]
|
|
list_name = [] # 2022/1/5 改为实体列表内所有项目名称
|
|
list_name = [] # 2022/1/5 改为实体列表内所有项目名称
|
|
for entity in list_entity:
|
|
for entity in list_entity:
|
|
if entity.entity_type == 'name':
|
|
if entity.entity_type == 'name':
|
|
list_name.append(entity.entity_text)
|
|
list_name.append(entity.entity_text)
|
|
- list_name = self._check_input(list_name)+[article.title]
|
|
|
|
|
|
+ list_name = self._check_input(list_name) + [article.title]
|
|
for p_entity in list_entity:
|
|
for p_entity in list_entity:
|
|
|
|
|
|
-
|
|
|
|
- if p_entity.entity_type in ["org","company"]:
|
|
|
|
- #将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
|
|
|
|
- if str(p_entity.label)=="0":
|
|
|
|
|
|
+ if p_entity.entity_type in ["org", "company"]:
|
|
|
|
+ # 只解析角色为无的或者概率低于阈值的
|
|
|
|
+ if p_entity.label is None:
|
|
|
|
+ continue
|
|
|
|
+ # 将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
|
|
|
|
+ if str(p_entity.label) == "0":
|
|
find_flag = False
|
|
find_flag = False
|
|
for _sentence in list_sentence:
|
|
for _sentence in list_sentence:
|
|
- if _sentence.sentence_index==p_entity.sentence_index:
|
|
|
|
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,use_text=True,text=re.sub(")",")",re.sub("(","(",p_entity.entity_text)))
|
|
|
|
|
|
+ if _sentence.sentence_index == p_entity.sentence_index:
|
|
|
|
+ _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
|
+ end_index=p_entity.end_index, size=20, center_include=True,
|
|
|
|
+ word_flag=True, use_text=True,
|
|
|
|
+ text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
|
|
for _name in list_name:
|
|
for _name in list_name:
|
|
- if _name!="" and str(_span[1]+_span[2][:len(str(_name))]).find(_name)>=0:
|
|
|
|
|
|
+ if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
|
|
find_flag = True
|
|
find_flag = True
|
|
- if p_entity.values[0]>on_value:
|
|
|
|
- p_entity.values[0] = 0.6+(p_entity.values[0]-0.6)/10
|
|
|
|
|
|
+ if p_entity.values[0] > on_value:
|
|
|
|
+ p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
|
|
if find_flag:
|
|
if find_flag:
|
|
continue
|
|
continue
|
|
-
|
|
|
|
-
|
|
|
|
- #只解析角色为无的或者概率低于阈值的
|
|
|
|
- if p_entity.label is None:
|
|
|
|
- continue
|
|
|
|
|
|
+
|
|
|
|
+ # 正则从概率低于阈值或其他类别中召回角色
|
|
role_prob = float(p_entity.values[int(p_entity.label)])
|
|
role_prob = float(p_entity.values[int(p_entity.label)])
|
|
- if role_prob<on_value or str(p_entity.label)=="5":
|
|
|
|
- #将标题中的实体置为招标人
|
|
|
|
- _list_name = self._check_input(list_name,ignore=True)
|
|
|
|
|
|
+ if role_prob < on_value or str(p_entity.label) == "5":
|
|
|
|
+ # 将标题中的实体置为招标人
|
|
|
|
+ _list_name = self._check_input(list_name, ignore=True)
|
|
find_flag = False
|
|
find_flag = False
|
|
- for _name in _list_name: #2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
|
|
|
|
- if str(_name).find(re.sub(")",")",re.sub("(","(",p_entity.entity_text))) >= 0 and p_entity.sentence_index<4:
|
|
|
|
|
|
+ for _name in _list_name: # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
|
|
|
|
+ if str(_name).find(re.sub(")", ")", re.sub("(", "(",
|
|
|
|
+ p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
|
|
for _sentence in list_sentence:
|
|
for _sentence in list_sentence:
|
|
if _sentence.sentence_index == p_entity.sentence_index:
|
|
if _sentence.sentence_index == p_entity.sentence_index:
|
|
_span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
_span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
end_index=p_entity.end_index, size=20, center_include=True,
|
|
end_index=p_entity.end_index, size=20, center_include=True,
|
|
- word_flag=True, use_text=True,text=re.sub(")",")",re.sub("(","(",p_entity.entity_text)))
|
|
|
|
|
|
+ word_flag=True, use_text=True, text=re.sub(")", ")",
|
|
|
|
+ re.sub("(", "(",
|
|
|
|
+ p_entity.entity_text)))
|
|
if str(_span[1] + _span[2][:len(str(_name))]).find(
|
|
if str(_span[1] + _span[2][:len(str(_name))]).find(
|
|
- _name) >= 0:
|
|
|
|
|
|
+ _name) >= 0:
|
|
find_flag = True
|
|
find_flag = True
|
|
_label = 0
|
|
_label = 0
|
|
p_entity.label = _label
|
|
p_entity.label = _label
|
|
p_entity.values[int(_label)] = on_value
|
|
p_entity.values[int(_label)] = on_value
|
|
break
|
|
break
|
|
- if p_entity.sentence_index>=4:
|
|
|
|
|
|
+ if p_entity.sentence_index >= 4:
|
|
break
|
|
break
|
|
if find_flag:
|
|
if find_flag:
|
|
break
|
|
break
|
|
@@ -1224,194 +1230,142 @@ class RoleRulePredictor():
|
|
# p_entity.label = _label
|
|
# p_entity.label = _label
|
|
# p_entity.values[int(_label)] = on_value
|
|
# p_entity.values[int(_label)] = on_value
|
|
# break
|
|
# break
|
|
- #若是实体在标题中,默认为招标人,不进行以下的规则匹配
|
|
|
|
|
|
+ # 若是实体在标题中,默认为招标人,不进行以下的规则匹配
|
|
if find_flag:
|
|
if find_flag:
|
|
continue
|
|
continue
|
|
-
|
|
|
|
|
|
+
|
|
for s_index in range(len(list_sentence)):
|
|
for s_index in range(len(list_sentence)):
|
|
- if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
|
|
|
|
|
|
+ if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
|
|
|
|
+ list_sentence[s_index].sentence_index:
|
|
tokens = list_sentence[s_index].tokens
|
|
tokens = list_sentence[s_index].tokens
|
|
begin_index = p_entity.begin_index
|
|
begin_index = p_entity.begin_index
|
|
end_index = p_entity.end_index
|
|
end_index = p_entity.end_index
|
|
size = 15
|
|
size = 15
|
|
- spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
|
|
|
|
- #距离
|
|
|
|
- list_distance = [100,100,100,100,100]
|
|
|
|
- _flag = False
|
|
|
|
-
|
|
|
|
|
|
+ spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
|
|
|
|
+ word_flag=True, use_text=False)
|
|
|
|
+ # _flag = False
|
|
|
|
|
|
- #使用正则+距离解决冲突
|
|
|
|
|
|
+ # 使用正则+距离解决冲突
|
|
# 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
|
|
# 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
|
|
- list_spans = [spans[0][-30:],spans[0][-10:]+spans[1]+spans[2][:10],spans[2]]
|
|
|
|
|
|
+ list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]] # 实体左、中、右 信息
|
|
for _i_span in range(len(list_spans)):
|
|
for _i_span in range(len(list_spans)):
|
|
|
|
+ _flag = False
|
|
|
|
+ _prob_weight = 1
|
|
|
|
+
|
|
# print(list_spans[_i_span],p_entity.entity_text)
|
|
# print(list_spans[_i_span],p_entity.entity_text)
|
|
for _pattern in self.pattern_whole:
|
|
for _pattern in self.pattern_whole:
|
|
- for _iter in re.finditer(_pattern,list_spans[_i_span]):
|
|
|
|
- for _group,_v_group in _iter.groupdict().items():
|
|
|
|
- if _v_group is not None and _v_group!="":
|
|
|
|
|
|
+ for _iter in re.finditer(_pattern, list_spans[_i_span]):
|
|
|
|
+ for _group, _v_group in _iter.groupdict().items():
|
|
|
|
+ if _v_group is not None and _v_group != "":
|
|
_role = _group.split("_")[0]
|
|
_role = _group.split("_")[0]
|
|
_direct = _group.split("_")[1]
|
|
_direct = _group.split("_")[1]
|
|
- _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role)
|
|
|
|
- if _i_span==0 and _direct=="left" and re.search('各供应商|尊敬的供应商', list_spans[0])==None: #2021/12/22 修正错误中标召回 例子208668937
|
|
|
|
|
|
+ _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
|
|
+ # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
|
+ # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
|
+ if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商',
|
|
|
|
+ list_spans[
|
|
|
|
+ 0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
_flag = True
|
|
_flag = True
|
|
- _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
|
|
|
|
- list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
|
|
|
|
- if _i_span==1 and _direct=="center":
|
|
|
|
|
|
+ _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
|
+ "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
|
+ _prob_weight = 1.2 if _weight=='w1' else 1
|
|
|
|
+ # print('_v_group:',_group, _v_group, p_entity.entity_text)
|
|
|
|
+
|
|
|
|
+ if _i_span == 1 and _direct == "center":
|
|
_flag = True
|
|
_flag = True
|
|
- _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
|
|
|
|
- list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
|
|
|
|
- if _i_span==2 and _direct=="right":
|
|
|
|
|
|
+ _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
|
+ "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
|
+ _prob_weight = 1.2 if _weight == 'w1' else 1
|
|
|
|
+ # print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
|
+
|
|
|
|
+ if _i_span == 2 and _direct == "right":
|
|
_flag = True
|
|
_flag = True
|
|
- _distance = _iter.span()[0]
|
|
|
|
- list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- # print(list_distance)
|
|
|
|
-
|
|
|
|
- # for _key in self.dict_list_pattern.keys():
|
|
|
|
- #
|
|
|
|
- # for pattern in self.dict_list_pattern[_key]:
|
|
|
|
- # if pattern[0]=="L":
|
|
|
|
- # for _iter in re.finditer(pattern[1], spans[0][-30:]):
|
|
|
|
- # _flag = True
|
|
|
|
- # if len(spans[0])-_iter.span()[1]<list_distance[int(_key)]:
|
|
|
|
- # list_distance[int(_key)] = len(spans[0])-_iter.span()[1]-(_iter.span()[1]-_iter.span()[0])
|
|
|
|
- #
|
|
|
|
- # if pattern[0]=="C":
|
|
|
|
- # if re.search(pattern[1],spans[0]) is None and re.search(pattern[1],spans[2]) is None and re.search(pattern[1],spans[0]+spans[1]+spans[2]) is not None:
|
|
|
|
- # _flag = True
|
|
|
|
- # list_distance[int(_key)] = 0
|
|
|
|
- #
|
|
|
|
- # if pattern[0]=="R":
|
|
|
|
- # for _iter in re.finditer(pattern[1], spans[2][:30]):
|
|
|
|
- # _flag = True
|
|
|
|
- # if _iter.span()[0]<list_distance[int(_key)]:
|
|
|
|
- # list_distance[int(_key)] = _iter.span()[0]
|
|
|
|
- # if pattern[0]=="W":
|
|
|
|
- # spans = spanWindow(tokens, begin_index, end_index, size=20, center_include=True, word_flag=True, use_text=False)
|
|
|
|
- # for _iter in re.finditer(pattern[1], "".join(spans)):
|
|
|
|
- # _flag = True
|
|
|
|
- # if _iter.span()[0]<list_distance[int(_key)]:
|
|
|
|
- # list_distance[int(_key)] = _iter.span()[0]
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- # print("==",list_distance)
|
|
|
|
- #得到结果
|
|
|
|
- _label = np.argmin(list_distance)
|
|
|
|
- if _flag:
|
|
|
|
- # if _label==2 and min(list_distance[3:])<100:
|
|
|
|
- # _label += np.argmin(list_distance[3:])+1
|
|
|
|
- if _label in [2,3,4]:
|
|
|
|
- if p_entity.entity_type in ["company","org"]:
|
|
|
|
- p_entity.label = _label
|
|
|
|
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
|
|
|
|
- else:
|
|
|
|
- p_entity.label = _label
|
|
|
|
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
|
|
|
|
- # if p_entity.entity_type=="location":
|
|
|
|
- # for _sentence in list_sentence:
|
|
|
|
- # if _sentence.sentence_index==p_entity.sentence_index:
|
|
|
|
- # _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=5,center_include=True,word_flag=True,text=p_entity.entity_text)
|
|
|
|
- # if re.search(self.pattern_winTenderer_location,_span[0][-10:]) is not None and re.search("地址|地点",_span[0]) is None:
|
|
|
|
- # p_entity.entity_type="company"
|
|
|
|
- # _label = "2"
|
|
|
|
- # p_entity.label = _label
|
|
|
|
- # p_entity.values = [0]*6
|
|
|
|
- # p_entity.values[int(_label)] = on_value
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- #确定性强的特殊修改
|
|
|
|
- if p_entity.entity_type in ["company","org"]:
|
|
|
|
- for s_index in range(len(list_sentence)):
|
|
|
|
- if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
|
|
|
|
- tokens = list_sentence[s_index].tokens
|
|
|
|
- begin_index = p_entity.begin_index
|
|
|
|
- end_index = p_entity.end_index
|
|
|
|
- size = 15
|
|
|
|
- spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
|
|
|
|
- #距离
|
|
|
|
- list_distance = [100,100,100,100,100]
|
|
|
|
- _flag = False
|
|
|
|
- for _key in self.dict_list_pattern.keys():
|
|
|
|
- for pattern in self.dict_list_pattern[_key]:
|
|
|
|
- if pattern[0]=="W":
|
|
|
|
- spans = spanWindow(tokens, begin_index, end_index, size=30, center_include=True, word_flag=True, use_text=False)
|
|
|
|
- for _iter in re.finditer(pattern[1], spans[0][-10:]+spans[1]+spans[2]):
|
|
|
|
- _flag = True
|
|
|
|
- if _iter.span()[0]<list_distance[int(_key)]:
|
|
|
|
- list_distance[int(_key)] = _iter.span()[0]
|
|
|
|
- #得到结果
|
|
|
|
- _label = np.argmin(list_distance)
|
|
|
|
- if _flag:
|
|
|
|
- if _label==2 and min(list_distance[3:])<100:
|
|
|
|
- _label += np.argmin(list_distance[3:])+1
|
|
|
|
- if _label in [2,3,4]:
|
|
|
|
|
|
+ _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
|
+ "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
|
+ _prob_weight = 1.2 if _weight == 'w1' else 1
|
|
|
|
+ # print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
|
+
|
|
|
|
+ # 得到结果
|
|
|
|
+ if _flag:
|
|
p_entity.label = _label
|
|
p_entity.label = _label
|
|
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
|
|
|
|
- else:
|
|
|
|
- p_entity.label = _label
|
|
|
|
- p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
|
|
|
|
|
|
+ p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
|
|
|
|
+ # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+ # 其他金额通过正则召回可能是招标或中投标的金额
|
|
if p_entity.entity_type in ["money"]:
|
|
if p_entity.entity_type in ["money"]:
|
|
- if str(p_entity.label)=="2":
|
|
|
|
|
|
+ if str(p_entity.label) == "2":
|
|
for _sentence in list_sentence:
|
|
for _sentence in list_sentence:
|
|
- if _sentence.sentence_index==p_entity.sentence_index:
|
|
|
|
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
|
|
|
|
- if re.search(self.pattern_money_tenderee,_span[0]) is not None and re.search(self.pattern_money_other,_span[0]) is None:
|
|
|
|
- p_entity.values[0] = 0.8+p_entity.values[0]/10
|
|
|
|
|
|
+ if _sentence.sentence_index == p_entity.sentence_index:
|
|
|
|
+ _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
|
+ end_index=p_entity.end_index, size=20, center_include=True,
|
|
|
|
+ word_flag=True, text=p_entity.entity_text)
|
|
|
|
+ if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
|
|
|
|
+ self.pattern_money_other, _span[0]) is None:
|
|
|
|
+ p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
p_entity.label = 0
|
|
p_entity.label = 0
|
|
- if re.search(self.pattern_money_tenderer,_span[0]) is not None:
|
|
|
|
- if re.search(self.pattern_money_other,_span[0]) is not None:
|
|
|
|
- if re.search(self.pattern_money_tenderer,_span[0]).span()[1]>re.search(self.pattern_money_other,_span[0]).span()[1]:
|
|
|
|
- p_entity.values[1] = 0.8+p_entity.values[1]/10
|
|
|
|
|
|
+ if re.search(self.pattern_money_tenderer, _span[0]) is not None:
|
|
|
|
+ if re.search(self.pattern_money_other, _span[0]) is not None:
|
|
|
|
+ if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
|
|
|
|
+ re.search(self.pattern_money_other, _span[0]).span()[1]:
|
|
|
|
+ p_entity.values[1] = 0.8 + p_entity.values[1] / 10
|
|
p_entity.label = 1
|
|
p_entity.label = 1
|
|
else:
|
|
else:
|
|
- p_entity.values[1] = 0.8+p_entity.values[1]/10
|
|
|
|
|
|
+ p_entity.values[1] = 0.8 + p_entity.values[1] / 10
|
|
p_entity.label = 1
|
|
p_entity.label = 1
|
|
- if re.search(self.pattern_money_tenderer_whole,"".join(_span)) is not None and re.search(self.pattern_money_other,_span[0]) is None:
|
|
|
|
- p_entity.values[1] = 0.8+p_entity.values[1]/10
|
|
|
|
|
|
+ if re.search(self.pattern_money_tenderer_whole,
|
|
|
|
+ "".join(_span)) is not None and re.search(self.pattern_money_other,
|
|
|
|
+ _span[0]) is None:
|
|
|
|
+ p_entity.values[1] = 0.8 + p_entity.values[1] / 10
|
|
p_entity.label = 1
|
|
p_entity.label = 1
|
|
-
|
|
|
|
- #增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
|
|
|
|
|
|
+
|
|
|
|
+ # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将未识别的金额设置为招标金额
|
|
list_p = []
|
|
list_p = []
|
|
state = 0
|
|
state = 0
|
|
for p_entity in list_entity:
|
|
for p_entity in list_entity:
|
|
for _sentence in list_sentence:
|
|
for _sentence in list_sentence:
|
|
- if _sentence.sentence_index==p_entity.sentence_index:
|
|
|
|
- _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
|
|
|
|
-
|
|
|
|
- if state==2:
|
|
|
|
|
|
+ if _sentence.sentence_index == p_entity.sentence_index:
|
|
|
|
+ _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
|
+ end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
|
|
|
|
+ text=p_entity.entity_text)
|
|
|
|
+
|
|
|
|
+ if state == 2:
|
|
for _p in list_p[1:]:
|
|
for _p in list_p[1:]:
|
|
-
|
|
|
|
- _p.values[0] = 0.8+_p.values[0]/10
|
|
|
|
|
|
+ _p.values[0] = 0.8 + _p.values[0] / 10
|
|
_p.label = 0
|
|
_p.label = 0
|
|
state = 0
|
|
state = 0
|
|
list_p = []
|
|
list_p = []
|
|
-
|
|
|
|
- if state==0:
|
|
|
|
|
|
+
|
|
|
|
+ if state == 0:
|
|
if p_entity.entity_type in ["money"]:
|
|
if p_entity.entity_type in ["money"]:
|
|
- if str(p_entity.label)=="0" and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None:
|
|
|
|
|
|
+ if str(p_entity.label) == "0" and re.search(self.pattern_pack,
|
|
|
|
+ _span[0] + "-" + _span[2]) is not None:
|
|
state = 1
|
|
state = 1
|
|
list_p.append(p_entity)
|
|
list_p.append(p_entity)
|
|
- elif state==1:
|
|
|
|
|
|
+ elif state == 1:
|
|
if p_entity.entity_type in ["money"]:
|
|
if p_entity.entity_type in ["money"]:
|
|
- if str(p_entity.label) in ["0","2"] and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None and re.search(self.pattern_money_other,_span[0]+"-"+_span[2]) is None and p_entity.sentence_index==list_p[0].sentence_index:
|
|
|
|
|
|
+ if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
|
|
|
|
+ _span[0] + "-" + _span[
|
|
|
|
+ 2]) is not None and re.search(
|
|
|
|
+ self.pattern_money_other,
|
|
|
|
+ _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
|
|
|
|
+ 0].sentence_index:
|
|
list_p.append(p_entity)
|
|
list_p.append(p_entity)
|
|
else:
|
|
else:
|
|
state = 2
|
|
state = 2
|
|
-
|
|
|
|
- if len(list_p)>1:
|
|
|
|
|
|
+
|
|
|
|
+ if len(list_p) > 1:
|
|
for _p in list_p[1:]:
|
|
for _p in list_p[1:]:
|
|
- #print("==",_p.entity_text,_p.sentence_index,_p.label)
|
|
|
|
- _p.values[0] = 0.8+_p.values[0]/10
|
|
|
|
|
|
+ # print("==",_p.entity_text,_p.sentence_index,_p.label)
|
|
|
|
+ _p.values[0] = 0.8 + _p.values[0] / 10
|
|
_p.label = 0
|
|
_p.label = 0
|
|
state = 0
|
|
state = 0
|
|
list_p = []
|
|
list_p = []
|
|
-
|
|
|
|
-
|
|
|
|
|
|
+
|
|
for p_entity in list_entity:
|
|
for p_entity in list_entity:
|
|
- #将属于集合中的不可能是中标人的标签置为无
|
|
|
|
|
|
+ # 将属于集合中的不可能是中标人的标签置为无
|
|
if p_entity.entity_text in self.SET_NOT_TENDERER:
|
|
if p_entity.entity_text in self.SET_NOT_TENDERER:
|
|
- p_entity.label=5
|
|
|
|
|
|
+ p_entity.label = 5
|
|
|
|
|
|
'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
|
|
'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
|
|
class RoleRuleFinalAdd():
|
|
class RoleRuleFinalAdd():
|
|
@@ -1420,14 +1374,15 @@ class RoleRuleFinalAdd():
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
- sear_ent3 = re.search('报名咨询,([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
|
|
|
+ sear_ent3 = re.search('(报名咨询|收货地点|送货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
if sear_ent or sear_ent2 or sear_ent3:
|
|
if sear_ent or sear_ent2 or sear_ent3:
|
|
if sear_ent3:
|
|
if sear_ent3:
|
|
- ent_re = sear_ent3.group(1).replace("(","(").replace(")",")")
|
|
|
|
|
|
+ ent_re = sear_ent3.group(2)
|
|
elif sear_ent2:
|
|
elif sear_ent2:
|
|
- ent_re = sear_ent2.group(2).replace("(","(").replace(")",")")
|
|
|
|
|
|
+ ent_re = sear_ent2.group(2)
|
|
else:
|
|
else:
|
|
- ent_re = sear_ent.group(1).replace(',', '').replace("(","(").replace(")",")")
|
|
|
|
|
|
+ ent_re = sear_ent.group(1)
|
|
|
|
+ ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
tenderee_notfound = True
|
|
tenderee_notfound = True
|
|
agency_notfound = True
|
|
agency_notfound = True
|
|
ents = []
|
|
ents = []
|
|
@@ -1448,6 +1403,7 @@ class RoleRuleFinalAdd():
|
|
if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
ents[i].label = 1
|
|
ents[i].label = 1
|
|
ents[i].values[1] = 0.5
|
|
ents[i].values[1] = 0.5
|
|
|
|
+ # log('正则最后补充实体: %s'%(ent_re))
|
|
break
|
|
break
|
|
|
|
|
|
elif tenderee_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None:
|
|
elif tenderee_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None:
|
|
@@ -1459,6 +1415,7 @@ class RoleRuleFinalAdd():
|
|
if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
ents[i].label = 0
|
|
ents[i].label = 0
|
|
ents[i].values[0] = 0.5
|
|
ents[i].values[0] = 0.5
|
|
|
|
+ # log('正则最后补充实体: %s'%(ent_re))
|
|
break
|
|
break
|
|
|
|
|
|
|
|
|
|
@@ -1628,6 +1585,16 @@ class ProductPredictor():
|
|
batch_paths = self.decode(scores, lengths, tran_)
|
|
batch_paths = self.decode(scores, lengths, tran_)
|
|
for text, path, length in zip(text_list, batch_paths, lengths):
|
|
for text, path, length in zip(text_list, batch_paths, lengths):
|
|
tags = ''.join([str(it) for it in path[:length]])
|
|
tags = ''.join([str(it) for it in path[:length]])
|
|
|
|
+ for it in re.finditer("12*3", tags):
|
|
|
|
+ start = it.start()
|
|
|
|
+ end = it.end()
|
|
|
|
+ _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % (
|
|
|
|
+ list_articles[0].doc_id, 0, start, end),
|
|
|
|
+ entity_text=text[start:end],
|
|
|
|
+ entity_type="product", sentence_index=0,
|
|
|
|
+ begin_index=0, end_index=0, wordOffset_begin=start,
|
|
|
|
+ wordOffset_end=end)
|
|
|
|
+ list_entitys[0].append(_entity)
|
|
for it in re.finditer("45*6", tags):
|
|
for it in re.finditer("45*6", tags):
|
|
start = it.start()
|
|
start = it.start()
|
|
end = it.end()
|
|
end = it.end()
|