|
@@ -583,7 +583,7 @@ class PREMPredict():
|
|
|
while(p_sentences<len(list_sentence)):
|
|
|
sentence = list_sentence[p_sentences]
|
|
|
if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
|
|
|
- text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
|
|
|
+ text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
|
|
|
#item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
|
|
|
item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
|
|
|
data_x.append(item_x)
|
|
@@ -684,6 +684,12 @@ class PREMPredict():
|
|
|
elif re.search('尊敬的供应商:', text):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
+ elif re.search('[^\w]中标候选人:', text) and re.search('[1一]', text) == None: #修复第4以上的预测错为中标人
|
|
|
+ label = 5
|
|
|
+ values[label] = 0.5
|
|
|
+ elif re.search('是否中标:是,供应商', text) and label == 5:
|
|
|
+ label = 2
|
|
|
+ values[label] = 0.9
|
|
|
elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
@@ -1150,7 +1156,7 @@ class RoleRulePredictor():
|
|
|
"(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
|
|
|
"单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
- self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
|
|
|
+ self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
|
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
@@ -1327,7 +1333,7 @@ class RoleRulePredictor():
|
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
+ if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
_flag = True
|
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
@@ -1454,7 +1460,7 @@ class RoleRuleFinalAdd():
|
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
|
- sear_ent2 = re.search('(户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
|
sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
|
|
@@ -1715,10 +1721,10 @@ class RoleGrade():
|
|
|
self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
|
|
|
self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
|
|
|
self.agency_left_9 = "(?P<agency_left_9>代理)"
|
|
|
- self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一])"
|
|
|
- self.winTenderer_left_8 = "(?P<winTenderer_left_8>(供应商|供货商|候选人))"
|
|
|
- self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名))"
|
|
|
- self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名))"
|
|
|
+ self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一]|排名:1)"
|
|
|
+ self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商))"
|
|
|
+ self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
|
|
|
+ self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
|
|
|
self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
|
|
|
self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
|
|
|
def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
|
|
@@ -1733,9 +1739,10 @@ class RoleGrade():
|
|
|
sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
|
|
|
role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
|
|
|
for entity in list_entitys[0]:
|
|
|
- if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.6:
|
|
|
+ if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.5:
|
|
|
text = sentences[entity.sentence_index].sentence_text
|
|
|
in_att = sentences[entity.sentence_index].in_attachment
|
|
|
+ pre_prob = entity.values[entity.label]
|
|
|
b = entity.wordOffset_begin
|
|
|
e = entity.wordOffset_end
|
|
|
not_found = 1
|
|
@@ -1759,6 +1766,8 @@ class RoleGrade():
|
|
|
# print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values)
|
|
|
if in_att:
|
|
|
_prob = _prob - 0.2
|
|
|
+ if pre_prob < _prob:
|
|
|
+ _prob = 0.65
|
|
|
entity.values[_label] = _prob + entity.values[_label] / 20
|
|
|
not_found = 0
|
|
|
# print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
|
|
@@ -1773,7 +1782,7 @@ class MoneyGrade():
|
|
|
def __init__(self):
|
|
|
self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
|
|
|
self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
|
|
|
- self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同))"
|
|
|
+ self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
|
|
|
self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
|
|
|
|
|
|
self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]
|
|
@@ -2115,6 +2124,7 @@ class ProductAttributesPredictor():
|
|
|
continue
|
|
|
for td in tds:
|
|
|
td_text = re.sub('\s', '', td.get_text())
|
|
|
+ td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/') # 修复272144312 # 产品单价数量提取结果有特殊符号\ 气动执行装置备件\密封组件\NBR+PT
|
|
|
tr_line.append(td_text)
|
|
|
inner_table.append(tr_line)
|
|
|
return inner_table
|
|
@@ -3932,7 +3942,7 @@ class DistrictPredictor():
|
|
|
project_name = str(project_name).replace(str(tenderee), '')
|
|
|
text = "{} {} {}".format(project_name, tenderee, tenderee_address)
|
|
|
web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
|
- text = re.sub('复合肥|铁路|公路', ' ', text)
|
|
|
+ text = re.sub('复合肥|铁路|公路|新会计', ' ', text) #预防提取错 合肥 路南 新会 等地区
|
|
|
score_l = []
|
|
|
id_set = set()
|
|
|
|
|
@@ -3981,7 +3991,7 @@ class DistrictPredictor():
|
|
|
w = self.dist_dic[_id]['权重']
|
|
|
score = w * 0.2
|
|
|
score_l.append([_id, score] + area)
|
|
|
- area_dic = {'area': '全国', 'province': '未知', 'city': '未知', 'district': '未知'}
|
|
|
+ area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知'}
|
|
|
if len(score_l) == 0:
|
|
|
return {'district':area_dic}
|
|
|
else:
|