Эх сурвалжийг харах

调整优化角色金额发现的问题及某些特殊网站源特殊处理

lsm 2 жил өмнө
parent
commit
1af098b1a5

+ 5 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -678,7 +678,7 @@ def tableToText(soup):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|标的|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见)"  # 2020/11/23 大网站规则,添加序号为排序
-        entityPattern = "((候选|([中投]标|报价))(单位|公司|人|供应商))"
+        entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
         moneyPattern = "([中投]标|报价)(金额|价)"
         height = len(inner_table)
         width = len(inner_table[0])
@@ -1792,6 +1792,9 @@ def special_treatment(sourceContent, web_source_no):
             ser = re.search('支付金额:', sourceContent)
             if ser:
                 sourceContent = sourceContent.replace('支付金额:', '合同金额:')
+        elif web_source_no=='00811-8':
+            if re.search('是否中标:是', sourceContent) and re.search('排名:\d,', sourceContent):
+                sourceContent = re.sub('排名:\d,', '候选', sourceContent)
         return sourceContent
     except Exception as e:
         log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
@@ -2111,7 +2114,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed_list[1] = attachment_text
             article_processed = "##attachment##".join(article_processed_list)
         '''特别数据源对 预处理后文本 做特别修改'''
-        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2", '00811-8']:
             article_processed = special_treatment(article_processed, web_source_no)
 
         # 提取bidway

+ 2 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -3010,7 +3010,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
         #                       "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
     return result
 
-def correct_rolemoney(prem, total_product_money):
+def correct_rolemoney(prem, total_product_money): # 2022/9/26修改为 中标金额小于表格单价数量合计总金额十分之一时替换
     if total_product_money>0 and len(prem[0]['prem'])==1:
         for value in prem[0]['prem'].values():
             for l in value['roleList']:
@@ -3018,7 +3018,7 @@ def correct_rolemoney(prem, total_product_money):
                     # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
                     #     l[2] = total_product_money
                     #     log('修改中标金额为所有产品总金额')
-                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money:
+                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
                         l["role_money"]['money'] = total_product_money
                         # log('修改中标金额为所有产品总金额')
                 except Exception as e:

+ 22 - 12
BiddingKG/dl/interface/predictor.py

@@ -583,7 +583,7 @@ class PREMPredict():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
-                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                             item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                             data_x.append(item_x)
@@ -684,6 +684,12 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:', text):
                     label = 0
                     values[label] = 0.501
+                elif re.search('[^\w]中标候选人:', text) and re.search('[1一]', text) == None:  #修复第4以上的预测错为中标人
+                    label = 5
+                    values[label] = 0.5
+            elif re.search('是否中标:是,供应商', text) and label == 5:
+                label = 2
+                values[label] = 0.9
             elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                 label = 0
                 values[label] = 0.501
@@ -1150,7 +1156,7 @@ class RoleRulePredictor():
                                         "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
                                         "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
         self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
-        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
@@ -1327,7 +1333,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -1454,7 +1460,7 @@ class RoleRuleFinalAdd():
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
         sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
-        sear_ent2 = re.search('(户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
         sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
         sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
         sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
@@ -1715,10 +1721,10 @@ class RoleGrade():
         self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
         self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
         self.agency_left_9 = "(?P<agency_left_9>代理)"
-        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一])"
-        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(供应商|供货商|候选人))"
-        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名))"
-        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名))"
+        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一]|排名:1)"
+        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商))"
+        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
+        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
                              self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
     def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
@@ -1733,9 +1739,10 @@ class RoleGrade():
         sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
         role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
         for entity in list_entitys[0]:
-            if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.6:
+            if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.5:
                 text = sentences[entity.sentence_index].sentence_text
                 in_att = sentences[entity.sentence_index].in_attachment
+                pre_prob = entity.values[entity.label]
                 b = entity.wordOffset_begin
                 e = entity.wordOffset_end
                 not_found = 1
@@ -1759,6 +1766,8 @@ class RoleGrade():
                         # print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values)
                         if in_att:
                             _prob = _prob - 0.2
+                        if pre_prob < _prob:
+                            _prob = 0.65
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0
                         # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
@@ -1773,7 +1782,7 @@ class MoneyGrade():
     def __init__(self):
         self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
         self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
-        self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同))"
+        self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
         self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
 
         self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]
@@ -2115,6 +2124,7 @@ class ProductAttributesPredictor():
                 continue
             for td in tds:
                 td_text = re.sub('\s', '', td.get_text())
+                td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/') # 修复272144312 # 产品单价数量提取结果有特殊符号\  气动执行装置备件\密封组件\NBR+PT
                 tr_line.append(td_text)
             inner_table.append(tr_line)
         return inner_table
@@ -3932,7 +3942,7 @@ class DistrictPredictor():
         project_name = str(project_name).replace(str(tenderee), '')
         text = "{} {} {}".format(project_name, tenderee, tenderee_address)
         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
-        text = re.sub('复合肥|铁路|公路', ' ', text)
+        text = re.sub('复合肥|铁路|公路|新会计', ' ', text)  #预防提取错 合肥 路南 新会 等地区
         score_l = []
         id_set = set()
 
@@ -3981,7 +3991,7 @@ class DistrictPredictor():
                     w = self.dist_dic[_id]['权重']
                     score = w * 0.2
                     score_l.append([_id, score] + area)
-        area_dic = {'area': '全国', 'province': '未知', 'city': '未知', 'district': '未知'}
+        area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知'}
         if len(score_l) == 0:
             return {'district':area_dic}
         else: