Ver código fonte

优化字典补充实体;新增变更内容、时间;优化角色金额

lsm 5 dias atrás
pai
commit
5a795a54c5

+ 2 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -561,7 +561,8 @@ def match_enterprise_max_first(sentence, business_dic):
                             if fix_name not in business_dic:
                                 have_bus, dic = get_business_data(fix_name) # 20210124 改为有工商数据的实体才添加
                                 if have_bus == False and 'have_business' in dic and re.search('^(上海|云南|内蒙古|北京|吉林|四川|天津|宁夏|安徽|山东|山西|广东|广西|新疆|江苏|江西|河北|河南|浙江|海南|湖北|湖南'
-                                    '|甘肃|福建|西藏|贵州|辽宁|重庆|陕西|青海|黑龙江|\w{1,5}[市县])[\w()]{2,15}[厂店铺市场行部城室馆中心站处社会狱所园关局司署段厅院队小学]$',fix_name): # 无工商数据有前面地址后面有关键词且在字典表的添加
+                                    '|甘肃|福建|西藏|贵州|辽宁|重庆|陕西|青海|黑龙江|\w{1,5}[市县])[\w()]{2,15}[厂店铺市场行部城室馆中心站处社会狱所园关局司署段厅院队小学]$',fix_name) and re.search(
+                                    '某|x|X|^\w{2,3}[分支闵](行|局|院|会|园|中心)$|^\w{2,5}中小学$', fix_name)==None: # 无工商数据有前面地址后面有关键词且在字典表的添加
                                     have_bus = True
                                     log('字典表补充无工商数据有关键词实体:%s'%fix_name)
                                 business_dic[fix_name] = (have_bus, dic)

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -533,7 +533,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-06-10'}
+    version_date = {'version_date': '2025-06-16'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 3 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -5153,10 +5153,10 @@ def confirm_prem(docid, prem, channel_dic, content, is_deposit_project=False, to
                 prem[k]['tendereeMoney'] = total_tendereeMoney
 
     # 采购人:某单位 等去掉tenderee
-    if "Project" in prem and re.search('(招标|采购|招商)(人|商|单位|部门)(信息[,:]?)?(名称)?((甲方))?:某(单位|部)', content):
+    if "Project" in prem and re.search('(招标|采购|招商)(人|商|单位|部门)(信息[,:]?)?(名称)?((甲方))?:[\u4e00-\u9fa5]{,3}+(单位|部)', content):
         for d in prem['Project']['roleList']:
-            if d['role_name'] == 'tenderee' and d.get('role_prob', 0)<0.8:
-                log('规则去除文中包含“采购人:某单位”等概率小于0.8的招标人:%s,docid:%s'%(d.get('role_text', ''), docid))
+            if d['role_name'] == 'tenderee' and d.get('role_prob', 0)<0.7: # 581195772 小于0.7才删除,附件明确表达可能0.7多
+                log('规则去除文中包含“采购人:某单位”等概率小于0.7的招标人:%s,docid:%s'%(d.get('role_text', ''), docid))
                 prem['Project']['roleList'].remove(d)
 
 def add_package_name(prem, list_entity, product_list, name):

+ 5 - 6
BiddingKG/dl/interface/html_2_kvtree.py

@@ -1578,6 +1578,11 @@ class Html2KVTree():
             traceback.print_exc()
             return result_kv
         result_kv = []
+        if from_outline:
+            result_kv_outline = self.extract_kvs_from_outline([k_pattern])
+            for _d in result_kv_outline[0]:
+                _d["from_outline"] = True
+            result_kv.extend(result_kv_outline[0])
         if from_table:
             result_kv_table = self.extract_kvs_from_table([k_pattern])
             for table_d in result_kv_table[0]:
@@ -1591,12 +1596,6 @@ class Html2KVTree():
             for _d in result_kv_sentence[0]:
                 _d["from_sentence"] = True
             result_kv.extend(result_kv_sentence[0])
-        if from_outline:
-            result_kv_outline = self.extract_kvs_from_outline([k_pattern])
-            for _d in result_kv_outline[0]:
-                _d["from_outline"] = True
-            result_kv.extend(result_kv_outline[0])
-
         return result_kv
 
     # def extract_kvs_from_table(self,list_pattern):

+ 27 - 8
BiddingKG/dl/interface/kvtree_search.py

@@ -6,6 +6,7 @@
 @time: 2024/12/26 10:31
 """
 from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
+from BiddingKG.dl.common.Utils import timeFormat
 import re
 
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
@@ -17,11 +18,19 @@ pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)"
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
 
-pattern_dic_single = {'requirement': requirement_pattern,
-               'aptitude': aptitude_pattern,
-               'pinmu_name': pinmu_name_pattern}
-pattern_dic_addr = {'addr_bidopen': addr_bidopen_pattern,
-                    'addr_bidsend': addr_bidsend_pattern}
+change_content_pattern = "(变更|更正)(事项[与及和、])?(信息|内容|说明|事项)|现做出如下变更|变更内容如下"
+change_time_pattern = "(变更|更正)日期"
+pattern_dic_single = {
+    'requirement': requirement_pattern,
+    'aptitude': aptitude_pattern,
+    'pinmu_name': pinmu_name_pattern,
+    'change_content': change_content_pattern,
+    'change_time': change_time_pattern
+}
+pattern_dic_addr = {
+    'addr_bidopen': addr_bidopen_pattern,
+    'addr_bidsend': addr_bidsend_pattern
+}
 
 def get_kvtree_value(html):
     '''
@@ -44,8 +53,15 @@ def get_kvtree_value(html):
             if d.get('value', '').strip() != '':
                 value = d['value'].strip()
                 break
-        if value != '' and re.search('[\u4e00-\u9fa5]{2,}', value): # 包含两个中文以上的才要
-            kv_single_dic[k] = value
+        if value != '':
+            if 'time' in k:
+                value = timeFormat(value)
+                if value != '':
+                    kv_single_dic[k] = value
+            elif re.search('[\u4e00-\u9fa5]{2,}', value): # 包含两个中文以上的才要
+                if k == 'change_content':
+                    value = re.sub('\s', '', value)[:200] # 变更内容去掉空格并限制200字
+                kv_single_dic[k] = value
     for k, v in pattern_dic_addr.items():
         kv_l = _pd.extract_kv(v)
         value = ''
@@ -63,4 +79,7 @@ if __name__ == "__main__":
     with open('d:/html/2.html', encoding='utf-8') as f:
         html = f.read()
         rs = get_kvtree_value(html)
-        print(rs)
+        # print(rs)
+        d1, d2 = rs
+        for k, v in d1.items():
+            print(k, v)

+ 8 - 6
BiddingKG/dl/interface/predictor.py

@@ -812,7 +812,7 @@ class PREMPredict():
                             text_sen = sentence.sentence_text
                             b = entity.wordOffset_begin
                             e = entity.wordOffset_end
-                            text_list.append((text_sen[max(0, b - 13):b], text_sen[b:e], text_sen[e:e + 10]))
+                            text_list.append((text_sen[max(0, b - 13):b], text_sen[b:e], text_sen[e:e + 15]))
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                             #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                             item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
@@ -947,7 +947,7 @@ class PREMPredict():
                 elif re.search('^为\w{,10}第二(成交|中标)单位', behind): # 中标预测错误,例:601143888 河南省创慧新材料科技有限公司为铸咀采购项目第二成交单位
                     label = 3
                     values[3] = 0.5
-                elif re.search('中标单位,$', front):
+                elif re.search('中标单位,$|被确定为$', front): # 632523961 现通知:贵司被确定为广州地铁传媒有限公司贵阳地铁广告媒体服务项目(2025年)的执行单位。
                     label = 5
                 elif re.search('^为预备中标单位', behind):
                     label = 3
@@ -1053,7 +1053,7 @@ class PREMPredict():
                     values[label] = 0.5
                 elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
                     values[label] = 0.49
-                elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+', behind):
+                elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+|服务招标费率|招标代理服务收费', behind[:20]): # 修复 628951835
                     values[label] = 0.49
                 elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
                     values[label] = 0.49
@@ -1531,7 +1531,7 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
 
-        self.candidate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位|候选企业)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为【]+$)"
+        self.candidate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)?候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位|候选企业)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为【]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1637,7 +1637,7 @@ class RoleRulePredictor():
             if _label == 5:
                 _label, _prob, keyword = self.ser_role(self.pattern_right, after, entity_text) # 后文匹配
                 keyword = "right_" + keyword if keyword!="" else keyword
-        if _label==5 and re.search('(中标|中选|成交)?)(结果)?(公告|公示|通知书?),', before) and re.match(':', after):
+        if _label==5 and re.search('(中标|中选|成交)?|谈判结果)(结果)?(公告|公示|通知书?),', before) and re.match(':', after): # 632523961 直接谈判结果通知书,广东金钥匙智能包装科技有限公司:经广州地铁传媒有限公司
             _label = 2
             _prob = 0.5
         _flag = False if _label==5 else True
@@ -1939,6 +1939,8 @@ class RoleRulePredictor():
                                         break
                                     elif re.search('合同价暂定为?$', _span[0]): # 20250310 修复 598504921 合同价暂定 为招标金额
                                         break
+                                    elif re.search('^(以[上下])?按[\d.%]+收取|^及?以[上下]|^[()]?[+×*-][\d.%]+|服务招标费率|招标代理服务收费', _span[2][:20]):
+                                        break
                                     if re.search(self.pattern_money_other, _span[0]) is not None:
                                         if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                 re.search(self.pattern_money_other, _span[0]).span()[1]:
@@ -8684,7 +8686,7 @@ class EntityTypeRulePredictor():
                     if k == 'addr_contact' and re.search('中标|成交|中选|代理|供应商', sentance_text[max(0, b-12):]):
                         continue
                     v = v.replace('[:为]', '')
-                    if re.search(v, sentance_text[max(0, b-10): b]) and len(entity.entity_text)>2:
+                    if re.search(v, sentance_text[max(0, b-10): b]) and len(entity.entity_text)>=2: # 修复 621226337 项目所在地:重庆, 漏提
                         addr_dic[k] = entity.entity_text
             elif entity.entity_type == 'time':
                 b = entity.wordOffset_begin