Browse Source

project_label提取规则修改,新增main_content_text(正文定位词后45个字)

znj 5 days ago
parent
commit
7a81edd365

+ 5 - 0
BiddingKG/dl/entityLink/entityLink.py

@@ -13,6 +13,7 @@ from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import *
 import json
 from BiddingKG.dl.common.constDict import ConstDict
+# from BiddingKG.dl.interface.classification_process import entity_classify_process
 
 def edit_distance(source,target):
     dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
@@ -324,6 +325,10 @@ def get_nlp_enterprise(list_entity):
             else:
                 if entity.entity_text not in nlp_enterprise_attachment:
                     nlp_enterprise_attachment.append(entity.entity_text)
+    # for enterprise,value in dict_enterprise.items():
+    #     enterprise_class = entity_classify_process(enterprise)
+    #     _class = [{"first_level":key.split("-")[0],"second_level":key.split("-")[1]} for key in enterprise_class]
+    #     value['class'] = _class
     return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise
 
 ENTERPRISE_HUGE = None

+ 3 - 2
BiddingKG/dl/interface/extract.py

@@ -502,7 +502,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(doc_id, web_source_no, web_source_name, prem)
 
     '''根据关键词表生成项目标签'''
-    project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
+    project_label, main_content_text = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem,all_text=list_articles[0].content)
     # 额外需求的标签
     project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
     # print(project_label)
@@ -584,6 +584,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
 
     data_res["project_label"] = project_label
+    data_res["main_content_text"] = main_content_text
     data_res["industry_label"] = industry_label
     data_res["property_label"] = property_label
     data_res["doctitle_refine"] = doctitle_refine
@@ -616,7 +617,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     text_main, text_attn = 0, 0
     for sentence in list_sentences[0]:
         if sentence.in_attachment:
-            text_attn += len(sentence.sentence_text)
+            text_attn += len(re.sub("##attachment##[,。]?","",sentence.sentence_text))
         else:
             text_main += len(sentence.sentence_text)
     data_res['word_count'] = {'正文': text_main, '附件': text_attn}

+ 17 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -3505,17 +3505,29 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
     # time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
     new_time_entitys = []
     year_list = []
-    if page_time:
-        year_list.append(page_time[:4])
     for _entity in time_entitys:
         _time_list,_year = my_timeFormat(_entity.entity_text,page_time)
+        _in_attachment = _entity.in_attachment
         if _time_list:
             new_time_entitys.append([_entity,_time_list,_year])
-            year_list.append(_year)
+            year_list.append([_year,_in_attachment])
+    get_all_time = False if False in [i[1] for i in year_list] else True
+    if page_time:
+        current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
+        year_list.append([current_year,False])
+    else:
+        current_year = time.strftime("%Y",time.localtime())
+    if get_all_time:
+        year_list = [i[0] for i in year_list]
+    else:
+        year_list = [i[0] for i in year_list if not i[1]]
     year_list = [(y,year_list.count(y)) for y in year_list if y[:2]=='20']
     year_list.sort(key=lambda x:x[1],reverse=True)
-    most_year = year_list[0][0]
-    time_entitys = [item for item in new_time_entitys if int(item[2])-int(most_year)<=10 and int(item[2])-int(most_year)>=-1]
+    most_year = year_list[0][0] if year_list else ""
+    if most_year:
+        time_entitys = [item for item in new_time_entitys if int(item[2])-int(most_year)<=10 and int(item[2])-int(most_year)>=-1]
+    else:
+        time_entitys = new_time_entitys
 
     # print(time_entitys)
     for entity_idx in range(len(time_entitys)):

+ 90 - 3
BiddingKG/dl/interface/predictor.py

@@ -5016,7 +5016,55 @@ class ProjectLabel():
                 key_word_list.append((key_wrod, key_wrod2, search_type, info_type_list))
         return key_word_list
 
-    def predict(self, doctitle,product,project_name,prem):
+    def extract_core_text(self,all_text,tenderee="",agency=""):
+        """Return the text that follows the first locator keyword in ``all_text``.
+
+        ``tenderee`` and ``agency``, when non-empty, are first replaced by a space.
+        The result starts right after the earliest keyword and stops before the
+        46th CJK character (U+4E00..U+9FFF); other characters in between are kept.
+        Returns "" when ``all_text`` is empty/None or contains no keyword.
+        """
+        if not all_text:
+            return ""  # nothing to search (also guards against None)
+
+        # Remove the tenderee / agency names before locating the core text.
+        if tenderee:
+            all_text = all_text.replace(tenderee, " ")
+        if agency:
+            all_text = all_text.replace(agency, " ")
+
+        # Locator keywords; the core text is whatever follows the first one found.
+        keywords = [
+            '项目名称', '工程名称', '采购名称', '标段名称', '项目的名称', '设备名称', '申购主题',
+            '申购单主题', '标的', '商品名称', '二级目录', '招标内容', '项目内容', '商品清单',
+            '标的名称', '采购内容', '集成要求', '概况介绍', '品目分类', '招标范围', '采购范围',
+            '项目采购分', '采购合同', '招标合同','产品名称','服务内容','采购品目名称','货物名称',
+            '采购需求概况','项目概况','招标范围','采购条目名称','物资名称','物料名称','建设规模',
+            '建设内容','采购项目概况','采购包名称','物料描述','商品信息','服务品目','标项名称',
+            '规格描述','采购标的','服务名称','采购单名称','明细信息','申购主题','需求详情'
+        ]
+
+        # Try longer keywords first: with plain list order '标的' shadowed '标的名称'
+        # and left a stray '名称' at the head of the extracted text.
+        ordered = sorted(set(keywords), key=lambda kw: (-len(kw), kw))
+        pattern = '|'.join(re.escape(kw) for kw in ordered)
+
+        # Only the earliest hit is needed, so search instead of collecting all hits.
+        first_match = re.search(pattern, all_text)
+        if first_match is None:
+            return ""
+
+        kept_chars = []
+        count = 0
+        for char in all_text[first_match.end():]:
+            if '\u4e00' <= char <= '\u9fff':  # CJK unified ideograph
+                count += 1
+                if count > 45:
+                    break
+            kept_chars.append(char)
+        return ''.join(kept_chars).strip()
+
+    def predict(self, doctitle,product,project_name,prem,all_text):
 
         doctitle = doctitle if doctitle else ""
         product = product if product else ""
@@ -5038,8 +5086,11 @@ class ProjectLabel():
             # print('解析prem 获取招标人、代理人出错')
             pass
         sub_project_names = ";".join(sub_project_names)
+        main_content_text = self.extract_core_text(all_text,tenderee,agency)
         # 核心字段:标题+产品词+项目名称+标段名称
-        main_text = ",".join([doctitle, product, project_name, sub_project_names])
+        # main_text = ",".join([doctitle, product, project_name, sub_project_names])
+        # 核心字段:标题+项目名称+产品词+正文定位词后45个字
+        main_text = ",".join([doctitle, project_name, product, main_content_text])
         # 剔除 招标单位、代理机构名称
         if tenderee:
             doctitle = doctitle.replace(tenderee, " ")
@@ -5107,7 +5158,7 @@ class ProjectLabel():
             for item in main_text_labels[10:]:
                 main_text_dict.pop(item[0])
 
-        return {"标题":doctitle_dict,"核心字段":main_text_dict}
+        return {"标题":doctitle_dict,"核心字段":main_text_dict},main_content_text
 
     def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):
         # doctextcon 取正文内容
@@ -5157,11 +5208,47 @@ class ProjectLabel():
 
         return project_label
 
+
+# from BiddingKG.dl.interface.classification_process import product_classify_process
 # 行业标签
 class IndustryLabel():
 
     def __init__(self):
         self.keyword_list = self.get_label_keywords()
+        pass
+
+    # def predict(self,doctitle,article,product,prem):
+    #     doctitle = doctitle if doctitle else ""
+    #     product = product if product else ""
+    #     product = ",".join(set(product.split(','))) # 产品词去重
+    #     all_text = article.content
+    #     all_text = re.sub('\s+', ' ', all_text)
+    #     tenderee = ""
+    #     agency = ""
+    #     try:
+    #         for k,v in prem[0]['prem'].items():
+    #             for link in v['roleList']:
+    #                 if link['role_name'] == 'tenderee' and tenderee == "":
+    #                     tenderee = link['role_text']
+    #                 if link['role_name'] == 'agency' and agency == "":
+    #                     agency = link['role_text']
+    #     except Exception as e:
+    #         # print('解析prem 获取招标人、代理人出错')
+    #         pass
+    #     # 剔除 招标单位、代理机构名称
+    #     if tenderee:
+    #         doctitle = doctitle.replace(tenderee, " ")
+    #         all_text = all_text.replace(tenderee, " ")
+    #     if agency:
+    #         doctitle = doctitle.replace(agency, " ")
+    #         all_text = all_text.replace(agency, " ")
+    #
+    #     category_1, category_2, category_3, matched_keywords, rule_id = product_classify_process(doctitle, all_text, product)
+    #     print(category_1, category_2, category_3, matched_keywords, rule_id)
+    #     if category_2=="标题排除":
+    #         category_1 = "其他"
+    #         category_2 = ""
+    #     return {"first_level":category_1,"second_level":category_2}
 
     def get_label_keywords(self):
         import csv