5 days ago · 7a81edd365
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -13,6 +13,7 @@ from BiddingKG.dl.common.Utils import *
 
				 from BiddingKG.dl.interface.Entitys import *
			
 
				 import json
			
 
				 from BiddingKG.dl.common.constDict import ConstDict
			
 
				+# from BiddingKG.dl.interface.classification_process import entity_classify_process
			
 
				 
			
 
				 def edit_distance(source,target):
			
 
				     dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
			
@@ -324,6 +325,10 @@ def get_nlp_enterprise(list_entity):
 
				             else:
			
 
				                 if entity.entity_text not in nlp_enterprise_attachment:
			
 
				                     nlp_enterprise_attachment.append(entity.entity_text)
			
 
				+    # for enterprise,value in dict_enterprise.items():
			
 
				+    #     enterprise_class = entity_classify_process(enterprise)
			
 
				+    #     _class = [{"first_level":key.split("-")[0],"second_level":key.split("-")[1]} for key in enterprise_class]
			
 
				+    #     value['class'] = _class
			
 
				     return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise
			
 
				 
			
 
				 ENTERPRISE_HUGE = None
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -502,7 +502,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(doc_id, web_source_no, web_source_name, prem)
			
 
				 
			
 
				     '''根据关键词表生成项目标签'''
			
 
				-    project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
			
 
				+    project_label, main_content_text = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem,all_text=list_articles[0].content)
			
 
				     # 额外需求的标签
			
 
				     project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
			
 
				     # print(project_label)
			
@@ -584,6 +584,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				         getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
			
 
				 
			
 
				     data_res["project_label"] = project_label
			
 
				+    data_res["main_content_text"] = main_content_text
			
 
				     data_res["industry_label"] = industry_label
			
 
				     data_res["property_label"] = property_label
			
 
				     data_res["doctitle_refine"] = doctitle_refine
			
@@ -616,7 +617,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     text_main, text_attn = 0, 0
			
 
				     for sentence in list_sentences[0]:
			
 
				         if sentence.in_attachment:
			
 
				-            text_attn += len(sentence.sentence_text)
			
 
				+            text_attn += len(re.sub("##attachment##[，。]?","",sentence.sentence_text))
			
 
				         else:
			
 
				             text_main += len(sentence.sentence_text)
			
 
				     data_res['word_count'] = {'正文': text_main, '附件': text_attn}
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -3505,17 +3505,29 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
 
				     # time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
			
 
				     new_time_entitys = []
			
 
				     year_list = []
			
 
				-    if page_time:
			
 
				-        year_list.append(page_time[:4])
			
 
				     for _entity in time_entitys:
			
 
				         _time_list,_year = my_timeFormat(_entity.entity_text,page_time)
			
 
				+        _in_attachment = _entity.in_attachment
			
 
				         if _time_list:
			
 
				             new_time_entitys.append([_entity,_time_list,_year])
			
 
				-            year_list.append(_year)
			
 
				+            year_list.append([_year,_in_attachment])
			
 
				+    get_all_time = False if False in [i[1] for i in year_list] else True
			
 
				+    if page_time:
			
 
				+        current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
			
 
				+        year_list.append([current_year,False])
			
 
				+    else:
			
 
				+        current_year = time.strftime("%Y",time.localtime())
			
 
				+    if get_all_time:
			
 
				+        year_list = [i[0] for i in year_list]
			
 
				+    else:
			
 
				+        year_list = [i[0] for i in year_list if not i[1]]
			
 
				     year_list = [(y,year_list.count(y)) for y in year_list if y[:2]=='20']
			
 
				     year_list.sort(key=lambda x:x[1],reverse=True)
			
 
				-    most_year = year_list[0][0]
			
 
				-    time_entitys = [item for item in new_time_entitys if int(item[2])-int(most_year)<=10 and int(item[2])-int(most_year)>=-1]
			
 
				+    most_year = year_list[0][0] if year_list else ""
			
 
				+    if most_year:
			
 
				+        time_entitys = [item for item in new_time_entitys if int(item[2])-int(most_year)<=10 and int(item[2])-int(most_year)>=-1]
			
 
				+    else:
			
 
				+        time_entitys = new_time_entitys
			
 
				 
			
 
				     # print(time_entitys)
			
 
				     for entity_idx in range(len(time_entitys)):
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -5016,7 +5016,55 @@ class ProjectLabel():
 
				                 key_word_list.append((key_wrod, key_wrod2, search_type, info_type_list))
			
 
				         return key_word_list
			
 
				 
			
 
				-    def predict(self, doctitle,product,project_name,prem):
			
 
    def extract_core_text(self, all_text, tenderee="", agency="", max_chinese_chars=45):
        """Return the text that follows the first locator keyword in the document.

        The tenderee and agency names are replaced with a space first. The
        earliest occurrence of any locator keyword is then found, and the text
        after it is collected until ``max_chinese_chars`` Chinese characters
        have been taken. Non-Chinese characters encountered on the way are
        kept and do not count toward the limit.

        Args:
            all_text: Full document text to search. ``None`` or "" yields "".
            tenderee: Tenderee name to blank out before searching.
            agency: Agency name to blank out before searching.
            max_chinese_chars: Maximum number of Chinese characters to keep.

        Returns:
            The whitespace-stripped snippet after the keyword, or "" when the
            text is empty or contains none of the keywords.
        """
        if not all_text:
            return ""

        # Blank out the tenderee / agency names (tenderee first, then agency).
        for org_name in (tenderee, agency):
            if org_name:
                all_text = all_text.replace(org_name, " ")

        # Locator keywords that usually introduce the subject of the notice.
        keywords = [
            '项目名称', '工程名称', '采购名称', '标段名称', '项目的名称', '设备名称', '申购主题',
            '申购单主题', '标的', '商品名称', '二级目录', '招标内容', '项目内容', '商品清单',
            '标的名称', '采购内容', '集成要求', '概况介绍', '品目分类', '招标范围', '采购范围',
            '项目采购分', '采购合同', '招标合同', '产品名称', '服务内容', '采购品目名称', '货物名称',
            '采购需求概况', '项目概况', '采购条目名称', '物资名称', '物料名称', '建设规模',
            '建设内容', '采购项目概况', '采购包名称', '物料描述', '商品信息', '服务品目', '标项名称',
            '规格描述', '采购标的', '服务名称', '采购单名称', '明细信息', '需求详情',
        ]

        # Regex alternation stops at the first alternative that matches at a
        # position, so a short keyword listed before a longer one it prefixes
        # would shadow it. Trying the longest keywords first avoids that.
        alternatives = sorted(keywords, key=len, reverse=True)
        pattern = '|'.join(re.escape(kw) for kw in alternatives)

        # Only the earliest keyword occurrence matters.
        first_match = re.search(pattern, all_text)
        if first_match is None:
            return ""

        kept_chars = []
        chinese_seen = 0
        for char in all_text[first_match.end():]:
            # CJK Unified Ideographs range counts as "Chinese character".
            if '\u4e00' <= char <= '\u9fff':
                chinese_seen += 1
                if chinese_seen > max_chinese_chars:
                    break
            kept_chars.append(char)

        return ''.join(kept_chars).strip()
			
 
				+
			
 
				+    def predict(self, doctitle,product,project_name,prem,all_text):
			
 
				 
			
 
				         doctitle = doctitle if doctitle else ""
			
 
				         product = product if product else ""
			
@@ -5038,8 +5086,11 @@ class ProjectLabel():
 
				             # print('解析prem 获取招标人、代理人出错')
			
 
				             pass
			
 
				         sub_project_names = ";".join(sub_project_names)
			
 
				+        main_content_text = self.extract_core_text(all_text,tenderee,agency)
			
 
				         # 核心字段：标题+产品词+项目名称+标段名称
			
 
				-        main_text = "，".join([doctitle, product, project_name, sub_project_names])
			
 
				+        # main_text = "，".join([doctitle, product, project_name, sub_project_names])
			
 
				+        # 核心字段：标题+项目名称+产品词+正文定位词后45个字
			
 
				+        main_text = "，".join([doctitle, project_name, product, main_content_text])
			
 
				         # 剔除 招标单位、代理机构名称
			
 
				         if tenderee:
			
 
				             doctitle = doctitle.replace(tenderee, " ")
			
@@ -5107,7 +5158,7 @@ class ProjectLabel():
 
				             for item in main_text_labels[10:]:
			
 
				                 main_text_dict.pop(item[0])
			
 
				 
			
 
				-        return {"标题":doctitle_dict,"核心字段":main_text_dict}
			
 
				+        return {"标题":doctitle_dict,"核心字段":main_text_dict},main_content_text
			
 
				 
			
 
				     def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):
			
 
				         # doctextcon 取正文内容
			
@@ -5157,11 +5208,47 @@ class ProjectLabel():
 
				 
			
 
				         return project_label
			
 
				 
			
 
				+
			
 
				+# from BiddingKG.dl.interface.classification_process import product_classify_process
			
 
				 # 行业标签
			
 
				 class IndustryLabel():
			
 
				 
			
 
				     def __init__(self):
			
 
				         self.keyword_list = self.get_label_keywords()
			
 
				+        pass
			
 
				+
			
 
				+    # def predict(self,doctitle,article,product,prem):
			
 
				+    #     doctitle = doctitle if doctitle else ""
			
 
				+    #     product = product if product else ""
			
 
				+    #     product = ",".join(set(product.split(','))) # 产品词去重
			
 
				+    #     all_text = article.content
			
 
				+    #     all_text = re.sub('\s+', ' ', all_text)
			
 
				+    #     tenderee = ""
			
 
				+    #     agency = ""
			
 
				+    #     try:
			
 
				+    #         for k,v in prem[0]['prem'].items():
			
 
				+    #             for link in v['roleList']:
			
 
				+    #                 if link['role_name'] == 'tenderee' and tenderee == "":
			
 
				+    #                     tenderee = link['role_text']
			
 
				+    #                 if link['role_name'] == 'agency' and agency == "":
			
 
				+    #                     agency = link['role_text']
			
 
				+    #     except Exception as e:
			
 
				+    #         # print('解析prem 获取招标人、代理人出错')
			
 
				+    #         pass
			
 
				+    #     # 剔除 招标单位、代理机构名称
			
 
				+    #     if tenderee:
			
 
				+    #         doctitle = doctitle.replace(tenderee, " ")
			
 
				+    #         all_text = all_text.replace(tenderee, " ")
			
 
				+    #     if agency:
			
 
				+    #         doctitle = doctitle.replace(agency, " ")
			
 
				+    #         all_text = all_text.replace(agency, " ")
			
 
				+    #
			
 
				+    #     category_1, category_2, category_3, matched_keywords, rule_id = product_classify_process(doctitle, all_text, product)
			
 
				+    #     print(category_1, category_2, category_3, matched_keywords, rule_id)
			
 
				+    #     if category_2=="标题排除":
			
 
				+    #         category_1 = "其他"
			
 
				+    #         category_2 = ""
			
 
				+    #     return {"first_level":category_1,"second_level":category_2}
			
 
				 
			
 
				     def get_label_keywords(self):
			
 
				         import csv