2 Commits 2805f0eb29 ... 6b0f539095

Tác giả SHA1 Thông báo Ngày
  znj 6b0f539095 Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION 1 tháng trước
  znj 7a81edd365 project_label提取规则修改,新增main_content_text(正文定位词后45个字) 1 tháng trước

+ 5 - 0
BiddingKG/dl/entityLink/entityLink.py

@@ -13,6 +13,7 @@ from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import *
 import json
 from BiddingKG.dl.common.constDict import ConstDict
+# from BiddingKG.dl.interface.classification_process import entity_classify_process
 
 def edit_distance(source,target):
     dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
@@ -324,6 +325,10 @@ def get_nlp_enterprise(list_entity):
             else:
                 if entity.entity_text not in nlp_enterprise_attachment:
                     nlp_enterprise_attachment.append(entity.entity_text)
+    # for enterprise,value in dict_enterprise.items():
+    #     enterprise_class = entity_classify_process(enterprise)
+    #     _class = [{"first_level":key.split("-")[0],"second_level":key.split("-")[1]} for key in enterprise_class]
+    #     value['class'] = _class
     return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise
 
 ENTERPRISE_HUGE = None

+ 2 - 1
BiddingKG/dl/interface/extract.py

@@ -502,7 +502,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(doc_id, web_source_no, web_source_name, prem)
 
     '''根据关键词表生成项目标签'''
-    project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
+    project_label, main_content_text = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem,all_text=list_articles[0].content)
     # 额外需求的标签
     project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
     # print(project_label)
@@ -584,6 +584,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
 
     data_res["project_label"] = project_label
+    data_res["main_content_text"] = main_content_text
     data_res["industry_label"] = industry_label
     data_res["property_label"] = property_label
     data_res["doctitle_refine"] = doctitle_refine

+ 90 - 3
BiddingKG/dl/interface/predictor.py

@@ -5033,7 +5033,55 @@ class ProjectLabel():
                 key_word_list.append((key_wrod, key_wrod2, search_type, info_type_list))
         return key_word_list
 
-    def predict(self, doctitle,product,project_name,prem):
+    def extract_core_text(self,all_text,tenderee="",agency="",max_chinese_chars=45):
+        """Return the text that follows the first locator keyword in the body.
+
+        The tenderee and agency names are blanked out first, then the earliest
+        occurrence of any locator keyword (e.g. '项目名称', '采购内容') is
+        located.  Everything after that keyword is collected until the
+        (max_chinese_chars + 1)-th CJK ideograph is reached; non-CJK characters
+        met on the way are kept but do not count towards the limit.
+
+        :param all_text: full document text; ``None`` or empty yields ``""``.
+        :param tenderee: tenderee name to strip from the text (optional).
+        :param agency: agency name to strip from the text (optional).
+        :param max_chinese_chars: maximum number of CJK ideographs
+            (U+4E00..U+9FFF) to keep after the keyword.
+        :return: the extracted snippet with surrounding whitespace stripped,
+            or ``""`` when no keyword occurs in the text.
+        """
+        # Nothing to search in: avoid calling .replace() on None.
+        if not all_text:
+            return ""
+
+        # Blank out the tenderee / agency names so they never end up in the snippet.
+        for org_name in (tenderee, agency):
+            if org_name:
+                all_text = all_text.replace(org_name, " ")
+
+        # Locator keywords that usually introduce the subject of the notice.
+        keywords = [
+            '项目名称', '工程名称', '采购名称', '标段名称', '项目的名称', '设备名称', '申购主题',
+            '申购单主题', '标的', '商品名称', '二级目录', '招标内容', '项目内容', '商品清单',
+            '标的名称', '采购内容', '集成要求', '概况介绍', '品目分类', '招标范围', '采购范围',
+            '项目采购分', '采购合同', '招标合同','产品名称','服务内容','采购品目名称','货物名称',
+            '采购需求概况','项目概况','采购条目名称','物资名称','物料名称','建设规模',
+            '建设内容','采购项目概况','采购包名称','物料描述','商品信息','服务品目','标项名称',
+            '规格描述','采购标的','服务名称','采购单名称','明细信息','需求详情'
+        ]
+
+        # Regex alternation is tried left to right, so a short keyword listed
+        # before a longer one sharing its prefix ('标的' vs '标的名称') would
+        # win and leave the tail ('名称') in the snippet.  Longest-first avoids that.
+        ordered_keywords = sorted(dict.fromkeys(keywords), key=len, reverse=True)
+        pattern = r'(' + '|'.join(re.escape(kw) for kw in ordered_keywords) + r')'
+
+        # Only the first (leftmost) hit is needed.
+        first_match = re.search(pattern, all_text)
+        if first_match is None:
+            return ""  # no locator keyword in the text
+
+        # Collect what follows the keyword, stopping before the ideograph
+        # that would exceed the limit.
+        kept_chars = []
+        ideograph_count = 0
+        for char in all_text[first_match.end():]:
+            if '\u4e00' <= char <= '\u9fff':
+                ideograph_count += 1
+                if ideograph_count > max_chinese_chars:
+                    break
+            kept_chars.append(char)
+
+        return ''.join(kept_chars).strip()
+
+    def predict(self, doctitle,product,project_name,prem,all_text):
 
         doctitle = doctitle if doctitle else ""
         product = product if product else ""
@@ -5055,8 +5103,11 @@ class ProjectLabel():
             # print('解析prem 获取招标人、代理人出错')
             pass
         sub_project_names = ";".join(sub_project_names)
+        main_content_text = self.extract_core_text(all_text,tenderee,agency)
         # 核心字段:标题+产品词+项目名称+标段名称
-        main_text = ",".join([doctitle, product, project_name, sub_project_names])
+        # main_text = ",".join([doctitle, product, project_name, sub_project_names])
+        # 核心字段:标题+项目名称+产品词+正文定位词后45个字
+        main_text = ",".join([doctitle, project_name, product, main_content_text])
         # 剔除 招标单位、代理机构名称
         if tenderee:
             doctitle = doctitle.replace(tenderee, " ")
@@ -5124,7 +5175,7 @@ class ProjectLabel():
             for item in main_text_labels[10:]:
                 main_text_dict.pop(item[0])
 
-        return {"标题":doctitle_dict,"核心字段":main_text_dict}
+        return {"标题":doctitle_dict,"核心字段":main_text_dict},main_content_text
 
     def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):
         # doctextcon 取正文内容
@@ -5174,11 +5225,47 @@ class ProjectLabel():
 
         return project_label
 
+
+# from BiddingKG.dl.interface.classification_process import product_classify_process
 # 行业标签
 class IndustryLabel():
 
     def __init__(self):
         self.keyword_list = self.get_label_keywords()
+        pass
+
+    # def predict(self,doctitle,article,product,prem):
+    #     doctitle = doctitle if doctitle else ""
+    #     product = product if product else ""
+    #     product = ",".join(set(product.split(','))) # 产品词去重
+    #     all_text = article.content
+    #     all_text = re.sub('\s+', ' ', all_text)
+    #     tenderee = ""
+    #     agency = ""
+    #     try:
+    #         for k,v in prem[0]['prem'].items():
+    #             for link in v['roleList']:
+    #                 if link['role_name'] == 'tenderee' and tenderee == "":
+    #                     tenderee = link['role_text']
+    #                 if link['role_name'] == 'agency' and agency == "":
+    #                     agency = link['role_text']
+    #     except Exception as e:
+    #         # print('解析prem 获取招标人、代理人出错')
+    #         pass
+    #     # 剔除 招标单位、代理机构名称
+    #     if tenderee:
+    #         doctitle = doctitle.replace(tenderee, " ")
+    #         all_text = all_text.replace(tenderee, " ")
+    #     if agency:
+    #         doctitle = doctitle.replace(agency, " ")
+    #         all_text = all_text.replace(agency, " ")
+    #
+    #     category_1, category_2, category_3, matched_keywords, rule_id = product_classify_process(doctitle, all_text, product)
+    #     print(category_1, category_2, category_3, matched_keywords, rule_id)
+    #     if category_2=="标题排除":
+    #         category_1 = "其他"
+    #         category_2 = ""
+    #     return {"first_level":category_1,"second_level":category_2}
 
     def get_label_keywords(self):
         import csv