|
@@ -5016,7 +5016,55 @@ class ProjectLabel():
|
|
|
key_word_list.append((key_wrod, key_wrod2, search_type, info_type_list))
|
|
|
return key_word_list
|
|
|
|
|
|
- def predict(self, doctitle,product,project_name,prem):
|
|
|
def extract_core_text(self, all_text, tenderee="", agency="", max_chars=45):
    """Return the Chinese characters that follow the first locator keyword.

    The tenderee and agency names are first blanked out of ``all_text``.
    The text is then searched for the earliest occurrence of any locator
    keyword (e.g. "项目名称", "采购内容"). Starting right after that keyword,
    characters in the range U+4E00..U+9FFF are collected, skipping every
    other character, until ``max_chars`` of them have been gathered or the
    text ends.

    Args:
        all_text: Body text to search. ``None`` or an empty string yields "".
        tenderee: Tenderee name to remove from the text before searching.
        agency: Agency name to remove from the text before searching.
        max_chars: Maximum number of Chinese characters to return.

    Returns:
        The collected Chinese characters as one string, or "" when the text
        is empty or contains none of the keywords.
    """
    if not all_text:
        return ""

    # Remove the tenderee / agency names so that a keyword-like fragment
    # inside an organisation name cannot be picked up as the locator.
    for org_name in (tenderee, agency):
        if org_name:
            all_text = all_text.replace(org_name, " ")

    # Locator keywords that typically introduce the subject of a notice.
    keywords = (
        '项目名称', '工程名称', '采购名称', '标段名称', '项目的名称', '设备名称', '申购主题',
        '申购单主题', '标的', '商品名称', '二级目录', '招标内容', '项目内容', '商品清单',
        '标的名称', '采购内容', '集成要求', '概况介绍', '品目分类', '招标范围', '采购范围',
        '项目采购分', '采购合同', '招标合同', '产品名称', '服务内容', '采购品目名称', '货物名称',
        '采购需求概况', '项目概况', '采购条目名称', '物资名称', '物料名称', '建设规模',
        '建设内容', '采购项目概况', '采购包名称', '物料描述', '商品信息', '服务品目', '标项名称',
        '规格描述', '采购标的', '服务名称', '采购单名称', '明细信息', '需求详情',
    )

    # Regex alternation takes the first alternative that matches, not the
    # longest one. Trying longer keywords first prevents "标的" from
    # shadowing "标的名称" and leaking "名称" into the extracted text.
    pattern = '|'.join(
        re.escape(kw) for kw in sorted(keywords, key=len, reverse=True)
    )

    # Only the earliest keyword occurrence is used.
    first_match = re.search(pattern, all_text)
    if first_match is None:
        return ""

    core_chars = []
    for char in all_text[first_match.end():]:
        # Keep CJK Unified Ideographs only; everything else is skipped.
        if '\u4e00' <= char <= '\u9fff':
            if len(core_chars) >= max_chars:
                break
            core_chars.append(char)

    return ''.join(core_chars)
|
|
|
+
|
|
|
+ def predict(self, doctitle,product,project_name,prem,all_text):
|
|
|
|
|
|
doctitle = doctitle if doctitle else ""
|
|
|
product = product if product else ""
|
|
@@ -5038,8 +5086,11 @@ class ProjectLabel():
|
|
|
# print('解析prem 获取招标人、代理人出错')
|
|
|
pass
|
|
|
sub_project_names = ";".join(sub_project_names)
|
|
|
+ main_content_text = self.extract_core_text(all_text,tenderee,agency)
|
|
|
# 核心字段:标题+产品词+项目名称+标段名称
|
|
|
- main_text = ",".join([doctitle, product, project_name, sub_project_names])
|
|
|
+ # main_text = ",".join([doctitle, product, project_name, sub_project_names])
|
|
|
+ # 核心字段:标题+项目名称+产品词+正文定位词后45个字
|
|
|
+ main_text = ",".join([doctitle, project_name, product, main_content_text])
|
|
|
# 剔除 招标单位、代理机构名称
|
|
|
if tenderee:
|
|
|
doctitle = doctitle.replace(tenderee, " ")
|
|
@@ -5107,7 +5158,7 @@ class ProjectLabel():
|
|
|
for item in main_text_labels[10:]:
|
|
|
main_text_dict.pop(item[0])
|
|
|
|
|
|
- return {"标题":doctitle_dict,"核心字段":main_text_dict}
|
|
|
+ return {"标题":doctitle_dict,"核心字段":main_text_dict},main_content_text
|
|
|
|
|
|
def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):
|
|
|
# doctextcon 取正文内容
|
|
@@ -5157,11 +5208,47 @@ class ProjectLabel():
|
|
|
|
|
|
return project_label
|
|
|
|
|
|
+
|
|
|
+# from BiddingKG.dl.interface.classification_process import product_classify_process
|
|
|
# 行业标签
|
|
|
class IndustryLabel():
|
|
|
|
|
|
def __init__(self):
    """Load the label keywords once and keep them on the instance."""
    # The keyword list comes from get_label_keywords(), defined on this class.
    self.keyword_list = self.get_label_keywords()
|
|
|
+
|
|
|
+ # def predict(self,doctitle,article,product,prem):
|
|
|
+ # doctitle = doctitle if doctitle else ""
|
|
|
+ # product = product if product else ""
|
|
|
+ # product = ",".join(set(product.split(','))) # 产品词去重
|
|
|
+ # all_text = article.content
|
|
|
+ # all_text = re.sub('\s+', ' ', all_text)
|
|
|
+ # tenderee = ""
|
|
|
+ # agency = ""
|
|
|
+ # try:
|
|
|
+ # for k,v in prem[0]['prem'].items():
|
|
|
+ # for link in v['roleList']:
|
|
|
+ # if link['role_name'] == 'tenderee' and tenderee == "":
|
|
|
+ # tenderee = link['role_text']
|
|
|
+ # if link['role_name'] == 'agency' and agency == "":
|
|
|
+ # agency = link['role_text']
|
|
|
+ # except Exception as e:
|
|
|
+ # # print('解析prem 获取招标人、代理人出错')
|
|
|
+ # pass
|
|
|
+ # # 剔除 招标单位、代理机构名称
|
|
|
+ # if tenderee:
|
|
|
+ # doctitle = doctitle.replace(tenderee, " ")
|
|
|
+ # all_text = all_text.replace(tenderee, " ")
|
|
|
+ # if agency:
|
|
|
+ # doctitle = doctitle.replace(agency, " ")
|
|
|
+ # all_text = all_text.replace(agency, " ")
|
|
|
+ #
|
|
|
+ # category_1, category_2, category_3, matched_keywords, rule_id = product_classify_process(doctitle, all_text, product)
|
|
|
+ # print(category_1, category_2, category_3, matched_keywords, rule_id)
|
|
|
+ # if category_2=="标题排除":
|
|
|
+ # category_1 = "其他"
|
|
|
+ # category_2 = ""
|
|
|
+ # return {"first_level":category_1,"second_level":category_2}
|
|
|
|
|
|
def get_label_keywords(self):
|
|
|
import csv
|