|
@@ -110,6 +110,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
'candidate': {"predictor": None, "Lock": RLock()},
|
|
|
'websource_tenderee': {"predictor": None, "Lock": RLock()},
|
|
|
'project_label': {"predictor": None, "Lock": RLock()},
|
|
|
+ 'industry_label': {"predictor": None, "Lock": RLock()},
|
|
|
'pb_extract': {"predictor": None, "Lock": RLock()},
|
|
|
'property_label': {"predictor": None, "Lock": RLock()},
|
|
|
'approval': {"predictor": None, "Lock": RLock()}, # 审批项目预测
|
|
@@ -166,6 +167,8 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]['predictor'] = WebsourceTenderee()
|
|
|
if _type == 'project_label':
|
|
|
dict_predictor[_type]['predictor'] = ProjectLabel()
|
|
|
+ if _type == 'industry_label':
|
|
|
+ dict_predictor[_type]['predictor'] = IndustryLabel()
|
|
|
if _type == 'pb_extract':
|
|
|
dict_predictor[_type]['predictor'] = PBPredictor()
|
|
|
if _type == 'property_label':
|
|
@@ -5052,12 +5055,12 @@ class ProjectLabel():
|
|
|
key_wrod = item[1]
|
|
|
# 关键词排除词
|
|
|
key_paichuci = item[2]
|
|
|
- key_paichuci_s = "|".join(key_paichuci.strip('、').split('、'))
|
|
|
+ key_paichuci_s = "|".join([re.escape(word) for word in key_paichuci.strip('、').split('、')])
|
|
|
# 类型排除词
|
|
|
type_paichuci = item[3]
|
|
|
if type_paichuci:
|
|
|
paichuci_split = type_paichuci.strip('、').split('、')
|
|
|
- if re.search("|".join(paichuci_split), main_text):
|
|
|
+ if re.search("|".join([re.escape(word) for word in paichuci_split]), main_text):
|
|
|
continue
|
|
|
|
|
|
if doctitle:
|
|
@@ -5154,6 +5157,108 @@ class ProjectLabel():
|
|
|
|
|
|
return project_label
|
|
|
|
|
|
+# 行业标签
|
|
|
class IndustryLabel():
    """Rule-based industry tagger driven by a keyword table.

    Each row of ``industry_label_keywords.csv`` (located next to this module)
    supplies a first-level label, an optional second-level label, a
    '、'-separated keyword list and three '、'-separated exclusion lists
    (full text / title / product). A row fires for a document when none of
    its exclusion words is found and at least one of its keywords is found
    in the document text.
    """

    def __init__(self):
        # Rows are loaded once; predict() only reads this list.
        self.keyword_list = self.get_label_keywords()

    @staticmethod
    def _clean_cell(value):
        """Return the stripped cell text, mapping empty cells and 'nan' to ""."""
        value = str(value)
        return value.strip() if value and value != 'nan' else ""

    @staticmethod
    def _exclusion_pattern(words):
        """Build an alternation regex from a '、'-separated word list.

        Empty entries (e.g. produced by a doubled separator) are dropped:
        an empty alternative would match any text and suppress the row for
        every document. Returns "" when no usable word remains.
        """
        return "|".join(re.escape(word) for word in words.strip('、').split('、') if word)

    @staticmethod
    def _get_tenderee_and_agency(prem):
        """Return the first tenderee and first agency text found in ``prem``.

        ``prem`` is read as ``prem[0]['prem']`` -> mapping of packages, each
        with a ``roleList`` of dicts carrying ``role_name`` / ``role_text``.
        Extraction is best effort: a missing or malformed structure simply
        yields whatever was collected so far ("" by default).
        """
        tenderee = ""
        agency = ""
        try:
            for _, package in prem[0]['prem'].items():
                for link in package['roleList']:
                    if link['role_name'] == 'tenderee' and tenderee == "":
                        tenderee = link['role_text']
                    if link['role_name'] == 'agency' and agency == "":
                        agency = link['role_text']
        except (KeyError, IndexError, TypeError, AttributeError):
            # Structural problems in prem must not prevent labelling.
            pass
        return tenderee, agency

    def get_label_keywords(self):
        """Load the keyword table from ``industry_label_keywords.csv``.

        Returns:
            list of 6-tuples ``(first_level, second_level, key_word,
            all_paichuci, title_paichuci, product_paichuci)``; every element
            is a string and missing/'nan' optional cells become "".
        """
        import csv

        # os.path.join keeps the path relative when dirname(__file__) is "",
        # whereas plain concatenation would point at the filesystem root.
        path = os.path.join(os.path.dirname(__file__), 'industry_label_keywords.csv')
        key_word_list = []
        # utf-8-sig tolerates a BOM (otherwise the header check below fails);
        # newline='' is what the csv module expects for file objects.
        with open(path, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    continue  # blank line
                if row[0] == '一级标签':
                    continue  # header row
                # Pad short rows so trailing optional columns may be omitted.
                row = list(row) + [''] * (6 - len(row))
                first_level = row[0]
                second_level = self._clean_cell(row[1])
                key_word = str(row[2]).strip()
                all_paichuci = self._clean_cell(row[3])
                title_paichuci = self._clean_cell(row[4])
                product_paichuci = self._clean_cell(row[5])
                key_word_list.append((first_level, second_level, key_word,
                                      all_paichuci, title_paichuci, product_paichuci))
        return key_word_list

    def predict(self, doctitle, article, product, prem):
        """Return the industry labels whose keyword rules match the document.

        Args:
            doctitle: document title; falsy values are treated as "".
            article: object exposing the document text as ``.content``.
            product: comma-separated product words; falsy is treated as "".
            prem: extraction result used to find tenderee / agency names,
                which are blanked out of the title and text before matching.

        Returns:
            list of ``{"first_level": ..., "second_level": ...}`` dicts, one
            per matching row, in table order.
        """
        doctitle = doctitle if doctitle else ""
        product = product if product else ""
        product = ",".join(set(product.split(',')))  # de-duplicate product words
        all_text = article.content or ""

        # Remove tenderee / agency names so that words inside organisation
        # names do not trigger labels.
        tenderee, agency = self._get_tenderee_and_agency(prem)
        for org_name in (tenderee, agency):
            if org_name:
                doctitle = doctitle.replace(org_name, " ")
                all_text = all_text.replace(org_name, " ")

        label_list = []
        if not all_text:
            # Keywords are matched against the text only; nothing to match.
            return label_list

        for item in self.keyword_list:
            first_level = item[0]
            second_level = item[1]
            key_words = item[2].strip('、').split('、')
            all_paichuci = self._exclusion_pattern(item[3])          # full-text exclusions
            title_paichuci = self._exclusion_pattern(item[4])        # title exclusions
            product_paichuci = self._exclusion_pattern(item[5])      # product exclusions

            if doctitle and title_paichuci and re.search(title_paichuci, doctitle):
                continue
            if product and product_paichuci and re.search(product_paichuci, product):
                continue
            if all_paichuci and re.search(all_paichuci, all_text):
                continue

            for _keyword in key_words:
                # "a+b" means every part must occur in the text. Empty parts
                # are dropped: '' is a substring of any text and would make
                # the row match every document.
                parts = [part for part in _keyword.split("+") if part]
                if parts and all(part in all_text for part in parts):
                    label_list.append({"first_level": first_level, "second_level": second_level})
                    break

        return label_list
|
|
|
+
|
|
|
+
|
|
|
# 产权分类二级标签
|
|
|
class PropertyLabel():
|
|
|
'''
|