1 month ago · 98a61190e8
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -507,6 +507,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
			
 
				     # print(project_label)
			
 
				 
			
 
				+    '''行业关键词标签'''
			
 
				+    industry_label = predictor.getPredictor('industry_label').predict(title,list_articles[0],product=','.join(product_list),prem=prem)
			
 
				+
			
 
				     '''产权分类二级标签'''
			
 
				     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
			
 
				 
			
@@ -581,6 +584,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				         getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
			
 
				 
			
 
				     data_res["project_label"] = project_label
			
 
				+    data_res["industry_label"] = industry_label
			
 
				     data_res["property_label"] = property_label
			
 
				     data_res["doctitle_refine"] = doctitle_refine
			
 
				     data_res["nlp_enterprise"] = nlp_enterprise
			
--- a/BiddingKG/dl/interface/industry_label_keywords.csv
+++ b/BiddingKG/dl/interface/industry_label_keywords.csv
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -110,6 +110,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
 
				                   'candidate': {"predictor": None, "Lock": RLock()},
			
 
				                   'websource_tenderee': {"predictor": None, "Lock": RLock()},
			
 
				                   'project_label': {"predictor": None, "Lock": RLock()},
			
 
				+                  'industry_label': {"predictor": None, "Lock": RLock()},
			
 
				                   'pb_extract': {"predictor": None, "Lock": RLock()},
			
 
				                   'property_label': {"predictor": None, "Lock": RLock()},
			
 
				                   'approval': {"predictor": None, "Lock": RLock()}, # 审批项目预测
			
@@ -166,6 +167,8 @@ def getPredictor(_type):
 
				                     dict_predictor[_type]['predictor'] = WebsourceTenderee()
			
 
				                 if _type == 'project_label':
			
 
				                     dict_predictor[_type]['predictor'] = ProjectLabel()
			
 
				+                if _type == 'industry_label':
			
 
				+                    dict_predictor[_type]['predictor'] = IndustryLabel()
			
 
				                 if _type == 'pb_extract':
			
 
				                     dict_predictor[_type]['predictor'] = PBPredictor()
			
 
				                 if _type == 'property_label':
			
@@ -5052,12 +5055,12 @@ class ProjectLabel():
 
				             key_wrod = item[1]
			
 
				             # 关键词排除词
			
 
				             key_paichuci = item[2]
			
 
				-            key_paichuci_s = "|".join(key_paichuci.strip('、').split('、'))
			
 
				+            key_paichuci_s = "|".join([re.escape(word) for word in key_paichuci.strip('、').split('、')])
			
 
				             # 类型排除词
			
 
				             type_paichuci = item[3]
			
 
				             if type_paichuci:
			
 
				                 paichuci_split = type_paichuci.strip('、').split('、')
			
 
				-                if re.search("|".join(paichuci_split), main_text):
			
 
				+                if re.search("|".join([re.escape(word) for word in paichuci_split]), main_text):
			
 
				                     continue
			
 
				 
			
 
				             if doctitle:
			
@@ -5154,6 +5157,108 @@ class ProjectLabel():
 
				 
			
 
				         return project_label
			
 
				 
			
 
				+# 行业标签
			
 
				+class IndustryLabel():
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.keyword_list = self.get_label_keywords()
			
 
				+
			
 
				+    def get_label_keywords(self):
			
 
				+        import csv
			
 
				+        path = os.path.dirname(__file__)+'/industry_label_keywords.csv'
			
 
				+        with open(path, 'r',encoding='utf-8') as f:
			
 
				+            reader = csv.reader(f)
			
 
				+            key_word_list = []
			
 
				+            for r in reader:
			
 
				+                if r[0] == '一级标签':
			
 
				+                    continue
			
 
				+                first_level = r[0]
			
 
				+                second_level = str(r[1])
			
 
				+                second_level = second_level.strip() if second_level and second_level != 'nan' else ""
			
 
				+                key_word = str(r[2]).strip()
			
 
				+                all_paichuci = str(r[3])
			
 
				+                all_paichuci = all_paichuci.strip() if all_paichuci and all_paichuci != 'nan' else ""
			
 
				+                title_paichuci = str(r[4])
			
 
				+                title_paichuci = title_paichuci.strip() if title_paichuci and title_paichuci != 'nan' else ""
			
 
				+                product_paichuci = str(r[5])
			
 
				+                product_paichuci = product_paichuci.strip() if product_paichuci and product_paichuci != 'nan' else ""
			
 
				+                key_word_list.append((first_level, second_level, key_word, all_paichuci,title_paichuci,product_paichuci))
			
 
				+        return key_word_list
			
 
				+
			
 
				+    def predict(self, doctitle,article,product,prem):
			
 
				+
			
 
				+        doctitle = doctitle if doctitle else ""
			
 
				+        product = product if product else ""
			
 
				+        product = ",".join(set(product.split(','))) # 产品词去重
			
 
				+        all_text = article.content
			
 
				+        tenderee = ""
			
 
				+        agency = ""
			
 
				+        try:
			
 
				+            for k,v in prem[0]['prem'].items():
			
 
				+                for link in v['roleList']:
			
 
				+                    if link['role_name'] == 'tenderee' and tenderee == "":
			
 
				+                        tenderee = link['role_text']
			
 
				+                    if link['role_name'] == 'agency' and agency == "":
			
 
				+                        agency = link['role_text']
			
 
				+        except Exception as e:
			
 
				+            # print('解析prem 获取招标人、代理人出错')
			
 
				+            pass
			
 
				+        # 剔除 招标单位、代理机构名称
			
 
				+        if tenderee:
			
 
				+            doctitle = doctitle.replace(tenderee, " ")
			
 
				+            all_text = all_text.replace(tenderee, " ")
			
 
				+        if agency:
			
 
				+            doctitle = doctitle.replace(agency, " ")
			
 
				+            all_text = all_text.replace(agency, " ")
			
 
				+
			
 
				+        label_list = []
			
 
				+        for item in self.keyword_list:
			
 
				+            first_level = item[0]
			
 
				+            second_level = item[1]
			
 
				+            key_word = item[2]
			
 
				+            key_word = key_word.strip('、').split('、')
			
 
				+            # 全文排除词
			
 
				+            all_paichuci = item[3]
			
 
				+            all_paichuci = "|".join([re.escape(word) for word in all_paichuci.strip('、').split('、')])
			
 
				+            # 标题排除词
			
 
				+            title_paichuci = item[4]
			
 
				+            title_paichuci = "|".join([re.escape(word) for word in title_paichuci.strip('、').split('、')])
			
 
				+            # 产品排除词
			
 
				+            product_paichuci = item[5]
			
 
				+            product_paichuci = "|".join([re.escape(word) for word in product_paichuci.strip('、').split('、')])
			
 
				+
			
 
				+
			
 
				+            if doctitle and title_paichuci:
			
 
				+                if re.search(title_paichuci,doctitle):
			
 
				+                    continue
			
 
				+            if product and product_paichuci:
			
 
				+                if re.search(product_paichuci,product):
			
 
				+                    continue
			
 
				+            if all_text:
			
 
				+                if all_paichuci:
			
 
				+                    if re.search(all_paichuci,all_text):
			
 
				+                        continue
			
 
				+                get_label = False
			
 
				+                for _keyword in key_word:
			
 
				+                    if '+' not in _keyword:
			
 
				+                        if _keyword in all_text:
			
 
				+                            get_label = True
			
 
				+                            break
			
 
				+                    else:
			
 
				+                        get_keyword = True
			
 
				+                        for _word in _keyword.split("+"):
			
 
				+                            if _word not in all_text:
			
 
				+                                get_keyword = False
			
 
				+                                break
			
 
				+                        if get_keyword:
			
 
				+                            get_label = True
			
 
				+                            break
			
 
				+                if get_label:
			
 
				+                    label_list.append({"first_level":first_level,"second_level":second_level})
			
 
				+
			
 
				+        return label_list
			
 
				+
			
 
				+
			
 
				 # 产权分类二级标签
			
 
				 class PropertyLabel():
			
 
				     '''