Kaynağa Gözat

新增"行业标签"识别

znj 4 gün önce
ebeveyn
işleme
9a5784fa75

+ 4 - 0
BiddingKG/dl/interface/extract.py

@@ -507,6 +507,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
     # print(project_label)
 
+    '''行业关键词标签'''
+    industry_label = predictor.getPredictor('industry_label').predict(title,list_articles[0],product=','.join(product_list),prem=prem)
+
     '''产权分类二级标签'''
     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
 
@@ -581,6 +584,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
 
     data_res["project_label"] = project_label
+    data_res["industry_label"] = industry_label
     data_res["property_label"] = property_label
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

Dosya farkı çok büyük olduğundan ihmal edildi
+ 1 - 0
BiddingKG/dl/interface/industry_label_keywords.csv


+ 107 - 2
BiddingKG/dl/interface/predictor.py

@@ -110,6 +110,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   'candidate': {"predictor": None, "Lock": RLock()},
                   'websource_tenderee': {"predictor": None, "Lock": RLock()},
                   'project_label': {"predictor": None, "Lock": RLock()},
+                  'industry_label': {"predictor": None, "Lock": RLock()},
                   'pb_extract': {"predictor": None, "Lock": RLock()},
                   'property_label': {"predictor": None, "Lock": RLock()},
                   'approval': {"predictor": None, "Lock": RLock()}, # 审批项目预测
@@ -166,6 +167,8 @@ def getPredictor(_type):
                     dict_predictor[_type]['predictor'] = WebsourceTenderee()
                 if _type == 'project_label':
                     dict_predictor[_type]['predictor'] = ProjectLabel()
+                if _type == 'industry_label':
+                    dict_predictor[_type]['predictor'] = IndustryLabel()
                 if _type == 'pb_extract':
                     dict_predictor[_type]['predictor'] = PBPredictor()
                 if _type == 'property_label':
@@ -5050,12 +5053,12 @@ class ProjectLabel():
             key_wrod = item[1]
             # 关键词排除词
             key_paichuci = item[2]
-            key_paichuci_s = "|".join(key_paichuci.strip('、').split('、'))
+            key_paichuci_s = "|".join([re.escape(word) for word in key_paichuci.strip('、').split('、')])
             # 类型排除词
             type_paichuci = item[3]
             if type_paichuci:
                 paichuci_split = type_paichuci.strip('、').split('、')
-                if re.search("|".join(paichuci_split), main_text):
+                if re.search("|".join([re.escape(word) for word in paichuci_split]), main_text):
                     continue
 
             if doctitle:
@@ -5152,6 +5155,108 @@ class ProjectLabel():
 
         return project_label
 
+# Industry label
+class IndustryLabel():
+    """Keyword-driven industry label predictor.
+
+    Rules are read once, at construction time, from
+    ``industry_label_keywords.csv`` located next to this module.  Each rule is
+    a 6-tuple of strings::
+
+        (first_level, second_level, key_word,
+         all_paichuci, title_paichuci, product_paichuci)
+
+    ``key_word`` and the three exclusion fields are lists of words separated
+    by the ideographic comma ``、``.  A keyword may be written ``a+b`` to
+    require every part to occur in the text.
+    """
+
+    def __init__(self):
+        self.keyword_list = self.get_label_keywords()
+
+    def get_label_keywords(self):
+        """Load the rule table from the CSV file beside this module.
+
+        Returns:
+            list of 6-tuples of str, in file order (see the class docstring).
+            Optional cells that are empty or the literal ``nan`` become ``""``.
+        """
+        import csv
+        path = os.path.join(os.path.dirname(__file__), 'industry_label_keywords.csv')
+
+        def optional(cell):
+            # Optional columns: blank and the literal 'nan' both mean "no value".
+            cell = str(cell)
+            return cell.strip() if cell and cell != 'nan' else ""
+
+        key_word_list = []
+        # utf-8-sig drops a leading BOM so the header row is still recognised;
+        # newline='' is what the csv module asks for when reading files.
+        with open(path, 'r', encoding='utf-8-sig', newline='') as f:
+            for row in csv.reader(f):
+                # Blank lines come back as [] and would raise on row[0].
+                if not row or row[0].strip() == '一级标签':
+                    continue
+                # Pad short rows so missing trailing columns read as empty.
+                row = list(row[:6]) + [""] * (6 - len(row))
+                key_word_list.append((
+                    row[0],
+                    optional(row[1]),
+                    str(row[2]).strip(),
+                    optional(row[3]),
+                    optional(row[4]),
+                    optional(row[5]),
+                ))
+        return key_word_list
+
+    @staticmethod
+    def _split_words(cell):
+        """Split a '、'-separated cell into its non-empty words."""
+        return [word for word in cell.split('、') if word]
+
+    @staticmethod
+    def _contains_any(text, words):
+        """Return True when at least one of *words* occurs in *text*."""
+        return any(word in text for word in words)
+
+    @staticmethod
+    def _keyword_hit(keyword, text):
+        """Return True when every non-empty '+'-joined part of *keyword* is in *text*."""
+        parts = [part for part in keyword.split('+') if part]
+        # all([]) is True, so a keyword made only of '+' must be rejected explicitly.
+        return bool(parts) and all(part in text for part in parts)
+
+    @staticmethod
+    def _get_tenderee_agency(prem):
+        """Best-effort lookup of the first tenderee and first agency names in *prem*.
+
+        Reads ``prem[0]['prem']`` and scans each value's ``roleList`` for
+        entries whose ``role_name`` is ``tenderee`` / ``agency``.  A malformed
+        structure is tolerated: whatever was found before the error is returned.
+        """
+        tenderee = ""
+        agency = ""
+        try:
+            for pack in prem[0]['prem'].values():
+                for link in pack['roleList']:
+                    if link['role_name'] == 'tenderee' and tenderee == "":
+                        tenderee = link['role_text']
+                    if link['role_name'] == 'agency' and agency == "":
+                        agency = link['role_text']
+        except (KeyError, IndexError, TypeError, AttributeError):
+            # Missing or oddly shaped prem: labels are still predicted, just
+            # without stripping the organisation names.
+            pass
+        return tenderee, agency
+
+    def predict(self, doctitle, article, product, prem):
+        """Return the industry labels whose rules match the document.
+
+        Args:
+            doctitle: document title; ``None`` is treated as empty.
+            article: object exposing the full text as ``.content``.
+            product: comma-separated product words; ``None`` is treated as empty.
+            prem: extraction result; ``prem[0]['prem']`` is scanned for the
+                tenderee and agency names.
+
+        Returns:
+            list of ``{"first_level": ..., "second_level": ...}`` dicts, one per
+            matching rule, in rule order.  Empty when there is no text.
+        """
+        doctitle = doctitle if doctitle else ""
+        product = product if product else ""
+        # De-duplicate product words while keeping their order (a set would
+        # make the joined string order-dependent on hashing).
+        product = ",".join(dict.fromkeys(product.split(',')))
+        all_text = getattr(article, 'content', None) or ""
+        if not all_text:
+            return []
+
+        # Remove tenderee / agency names so words inside an organisation name
+        # do not trigger keywords or exclusion words.
+        for name in self._get_tenderee_agency(prem):
+            if name:
+                doctitle = doctitle.replace(name, " ")
+                all_text = all_text.replace(name, " ")
+
+        # NOTE(review): rules sharing the same label pair yield repeated
+        # entries; confirm whether callers expect de-duplication.
+        label_list = []
+        for first_level, second_level, key_word, all_paichuci, title_paichuci, product_paichuci in self.keyword_list:
+            # Title exclusion words
+            if self._contains_any(doctitle, self._split_words(title_paichuci)):
+                continue
+            # Product exclusion words
+            if self._contains_any(product, self._split_words(product_paichuci)):
+                continue
+            # Full-text exclusion words
+            if self._contains_any(all_text, self._split_words(all_paichuci)):
+                continue
+            # Empty keywords are dropped: '' is a substring of every text and
+            # would otherwise attach the label to every document.
+            if any(self._keyword_hit(keyword, all_text) for keyword in self._split_words(key_word)):
+                label_list.append({"first_level": first_level, "second_level": second_level})
+
+        return label_list
+
+
 # 产权分类二级标签
 class PropertyLabel():
     '''

Bu fark içinde çok fazla dosya değişikliği olduğu için bazı dosyalar gösterilmiyor