1 年間前 · 96b2e9dfae
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1971,9 +1971,11 @@ def article_limit(soup,limit_words=30000):
 
				         _gap = _count - max_count
			
 
				         _is_skip = False
			
 
				         next_soup = None
			
 
				+        # 跳过层级结构为1的标签，向下取值
			
 
				         while len(_soup.find_all(recursive=False)) == 1 and \
			
 
				                 _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
			
 
				             _soup = _soup.find_all(recursive=False)[0]
			
 
				+        # 无结构的纯文本直接取值
			
 
				         if len(_soup.find_all(recursive=False)) == 0:
			
 
				             _soup.string = str(_soup.get_text())[:max_count-_count]
			
 
				             _count += len(re.sub(sub_space, "", _soup.string))
			
@@ -2007,22 +2009,24 @@ def article_limit(soup,limit_words=30000):
 
				                 have_attachment = True
			
 
				                 break
			
 
				     if not have_attachment:
			
 
				-        # 无附件
			
 
				+        # 无附件，通过get_text()方法与limit_words大小判断是否要限制字数
			
 
				         if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
			
 
				-            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
			
 
				+            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=1000)
			
 
				             while n_soup:
			
 
				-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
			
 
				+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
			
 
				 
			
 
				     else:
			
 
				         # 有附件
			
 
				         _text = re.sub(sub_space, "", soup.get_text())
			
 
				         _text_split = _text.split("##attachment##")
			
 
				+        # 正文部分
			
 
				         if len(_text_split[0])>limit_words:
			
 
				             main_soup = attachment_part.parent
			
 
				             main_text = main_soup.find_all(recursive=False)[0]
			
 
				-            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
			
 
				+            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=1000)
			
 
				             while n_soup:
			
 
				-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
			
 
				+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
			
 
				+        # 附件部分
			
 
				         if len(_text_split[1])>limit_words:
			
 
				             # attachment_html纯文本，无子结构
			
 
				             if len(attachment_part.find_all(recursive=False))==0:
			
@@ -2042,6 +2046,12 @@ def article_limit(soup,limit_words=30000):
 
				                                         attachment_skip = True
			
 
				                                 else:
			
 
				                                     p_part.decompose()
			
 
				+                            # attachment_text_nums, gap, n_part = soup_limit(part, attachment_text_nums,
			
 
				+                            #                                     max_count=limit_words,max_gap=1000)
			
 
				+                            # while n_part:
			
 
				+                            #     attachment_text_nums, gap, n_part = soup_limit(n_part, attachment_text_nums,
			
 
				+                            #                                         max_count=limit_words,max_gap=1000)
			
 
				+                            # print(attachment_text_nums)
			
 
				                         else:
			
 
				                             last_attachment_text_nums = attachment_text_nums
			
 
				                             attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -272,7 +272,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     entityLink.link_entitys(list_entitys)
			
 
				     doctitle_refine = entityLink.doctitle_refine(title)
			
 
				     nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
			
 
				-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
			
 
				+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
			
 
				     log("get attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["attrs"] = round(time.time()-start_time,2)
			
 
				 
			
@@ -317,7 +317,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
			
 
				     log("get product done of doc_id%s"%(doc_id))
			
 
				     cost_time["product"] = round(time.time()-start_time,2)
			
 
				-    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
			
 
				+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time))
			
 
				 
			
 
				     '''更新单一来源招标公告中标角色为预中标'''
			
 
				     getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
			
@@ -345,6 +345,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     '''根据关键词表生成项目标签'''
			
 
				     project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
			
 
				+    # 额外需求的标签
			
 
				+    project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
			
 
				     # print(project_label)
			
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1433,7 +1433,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                     last_phone_mask = False
			
 
				                     continue
			
 
				                 # 排除号码实体为时间格式 ，例如：20150515
			
 
				-                if re.search("^20(1[0-9]|2[0-2])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
			
 
				+                if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
			
 
				                     error_numStr_index.append(numStr_index)
			
 
				                     last_phone_mask = False
			
 
				                     continue
			
@@ -2982,9 +2982,12 @@ def turnMoneySource(moneysource):
 
				 
			
 
				 my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
			
 
				 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
			
 
				-import time
			
 
				-def my_timeFormat(_time):
			
 
				-    current_year = time.strftime("%Y",time.localtime())
			
 
				+import time,datetime
			
 
				+def my_timeFormat(_time,page_time):
			
 
				+    if page_time:
			
 
				+        current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
			
 
				+    else:
			
 
				+        current_year = time.strftime("%Y",time.localtime())
			
 
				     all_match = re.finditer(my_time_format_pattern,_time)
			
 
				     time_list = []
			
 
				     for _match in all_match:
			
@@ -3004,10 +3007,10 @@ def my_timeFormat(_time):
 
				                 if re.search("^\d+$", year):
			
 
				                     if len(year) == 2:
			
 
				                         year = "20" + year
			
 
				-                        if int(year) - int(current_year) > 5:
			
 
				+                        if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
			
 
				                             legal = False
			
 
				                     else:
			
 
				-                        if int(year) - int(current_year)>10:
			
 
				+                        if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
			
 
				                             legal = False
			
 
				                 else:
			
 
				                     _year = ""
			
@@ -3053,7 +3056,7 @@ def my_timeFormat(_time):
 
				                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
			
 
				     return time_list
			
 
				 
			
 
				-def getTimeAttributes(list_entity,list_sentence):
			
 
				+def getTimeAttributes(list_entity,list_sentence,page_time):
			
 
				     time_entitys = [i for i in list_entity if i.entity_type=='time']
			
 
				     time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
			
 
				     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
			
@@ -3122,24 +3125,29 @@ def getTimeAttributes(list_entity,list_sentence):
 
				             last_time_type = ""
			
 
				         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
			
 
				         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
			
 
				-        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
			
 
				+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
			
 
				         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
			
 
				         label_prob = entity.values[entity.label]
			
 
				         entity_text = entity.entity_text
			
 
				         in_attachment = entity.in_attachment
			
 
				-        extract_time = my_timeFormat(entity_text)
			
 
				+        extract_time = my_timeFormat(entity_text,page_time)
			
 
				         # print(entity_text,entity_left2)
			
 
				-        # definite_time = "00:00:00"
			
 
				         if extract_time:
			
 
				             definite_time_list = []
			
 
				-            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[:：时点](?P<half_hour>半)?(?P<minute>\d{2})?[:：分]?(?P<second>\d{2})?秒?")
			
 
				+            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[:：时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[:：分]?(?P<second>\d{2})?秒?")
			
 
				             _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
			
 
				+            _entity_text_len = len(_entity_text)
			
 
				+            _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
			
 
				             t_in_word_num = len(re.findall(t,_entity_text))
			
 
				-            t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
			
 
				+            # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
			
 
				             begin_index = 0
			
 
				             for _num in range(t_in_word_num):
			
 
				+                if begin_index> _entity_text_len + 8:
			
 
				+                    break
			
 
				                 t_in_word = re.search(t, _entity_text[begin_index:])
			
 
				                 if t_in_word:
			
 
				+                    if _num==0 and t_in_word.start() > _entity_text_len + 8:
			
 
				+                        break
			
 
				                     begin_index = t_in_word.end()
			
 
				                     # print('t_in_word',entity_text,t_in_word.groupdict())
			
 
				                     day = t_in_word.groupdict().get('day',"")
			
@@ -3169,35 +3177,35 @@ def getTimeAttributes(list_entity,list_sentence):
 
				                     # print(definite_time)
			
 
				                     definite_time_list.append(definite_time)
			
 
				 
			
 
				-            if t_out_of_word:
			
 
				-                # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
			
 
				-                day = t_out_of_word.groupdict().get('day', "")
			
 
				-                hour = t_out_of_word.groupdict().get('hour', "")
			
 
				-                half_hour = t_out_of_word.groupdict().get('half_hour', "")
			
 
				-                minute = t_out_of_word.groupdict().get('minute', "")
			
 
				-                second = t_out_of_word.groupdict().get('second', "")
			
 
				-                if hour:
			
 
				-                    if day == '下午' and int(hour) < 12:
			
 
				-                        hour = str(int(hour) + 12)
			
 
				-                    if int(hour) > 24:
			
 
				-                        continue
			
 
				-                else:
			
 
				-                    hour = "00"
			
 
				-                if not minute:
			
 
				-                    if half_hour:
			
 
				-                        minute = "30"
			
 
				-                    else:
			
 
				-                        minute = "00"
			
 
				-                if int(minute) > 60:
			
 
				-                    continue
			
 
				-                if not second:
			
 
				-                    second = "00"
			
 
				-                if int(second) > 60:
			
 
				-                    continue
			
 
				-                definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
			
 
				-                # print(definite_time)
			
 
				-                definite_time_list.append(definite_time)
			
 
				-
			
 
				+            # if t_out_of_word:
			
 
				+            #     # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
			
 
				+            #     day = t_out_of_word.groupdict().get('day', "")
			
 
				+            #     hour = t_out_of_word.groupdict().get('hour', "")
			
 
				+            #     half_hour = t_out_of_word.groupdict().get('half_hour', "")
			
 
				+            #     minute = t_out_of_word.groupdict().get('minute', "")
			
 
				+            #     second = t_out_of_word.groupdict().get('second', "")
			
 
				+            #     if hour:
			
 
				+            #         if day == '下午' and int(hour) < 12:
			
 
				+            #             hour = str(int(hour) + 12)
			
 
				+            #         if int(hour) > 24:
			
 
				+            #             continue
			
 
				+            #     else:
			
 
				+            #         hour = "00"
			
 
				+            #     if not minute:
			
 
				+            #         if half_hour:
			
 
				+            #             minute = "30"
			
 
				+            #         else:
			
 
				+            #             minute = "00"
			
 
				+            #     if int(minute) > 60:
			
 
				+            #         continue
			
 
				+            #     if not second:
			
 
				+            #         second = "00"
			
 
				+            #     if int(second) > 60:
			
 
				+            #         continue
			
 
				+            #     definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
			
 
				+            #     # print(definite_time)
			
 
				+            #     definite_time_list.append(definite_time)
			
 
				+            #
			
 
				 
			
 
				             min_len = min(len(extract_time),len(definite_time_list))
			
 
				             for i in range(min_len):
			
@@ -3507,7 +3515,7 @@ def getTimeAttributes(list_entity,list_sentence):
 
				     return result_dict
			
 
				 
			
 
				 
			
 
				-def getOtherAttributes(list_entity):
			
 
				+def getOtherAttributes(list_entity,page_time):
			
 
				     dict_other = {"moneysource":"",
			
 
				                   "person_review":[],
			
 
				                   "serviceTime":"",
			
@@ -3553,7 +3561,7 @@ def getOtherAttributes(list_entity):
 
				                 for _serviceTime in list_time:
			
 
				                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
			
 
				                     if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;；]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
			
 
				-                        _extract_time = my_timeFormat(_serviceTime.entity_text)
			
 
				+                        _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
			
 
				                         if _extract_time and len(_extract_time)==2:
			
 
				                             # 排除开始和结束时间一样的错误模板，例：“履约期限：2023年02月15日至2023年02月15日”
			
 
				                             if _extract_time[0]!=_extract_time[1]:
			
@@ -3588,7 +3596,7 @@ def getOtherAttributes(list_entity):
 
				 def getMoneyRange(RoleList):
			
 
				     pass
			
 
				 
			
 
				-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
			
 
				+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
			
 
				     '''
			
 
				     @param:
			
 
				         list_sentence:所有文章的句子list
			
@@ -3599,7 +3607,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
 
				     for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
			
 
				         RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
			
 
				         result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
			
 
				-                           **getTimeAttributes(list_entity, list_sentence),
			
 
				+                           **getTimeAttributes(list_entity, list_sentence,page_time),
			
 
				                            **{"fingerprint": list_article.fingerprint,
			
 
				                               "match_enterprise": list_article.match_enterprise,
			
 
				                               "match_enterprise_type": list_article.match_enterprise_type,
			
--- a/BiddingKG/dl/interface/kongjing_label_keywords.csv
+++ b/BiddingKG/dl/interface/kongjing_label_keywords.csv
@@ -0,0 +1,218 @@
 
				+关键词,组合词（并关系）,匹配字段,限定行业（info_type）
			
 
				+污染治理,室内环境,正文,
			
 
				+通风系统,集中空调,正文,
			
 
				+通风系统,中央空调,正文,
			
 
				+清洁消毒服务,公共场所,正文,
			
 
				+循环水处理服务,,正文,
			
 
				+清洗,油烟,正文,
			
 
				+病媒生物防制,,正文,
			
 
				+清洁消毒服务,医院,正文,
			
 
				+白蚁防治服务,,正文,
			
 
				+空气净化器,,产品,
			
 
				+消毒机,,产品,
			
 
				+新风机,,产品,
			
 
				+新风系统,,产品,
			
 
				+污水消毒耗材,,正文,
			
 
				+废气治理,,正文,
			
 
				+医用耗材,,产品,
			
 
				+医疗耗材,,产品,
			
 
				+污水处理服务,,正文,
			
 
				+水处理服务,,正文,
			
 
				+空调清洗,院部,正文,
			
 
				+保洁服务,,正文,
			
 
				+室内空气净化,,正文,
			
 
				+除甲醛,,产品,
			
 
				+清洗服务,中央空调,正文,
			
 
				+食堂,外包,正文,
			
 
				+除异味设施,,正文,
			
 
				+清洗消毒服务,设备,正文,
			
 
				+保洁服务,垃圾,正文,
			
 
				+杀虫灭害服务,,正文,
			
 
				+杀虫服务,,正文,
			
 
				+保洁服务,除四害,正文,
			
 
				+保洁服务,医院,正文,
			
 
				+管护服务,公厕,正文,
			
 
				+病媒生物防治,,正文,
			
 
				+消毒服务,学校,正文,
			
 
				+消毒服务,小学,正文,
			
 
				+物业管理服务,,正文,
			
 
				+空气治理,,正文,
			
 
				+消杀服务,码头,正文,
			
 
				+仓库,,产品,水资源管理|水文服务|污水处理及其再生利用|环境污染防治设备|环境治理业|环境与生态监测检测服务|清洁服务|卫生|社会工作|生态保护|环保咨询
			
 
				+空气净化,,产品,
			
 
				+水质检测,,产品,
			
 
				+空调维修,,正文,
			
 
				+车内保养异味,,正文,
			
 
				+大健康,,产品,
			
 
				+宠物毛发,,正文,
			
 
				+甲醛,医院,正文,
			
 
				+消毒服务,养殖,正文,
			
 
				+消毒服务,,正文,
			
 
				+空调清洗,,正文,
			
 
				+冷却塔清洗,,正文,
			
 
				+消杀服务,,正文,
			
 
				+异味处理,,正文,
			
 
				+净水设备,,正文,
			
 
				+白蚁防治,,正文,
			
 
				+水处理药品,,正文,
			
 
				+水处理,,产品,
			
 
				+保洁项目,,正文,
			
 
				+污水处理,物料采购,正文,
			
 
				+空调维保,,产品,
			
 
				+水处理,监测,正文,
			
 
				+水箱清洗,空调,正文,
			
 
				+水处理,设备运维,正文,
			
 
				+健康,空调,正文,
			
 
				+健康,陶瓷,正文,
			
 
				+健康,油烟机,正文,
			
 
				+健康,燃气灶,正文,
			
 
				+健康,集成灶,正文,
			
 
				+健康,橱柜,正文,
			
 
				+健康,乘用车,正文,
			
 
				+健康,酒店,正文,
			
 
				+健康,空气净化器,正文,
			
 
				+健康,新风系统,正文,
			
 
				+健康,新风机,正文,
			
 
				+健康,照明,正文,
			
 
				+健康,涂料,正文,
			
 
				+空气净化,产品,产品,
			
 
				+空气净化,OEM,产品,
			
 
				+ODM企业,,正文,
			
 
				+室内环境污染治理企业,,正文,
			
 
				+室内空气质量及相关产品检验检测机构,,正文,
			
 
				+空调运维,,正文,
			
 
				+灭白蚁,,正文,
			
 
				+防治白蚁,,正文,
			
 
				+杀虫灭鼠,,正文,
			
 
				+空气污染,,正文,
			
 
				+甲醛治理,,正文,
			
 
				+除味,,产品,
			
 
				+味道治理,,正文,
			
 
				+装修治理,,正文,
			
 
				+空气检测,,正文,
			
 
				+甲醛检测,,正文,
			
 
				+CMA检测,,正文,
			
 
				+TVOC治理,,正文,
			
 
				+治理药剂,,产品,
			
 
				+除醛剂,,产品,
			
 
				+除味剂,,产品,
			
 
				+光触媒,,产品,
			
 
				+生物酶,,产品,
			
 
				+甲醛封闭剂,,产品,
			
 
				+空气消毒,,正文,
			
 
				+室内消毒,,正文,
			
 
				+净化器,,正文,
			
 
				+消毒器,,正文,
			
 
				+治理药水,,产品,
			
 
				+物业保洁,,正文,
			
 
				+社区保洁,,正文,
			
 
				+工厂保洁,,正文,
			
 
				+校园保洁,,正文,
			
 
				+道路保洁,,正文,
			
 
				+室内保洁,,正文,
			
 
				+保洁劳务,,正文,
			
 
				+景区保洁,,正文,
			
 
				+医院保洁,,正文,
			
 
				+清扫服务,,正文,
			
 
				+清洁卫生,,正文,
			
 
				+保洁外派,,正文,
			
 
				+保洁外包,,正文,
			
 
				+开荒保洁,,正文,
			
 
				+聘请专业保洁,,正文,
			
 
				+开荒清洁,,正文,
			
 
				+保洁劳务外包,,正文,
			
 
				+清洁服务,,正文,
			
 
				+清扫保洁,,正文,
			
 
				+办公楼保洁,,正文,
			
 
				+消杀,,产品,
			
 
				+公司保洁,,正文,
			
 
				+保洁承包,,正文,
			
 
				+氟化物废物,,正文,
			
 
				+废油气,,正文,
			
 
				+含氰废物,,正文,
			
 
				+精馏残渣,,正文,
			
 
				+蒸馏残渣,,正文,
			
 
				+废吸附剂,,正文,
			
 
				+废密封剂,,正文,
			
 
				+化学废物,,正文,
			
 
				+爆炸性废物,,正文,
			
 
				+焚烧处置残渣,,正文,
			
 
				+氰化物废物,,正文,
			
 
				+废齿轮油,,正文,
			
 
				+涂料废物,,正文,
			
 
				+废发动机油,,正文,
			
 
				+废酸,,产品,
			
 
				+废防锈油,,正文,
			
 
				+废碱,,产品,
			
 
				+废冷冻机油,,正文,
			
 
				+磷化合物废物,,正文,
			
 
				+废润滑油,,正文,
			
 
				+废石蜡,,正文,
			
 
				+废液压油,,正文,
			
 
				+医疗废物,,正文,
			
 
				+废催化剂,,正文,
			
 
				+废银催化剂,,正文,
			
 
				+废溶剂,,正文,
			
 
				+医药废物,,正文,
			
 
				+农药废物,,正文,
			
 
				+废液,,产品,
			
 
				+有机溶剂废物,,产品,
			
 
				+废油,,产品,
			
 
				+废焦油,,产品,
			
 
				+废矿物油,,产品,
			
 
				+染料废物,,产品,
			
 
				+活性炭过滤器,,产品,
			
 
				+复合式净化器,,产品,
			
 
				+滤芯,,产品,
			
 
				+分离过滤器,,产品,
			
 
				+聚结器,,产品,
			
 
				+清洗服务,,产品,
			
 
				+物业维修,,产品,
			
 
				+物业项目,,产品,
			
 
				+物业服务,,产品,
			
 
				+物业采购,,产品,
			
 
				+物业管理,,产品,
			
 
				+物业业务,,产品,
			
 
				+物业外包,,产品,
			
 
				+物业合同,,产品,
			
 
				+小区物业,,产品,
			
 
				+后勤服务,,产品,
			
 
				+办公物业,,产品,
			
 
				+后勤保障,,产品,
			
 
				+物管服务,,产品,
			
 
				+后勤社会化,,产品,
			
 
				+物业托管,,产品,
			
 
				+物业委托,,产品,
			
 
				+宿舍管理,,产品,
			
 
				+公寓管理,,产品,
			
 
				+后勤管理,,产品,
			
 
				+后勤综合,,产品,
			
 
				+物业综合,,产品,
			
 
				+消毒剂,,产品,
			
 
				+恒温培养箱,,产品,
			
 
				+医用真空干燥,,产品,
			
 
				+灭菌烘箱,,产品,
			
 
				+微波智能设备,,产品,
			
 
				+高温干蒸设备,,产品,
			
 
				+医用干燥柜,,产品,
			
 
				+等离子消毒器,,产品,
			
 
				+清洗消毒器,,产品,
			
 
				+高温灭菌器,,产品,
			
 
				+低温灭菌器,,产品,
			
 
				+除异味,,产品,
			
 
				+有害气体,,产品,
			
 
				+污染防治,,产品,
			
 
				+空气污染物,,产品,
			
 
				+物业管理,,行业,
			
 
				+清洁服务,,行业,
			
 
				+污水处理及其再生利用,,行业,
			
 
				+卫生,,行业,
			
 
				+水资源管理,,行业,
			
 
				+水文服务,,行业,
			
 
				+生态保护,,行业,
			
 
				+社会工作,,行业,
			
 
				+环境治理业,,行业,
			
 
				+环境与生态监测检测服务,,行业,
			
 
				+环境污染防治设备,,行业,
			
 
				+环保咨询,,行业,
			
 
				+仓储业,,行业,
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -4581,6 +4581,7 @@ class ProjectLabel():
 
				     def __init__(self, ):
			
 
				 
			
 
				         self.keyword_list = self.get_label_keywords()
			
 
				+        self.kongjing_keyword_list = self.get_kongjing_keywords()
			
 
				 
			
 
				     def get_label_keywords(self):
			
 
				         import csv
			
@@ -4600,6 +4601,25 @@ class ProjectLabel():
 
				                 key_word_list.append((type, key_wrod, key_paichuci, type_paichuci))
			
 
				         return key_word_list
			
 
				 
			
 
    def get_kongjing_keywords(self):
        """Load the keyword table from ``kongjing_label_keywords.csv``.

        The CSV is looked up in the directory of this module. Each data row
        holds four columns: keyword, companion keyword, search field and an
        optional ``|``-separated industry restriction.

        Returns:
            list of ``(keyword, companion, search_type, info_types)`` string
            tuples, one per usable row. Missing cells and the literal
            ``nan`` are normalised to ``""``.
        """
        import csv

        # Build the path from an absolute module location so it stays valid
        # even when ``__file__`` is relative.
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'kongjing_label_keywords.csv')

        def _cell(row, idx):
            # Tolerate short rows; treat a stringified NaN as an empty cell.
            value = str(row[idx]).strip() if idx < len(row) else ""
            return "" if value == 'nan' else value

        key_word_list = []
        # 'utf-8-sig' strips a BOM if present, so the header check below
        # still recognises the first row; plain UTF-8 reads identically.
        with open(path, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.reader(f):
                key_wrod = _cell(row, 0)
                # Skip blank rows, the header row, and rows without a
                # keyword: an empty keyword would match every text later.
                if not key_wrod or key_wrod == '关键词':
                    continue
                key_word_list.append(
                    (key_wrod, _cell(row, 1), _cell(row, 2), _cell(row, 3)))
        return key_word_list
			
 
				+
			
 
				     def predict(self, doctitle,product,project_name,prem):
			
 
				 
			
 
				         doctitle = doctitle if doctitle else ""
			
@@ -4618,7 +4638,8 @@ class ProjectLabel():
 
				                     if link['role_name'] == 'agency' and agency == "":
			
 
				                         agency = link['role_text']
			
 
				         except Exception as e:
			
 
				-            print('解析prem 获取招标人、代理人出错')
			
 
				+            # print('解析prem 获取招标人、代理人出错')
			
 
				+            pass
			
 
				         sub_project_names = ";".join(sub_project_names)
			
 
				         # 核心字段：标题+产品词+项目名称+标段名称
			
 
				         main_text = "，".join([doctitle, product, project_name, sub_project_names])
			
@@ -4691,6 +4712,55 @@ class ProjectLabel():
 
				 
			
 
				         return {"标题":doctitle_dict,"核心字段":main_text_dict}
			
 
				 
			
 
    def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):
        """Add the extra "空净通" label to ``project_label`` when keywords match.

        Every entry of ``self.kongjing_keyword_list`` is a
        ``(keyword, companion, search_type, info_types)`` tuple:

        * ``search_type == '行业'``: the keyword must equal the document's
          industry class name.
        * ``search_type == '正文'``: the keyword is searched in the main text
          (content before ``##attachment##``) plus title, project name and
          products.
        * ``search_type == '产品'``: the keyword is searched in title,
          project name and products only.

        For text searches a non-empty companion keyword must also occur in
        the same text, and a non-empty ``|``-separated ``info_types`` list
        must contain the industry class name.

        Args:
            project_label: dict of labels; matches are stored under
                ``project_label["核心字段"]["空净通"]``.
            industry: dict read as ``industry['industry']['class_name']``;
                ``None`` or missing keys are treated as "no industry".
            doctitle: document title, may be empty or ``None``.
            project_name: project name, may be empty or ``None``.
            product: comma-separated product words, may be empty or ``None``.
            list_articles: sequence whose first item exposes ``.content``.

        Returns:
            The same ``project_label`` object, updated in place with at most
            ten ``[keyword, 1]`` pairs when at least one keyword matched.
        """
        # Main text only: everything after the attachment marker is dropped.
        content = list_articles[0].content if list_articles else ""
        doctextcon = (content or "").split('##attachment##')[0]
        industry_info = (industry or {}).get('industry') or {}
        info_type = industry_info.get("class_name") or ""
        doctitle = doctitle or ""
        project_name = project_name or ""
        # De-duplicate product words while keeping their original order, so
        # the search text is deterministic (a set has no stable order).
        product = ",".join(dict.fromkeys((product or "").split(',')))

        # The search texts do not depend on the keyword: build them once.
        product_text = "，".join([doctitle, project_name, product])
        search_texts = {
            '正文': "，".join([doctextcon, product_text]),
            '产品': product_text,
        }

        keywords_list = []
        for item in self.kongjing_keyword_list:
            key_wrod, key_wrod2, search_type, info_types = item[:4]
            if not key_wrod:
                # An empty keyword is a substring of every text; ignore it.
                continue

            if search_type == '行业':
                # Industry rows compare the keyword with the class name directly.
                if info_type == key_wrod:
                    keywords_list.append(key_wrod)
                continue

            search_text = search_texts.get(search_type, "")
            if key_wrod not in search_text:
                continue
            if key_wrod2 and key_wrod2 not in search_text:
                continue
            if info_types and info_type not in info_types.split("|"):
                continue
            # Record the matched keyword (with its companion, if any).
            keywords_list.append(key_wrod + '+' + key_wrod2 if key_wrod2 else key_wrod)

        if keywords_list:
            project_label.setdefault("核心字段", {})["空净通"] = [[word, 1] for word in keywords_list[:10]]

        return project_label
			
 
				+
			
 
				+
			
 
				 # 总价单价提取
			
 
				 class TotalUnitMoney:
			
 
				     def __init__(self):
			
--- a/BiddingKG/dl/interface/project_label_keywords.csv
+++ b/BiddingKG/dl/interface/project_label_keywords.csv
@@ -17,6 +17,7 @@
 
				 家具,茶水柜,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 家具,方几,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 家具,条几,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				+学校家具,学校家具,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 学校家具,教学家具,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 学校家具,演示台,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 学校家具,讨论桌椅,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
@@ -30,6 +31,7 @@
 
				 学校家具,教师休息柜,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 学校家具,学习卡座带写字板,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 学校家具,教师休息椅,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				+办公家具,办公家具,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 办公家具,职员工位,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 办公家具,班台,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
 
				 办公家具,会议台,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
			
@@ -10787,7 +10789,7 @@
 
				 道路工程,桥梁维修,,设备采购、设计、研究、方法、环评、报废、废旧、回收、生产、研发、监理、咨询、编制、审计
			
 
				 道路工程,隧道检修,,设备采购、设计、研究、方法、环评、报废、废旧、回收、生产、研发、监理、咨询、编制、审计
			
 
				 给排水工程,给排水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
			
 
				-,饮水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
			
 
				+给排水工程,饮水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
			
 
				 给排水工程,下水道工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
			
 
				 给排水工程,供水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
			
 
				 给排水工程,自来水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品