ソースを参照

Merge remote-tracking branch 'origin/master'

lsm 1 年前
コミット
96b2e9dfae

+ 15 - 5
BiddingKG/dl/interface/Preprocessing.py

@@ -1971,9 +1971,11 @@ def article_limit(soup,limit_words=30000):
         _gap = _count - max_count
         _is_skip = False
         next_soup = None
+        # 跳过层级结构为1的标签,向下取值
         while len(_soup.find_all(recursive=False)) == 1 and \
                 _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
             _soup = _soup.find_all(recursive=False)[0]
+        # 无结构的纯文本直接取值
         if len(_soup.find_all(recursive=False)) == 0:
             _soup.string = str(_soup.get_text())[:max_count-_count]
             _count += len(re.sub(sub_space, "", _soup.string))
@@ -2007,22 +2009,24 @@ def article_limit(soup,limit_words=30000):
                 have_attachment = True
                 break
     if not have_attachment:
-        # 无附件
+        # 无附件,通过get_text()方法与limit_words大小判断是否要限制字数
         if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
-            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
+            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=1000)
             while n_soup:
-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
 
     else:
         # 有附件
         _text = re.sub(sub_space, "", soup.get_text())
         _text_split = _text.split("##attachment##")
+        # 正文部分
         if len(_text_split[0])>limit_words:
             main_soup = attachment_part.parent
             main_text = main_soup.find_all(recursive=False)[0]
-            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
+            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=1000)
             while n_soup:
-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
+        # 附件部分
         if len(_text_split[1])>limit_words:
             # attachment_html纯文本,无子结构
             if len(attachment_part.find_all(recursive=False))==0:
@@ -2042,6 +2046,12 @@ def article_limit(soup,limit_words=30000):
                                         attachment_skip = True
                                 else:
                                     p_part.decompose()
+                            # attachment_text_nums, gap, n_part = soup_limit(part, attachment_text_nums,
+                            #                                     max_count=limit_words,max_gap=1000)
+                            # while n_part:
+                            #     attachment_text_nums, gap, n_part = soup_limit(n_part, attachment_text_nums,
+                            #                                         max_count=limit_words,max_gap=1000)
+                            # print(attachment_text_nums)
                         else:
                             last_attachment_text_nums = attachment_text_nums
                             attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))

+ 4 - 2
BiddingKG/dl/interface/extract.py

@@ -272,7 +272,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     entityLink.link_entitys(list_entitys)
     doctitle_refine = entityLink.doctitle_refine(title)
     nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
@@ -317,7 +317,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
-    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time))
 
     '''更新单一来源招标公告中标角色为预中标'''
     getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
@@ -345,6 +345,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''根据关键词表生成项目标签'''
     project_label = predictor.getPredictor('project_label').predict(title,product=','.join(product_list),project_name=codeName[0]['name'],prem=prem)
+    # 额外需求的标签
+    project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
     # print(project_label)
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]

+ 53 - 45
BiddingKG/dl/interface/getAttributes.py

@@ -1433,7 +1433,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     last_phone_mask = False
                     continue
                 # 排除号码实体为时间格式 ,例如:20150515
-                if re.search("^20(1[0-9]|2[0-2])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
+                if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
                     error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
@@ -2982,9 +2982,12 @@ def turnMoneySource(moneysource):
 
 my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
-import time
-def my_timeFormat(_time):
-    current_year = time.strftime("%Y",time.localtime())
+import time,datetime
+def my_timeFormat(_time,page_time):
+    if page_time:
+        current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
+    else:
+        current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(my_time_format_pattern,_time)
     time_list = []
     for _match in all_match:
@@ -3004,10 +3007,10 @@ def my_timeFormat(_time):
                 if re.search("^\d+$", year):
                     if len(year) == 2:
                         year = "20" + year
-                        if int(year) - int(current_year) > 5:
+                        if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
                             legal = False
                     else:
-                        if int(year) - int(current_year)>10:
+                        if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
                             legal = False
                 else:
                     _year = ""
@@ -3053,7 +3056,7 @@ def my_timeFormat(_time):
                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
     return time_list
 
-def getTimeAttributes(list_entity,list_sentence):
+def getTimeAttributes(list_entity,list_sentence,page_time):
     time_entitys = [i for i in list_entity if i.entity_type=='time']
     time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
@@ -3122,24 +3125,29 @@ def getTimeAttributes(list_entity,list_sentence):
             last_time_type = ""
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
-        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
-        extract_time = my_timeFormat(entity_text)
+        extract_time = my_timeFormat(entity_text,page_time)
         # print(entity_text,entity_left2)
-        # definite_time = "00:00:00"
         if extract_time:
             definite_time_list = []
-            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
+            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
             _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
+            _entity_text_len = len(_entity_text)
+            _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
             t_in_word_num = len(re.findall(t,_entity_text))
-            t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
+            # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
             begin_index = 0
             for _num in range(t_in_word_num):
+                if begin_index> _entity_text_len + 8:
+                    break
                 t_in_word = re.search(t, _entity_text[begin_index:])
                 if t_in_word:
+                    if _num==0 and t_in_word.start() > _entity_text_len + 8:
+                        break
                     begin_index = t_in_word.end()
                     # print('t_in_word',entity_text,t_in_word.groupdict())
                     day = t_in_word.groupdict().get('day',"")
@@ -3169,35 +3177,35 @@ def getTimeAttributes(list_entity,list_sentence):
                     # print(definite_time)
                     definite_time_list.append(definite_time)
 
-            if t_out_of_word:
-                # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
-                day = t_out_of_word.groupdict().get('day', "")
-                hour = t_out_of_word.groupdict().get('hour', "")
-                half_hour = t_out_of_word.groupdict().get('half_hour', "")
-                minute = t_out_of_word.groupdict().get('minute', "")
-                second = t_out_of_word.groupdict().get('second', "")
-                if hour:
-                    if day == '下午' and int(hour) < 12:
-                        hour = str(int(hour) + 12)
-                    if int(hour) > 24:
-                        continue
-                else:
-                    hour = "00"
-                if not minute:
-                    if half_hour:
-                        minute = "30"
-                    else:
-                        minute = "00"
-                if int(minute) > 60:
-                    continue
-                if not second:
-                    second = "00"
-                if int(second) > 60:
-                    continue
-                definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
-                # print(definite_time)
-                definite_time_list.append(definite_time)
-
+            # if t_out_of_word:
+            #     # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
+            #     day = t_out_of_word.groupdict().get('day', "")
+            #     hour = t_out_of_word.groupdict().get('hour', "")
+            #     half_hour = t_out_of_word.groupdict().get('half_hour', "")
+            #     minute = t_out_of_word.groupdict().get('minute', "")
+            #     second = t_out_of_word.groupdict().get('second', "")
+            #     if hour:
+            #         if day == '下午' and int(hour) < 12:
+            #             hour = str(int(hour) + 12)
+            #         if int(hour) > 24:
+            #             continue
+            #     else:
+            #         hour = "00"
+            #     if not minute:
+            #         if half_hour:
+            #             minute = "30"
+            #         else:
+            #             minute = "00"
+            #     if int(minute) > 60:
+            #         continue
+            #     if not second:
+            #         second = "00"
+            #     if int(second) > 60:
+            #         continue
+            #     definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
+            #     # print(definite_time)
+            #     definite_time_list.append(definite_time)
+            #
 
             min_len = min(len(extract_time),len(definite_time_list))
             for i in range(min_len):
@@ -3507,7 +3515,7 @@ def getTimeAttributes(list_entity,list_sentence):
     return result_dict
 
 
-def getOtherAttributes(list_entity):
+def getOtherAttributes(list_entity,page_time):
     dict_other = {"moneysource":"",
                   "person_review":[],
                   "serviceTime":"",
@@ -3553,7 +3561,7 @@ def getOtherAttributes(list_entity):
                 for _serviceTime in list_time:
                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
                     if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
-                        _extract_time = my_timeFormat(_serviceTime.entity_text)
+                        _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
                         if _extract_time and len(_extract_time)==2:
                             # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
                             if _extract_time[0]!=_extract_time[1]:
@@ -3588,7 +3596,7 @@ def getOtherAttributes(list_entity):
 def getMoneyRange(RoleList):
     pass
 
-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
     '''
     @param:
         list_sentence:所有文章的句子list
@@ -3599,7 +3607,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
     for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
         RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
         result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
-                           **getTimeAttributes(list_entity, list_sentence),
+                           **getTimeAttributes(list_entity, list_sentence,page_time),
                            **{"fingerprint": list_article.fingerprint,
                               "match_enterprise": list_article.match_enterprise,
                               "match_enterprise_type": list_article.match_enterprise_type,

+ 218 - 0
BiddingKG/dl/interface/kongjing_label_keywords.csv

@@ -0,0 +1,218 @@
+关键词,组合词(并关系),匹配字段,限定行业(info_type)
+污染治理,室内环境,正文,
+通风系统,集中空调,正文,
+通风系统,中央空调,正文,
+清洁消毒服务,公共场所,正文,
+循环水处理服务,,正文,
+清洗,油烟,正文,
+病媒生物防制,,正文,
+清洁消毒服务,医院,正文,
+白蚁防治服务,,正文,
+空气净化器,,产品,
+消毒机,,产品,
+新风机,,产品,
+新风系统,,产品,
+污水消毒耗材,,正文,
+废气治理,,正文,
+医用耗材,,产品,
+医疗耗材,,产品,
+污水处理服务,,正文,
+水处理服务,,正文,
+空调清洗,院部,正文,
+保洁服务,,正文,
+室内空气净化,,正文,
+除甲醛,,产品,
+清洗服务,中央空调,正文,
+食堂,外包,正文,
+除异味设施,,正文,
+清洗消毒服务,设备,正文,
+保洁服务,垃圾,正文,
+杀虫灭害服务,,正文,
+杀虫服务,,正文,
+保洁服务,除四害,正文,
+保洁服务,医院,正文,
+管护服务,公厕,正文,
+病媒生物防治,,正文,
+消毒服务,学校,正文,
+消毒服务,小学,正文,
+物业管理服务,,正文,
+空气治理,,正文,
+消杀服务,码头,正文,
+仓库,,产品,水资源管理|水文服务|污水处理及其再生利用|环境污染防治设备|环境治理业|环境与生态监测检测服务|清洁服务|卫生|社会工作|生态保护|环保咨询
+空气净化,,产品,
+水质检测,,产品,
+空调维修,,正文,
+车内保养异味,,正文,
+大健康,,产品,
+宠物毛发,,正文,
+甲醛,医院,正文,
+消毒服务,养殖,正文,
+消毒服务,,正文,
+空调清洗,,正文,
+冷却塔清洗,,正文,
+消杀服务,,正文,
+异味处理,,正文,
+净水设备,,正文,
+白蚁防治,,正文,
+水处理药品,,正文,
+水处理,,产品,
+保洁项目,,正文,
+污水处理,物料采购,正文,
+空调维保,,产品,
+水处理,监测,正文,
+水箱清洗,空调,正文,
+水处理,设备运维,正文,
+健康,空调,正文,
+健康,陶瓷,正文,
+健康,油烟机,正文,
+健康,燃气灶,正文,
+健康,集成灶,正文,
+健康,橱柜,正文,
+健康,乘用车,正文,
+健康,酒店,正文,
+健康,空气净化器,正文,
+健康,新风系统,正文,
+健康,新风机,正文,
+健康,照明,正文,
+健康,涂料,正文,
+空气净化,产品,产品,
+空气净化,OEM,产品,
+ODM企业,,正文,
+室内环境污染治理企业,,正文,
+室内空气质量及相关产品检验检测机构,,正文,
+空调运维,,正文,
+灭白蚁,,正文,
+防治白蚁,,正文,
+杀虫灭鼠,,正文,
+空气污染,,正文,
+甲醛治理,,正文,
+除味,,产品,
+味道治理,,正文,
+装修治理,,正文,
+空气检测,,正文,
+甲醛检测,,正文,
+CMA检测,,正文,
+TVOC治理,,正文,
+治理药剂,,产品,
+除醛剂,,产品,
+除味剂,,产品,
+光触媒,,产品,
+生物酶,,产品,
+甲醛封闭剂,,产品,
+空气消毒,,正文,
+室内消毒,,正文,
+净化器,,正文,
+消毒器,,正文,
+治理药水,,产品,
+物业保洁,,正文,
+社区保洁,,正文,
+工厂保洁,,正文,
+校园保洁,,正文,
+道路保洁,,正文,
+室内保洁,,正文,
+保洁劳务,,正文,
+景区保洁,,正文,
+医院保洁,,正文,
+清扫服务,,正文,
+清洁卫生,,正文,
+保洁外派,,正文,
+保洁外包,,正文,
+开荒保洁,,正文,
+聘请专业保洁,,正文,
+开荒清洁,,正文,
+保洁劳务外包,,正文,
+清洁服务,,正文,
+清扫保洁,,正文,
+办公楼保洁,,正文,
+消杀,,产品,
+公司保洁,,正文,
+保洁承包,,正文,
+氟化物废物,,正文,
+废油气,,正文,
+含氰废物,,正文,
+精馏残渣,,正文,
+蒸馏残渣,,正文,
+废吸附剂,,正文,
+废密封剂,,正文,
+化学废物,,正文,
+爆炸性废物,,正文,
+焚烧处置残渣,,正文,
+氰化物废物,,正文,
+废齿轮油,,正文,
+涂料废物,,正文,
+废发动机油,,正文,
+废酸,,产品,
+废防锈油,,正文,
+废碱,,产品,
+废冷冻机油,,正文,
+磷化合物废物,,正文,
+废润滑油,,正文,
+废石蜡,,正文,
+废液压油,,正文,
+医疗废物,,正文,
+废催化剂,,正文,
+废银催化剂,,正文,
+废溶剂,,正文,
+医药废物,,正文,
+农药废物,,正文,
+废液,,产品,
+有机溶剂废物,,产品,
+废油,,产品,
+废焦油,,产品,
+废矿物油,,产品,
+染料废物,,产品,
+活性炭过滤器,,产品,
+复合式净化器,,产品,
+滤芯,,产品,
+分离过滤器,,产品,
+聚结器,,产品,
+清洗服务,,产品,
+物业维修,,产品,
+物业项目,,产品,
+物业服务,,产品,
+物业采购,,产品,
+物业管理,,产品,
+物业业务,,产品,
+物业外包,,产品,
+物业合同,,产品,
+小区物业,,产品,
+后勤服务,,产品,
+办公物业,,产品,
+后勤保障,,产品,
+物管服务,,产品,
+后勤社会化,,产品,
+物业托管,,产品,
+物业委托,,产品,
+宿舍管理,,产品,
+公寓管理,,产品,
+后勤管理,,产品,
+后勤综合,,产品,
+物业综合,,产品,
+消毒剂,,产品,
+恒温培养箱,,产品,
+医用真空干燥,,产品,
+灭菌烘箱,,产品,
+微波智能设备,,产品,
+高温干蒸设备,,产品,
+医用干燥柜,,产品,
+等离子消毒器,,产品,
+清洗消毒器,,产品,
+高温灭菌器,,产品,
+低温灭菌器,,产品,
+除异味,,产品,
+有害气体,,产品,
+污染防治,,产品,
+空气污染物,,产品,
+物业管理,,行业,
+清洁服务,,行业,
+污水处理及其再生利用,,行业,
+卫生,,行业,
+水资源管理,,行业,
+水文服务,,行业,
+生态保护,,行业,
+社会工作,,行业,
+环境治理业,,行业,
+环境与生态监测检测服务,,行业,
+环境污染防治设备,,行业,
+环保咨询,,行业,
+仓储业,,行业,

+ 71 - 1
BiddingKG/dl/interface/predictor.py

@@ -4581,6 +4581,7 @@ class ProjectLabel():
     def __init__(self, ):
 
         self.keyword_list = self.get_label_keywords()
+        self.kongjing_keyword_list = self.get_kongjing_keywords()
 
     def get_label_keywords(self):
         import csv
@@ -4600,6 +4601,25 @@ class ProjectLabel():
                 key_word_list.append((type, key_wrod, key_paichuci, type_paichuci))
         return key_word_list
 
+    def get_kongjing_keywords(self):  # Load the keyword rules from kongjing_label_keywords.csv (same directory as this module) as a list of 4-tuples.
+        import csv
+        path = os.path.dirname(__file__)+'/kongjing_label_keywords.csv'
+        with open(path, 'r',encoding='utf-8') as f:
+            reader = csv.reader(f)
+            key_word_list = []
+            for r in reader:
+                if r[0] == '关键词':  # skip the header row, whose first cell is the literal column title
+                    continue
+                key_wrod = r[0]  # column 1: the keyword itself
+                key_wrod2 = str(r[1])  # column 2: companion word (header calls it an AND-combination word)
+                key_wrod2 = key_wrod2 if key_wrod2 and key_wrod2 != 'nan' else ""  # empty cell or the literal text 'nan' means "no companion word"
+                search_type = r[2]  # column 3: which field to match against (values seen in the CSV: 正文 / 产品 / 行业)
+                info_type_list = str(r[3])  # column 4: optional industry restriction, kept as the raw '|'-separated string
+                info_type_list = info_type_list if info_type_list and info_type_list != 'nan' else ""  # same empty/'nan' normalisation as above
+
+                key_word_list.append((key_wrod, key_wrod2, search_type, info_type_list))  # tuple order is relied on by index in predict_other
+        return key_word_list
+
     def predict(self, doctitle,product,project_name,prem):
 
         doctitle = doctitle if doctitle else ""
@@ -4618,7 +4638,8 @@ class ProjectLabel():
                     if link['role_name'] == 'agency' and agency == "":
                         agency = link['role_text']
         except Exception as e:
-            print('解析prem 获取招标人、代理人出错')
+            # print('解析prem 获取招标人、代理人出错')
+            pass
         sub_project_names = ";".join(sub_project_names)
         # 核心字段:标题+产品词+项目名称+标段名称
         main_text = ",".join([doctitle, product, project_name, sub_project_names])
@@ -4691,6 +4712,55 @@ class ProjectLabel():
 
         return {"标题":doctitle_dict,"核心字段":main_text_dict}
 
+    def predict_other(self,project_label,industry,doctitle,project_name,product,list_articles):  # Add a "空净通" entry to project_label["核心字段"] when any kongjing keyword rule matches; mutates and returns project_label.
+        # doctextcon: main body text, i.e. the article content before the first '##attachment##' marker
+        doctextcon = list_articles[0].content.split('##attachment##')[0]
+        info_type = industry.get('industry',{}).get("class_name","")  # industry class name, "" when absent
+        doctitle = doctitle if doctitle else ""
+        product = product if product else ""
+        product = ",".join(set(product.split(',')))  # de-duplicate product words (goes through a set, so the original order is not kept)
+        project_name = project_name if project_name else ""
+
+        get_kongjing_label = False
+        keywords_list = []
+        for item in self.kongjing_keyword_list:  # each item is (keyword, companion word, search type, industry restriction)
+            key_wrod = item[0]
+            key_wrod2 = item[1]
+            search_type = item[2]
+            info_type_list = item[3]
+            info_type_list = info_type_list.split("|") if info_type_list else []  # '|'-separated industry names -> list; empty means no restriction
+
+            search_text = ""  # stays empty for any search type other than '正文' / '产品'
+            if search_type=='正文':  # body rules search body text + title + project name + products; NOTE(review): this join is rebuilt for every rule although it depends only on search_type
+                search_text = ",".join([doctextcon,doctitle,project_name,product])
+            elif search_type=='产品':  # product rules search title + project name + products only
+                search_text = ",".join([doctitle,project_name,product])
+            if search_type=='行业':
+                # '行业' (industry) rules compare info_type with the keyword for exact equality
+                if info_type==key_wrod:
+                    # record the matched keyword
+                    keywords_list.append(key_wrod)
+                    get_kongjing_label = True
+                    # break
+            else:
+                if key_wrod in search_text:
+                    if key_wrod2 and key_wrod2 not in search_text:  # a companion word, when given, must also be present
+                        continue
+                    if info_type_list and info_type not in info_type_list:  # an industry restriction, when given, must include info_type
+                        continue
+                    # record the matched keyword (joined as "keyword+companion" when a companion word exists)
+                    if key_wrod2:
+                        keywords_list.append(key_wrod+'+'+key_wrod2)
+                    else:
+                        keywords_list.append(key_wrod)
+                    get_kongjing_label = True
+                    # break
+        if get_kongjing_label:
+            project_label["核心字段"]["空净通"] = [[word,1] for word in keywords_list][:10]  # keep at most the first 10 matches, each stored as [keyword, 1]
+
+        return project_label
+
+
 # 总价单价提取
 class TotalUnitMoney:
     def __init__(self):

+ 3 - 1
BiddingKG/dl/interface/project_label_keywords.csv

@@ -17,6 +17,7 @@
 家具,茶水柜,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 家具,方几,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 家具,条几,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
+学校家具,学校家具,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 学校家具,教学家具,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 学校家具,演示台,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 学校家具,讨论桌椅,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
@@ -30,6 +31,7 @@
 学校家具,教师休息柜,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 学校家具,学习卡座带写字板,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 学校家具,教师休息椅,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
+办公家具,办公家具,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 办公家具,职员工位,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 办公家具,班台,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
 办公家具,会议台,,租赁、维保、维修、维护、报废、回收、废旧、检测、改造、售后、运维、配件、设计、修补、监理、造价、结算审核、审计
@@ -10787,7 +10789,7 @@
 道路工程,桥梁维修,,设备采购、设计、研究、方法、环评、报废、废旧、回收、生产、研发、监理、咨询、编制、审计
 道路工程,隧道检修,,设备采购、设计、研究、方法、环评、报废、废旧、回收、生产、研发、监理、咨询、编制、审计
 给排水工程,给排水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
-,饮水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
+给排水工程,饮水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
 给排水工程,下水道工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
 给排水工程,供水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品
 给排水工程,自来水工程,,编制、监理、设计、勘察、代理、咨询、检测、审计、造价、结算、杂品