10 mesiacov pred · 665768140e
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2069,7 +2069,7 @@ def segment(soup,final=True):
 
				             # text = re.sub("\s+","##space##",text)
			
 
				             return text
			
 
				     segList = ["title"]
			
 
				-    commaList = ["div","br","td","p","li"]
			
 
				+    commaList = ["div","br","td","p","li","h1","h2","h3","h4","h5","h6"]
			
 
				     #commaList = []
			
 
				     spaceList = ["span"]
			
 
				     tbodies = soup.find_all('tbody')
			
@@ -2117,11 +2117,13 @@ def segment(soup,final=True):
 
				         for _sent in re.split("。+",text):
			
 
				             for _sent2 in re.split('，+',_sent):
			
 
				                 for _sent3 in re.split("：+",_sent2):
			
 
				+                    pre_t = ''
			
 
				                     for _t in re.split("\s{4,}",_sent3):
			
 
				-                        if len(_t)<3:
			
 
				+                        if len(_t)<3 or len(pre_t)<3 or re.search('[^\w\s]$', pre_t):  # 20240726 前文小于3字或以符合结尾的不加 避免乱加逗号 例：2）    申请人的资格要求
			
 
				                             _text += _t
			
 
				                         else:
			
 
				                             _text += "，"+_t
			
 
				+                        pre_t = _t
			
 
				                     _text += "："
			
 
				                 _text = _text[:-1]
			
 
				                 _text += "，"
			
@@ -3124,7 +3126,8 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
				             sentences_set = set()
			
 
				             for _iter in re.finditer(split_patten,article_processed):
			
 
				                 _sen = article_processed[_begin:_iter.span()[1]]
			
 
				-                if len(_sen)>0 and _sen not in sentences_set:
			
 
				+                # if len(_sen)>0 and _sen not in sentences_set: # 去重导致内容丢失
			
 
				+                if len(_sen)>0 and (len(sentences)>0 and _sen != sentences[-1] or len(sentences)==0): # 2024/07/25 改为顺序去重
			
 
				                     # 标识在附件里的句子
			
 
				                     if re.search("##attachment##",_sen):
			
 
				                         attachment_begin_index = len(sentences)
			
@@ -3136,7 +3139,8 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
				             if re.search("##attachment##", _sen):
			
 
				                 # _sen = re.sub("##attachment##", "", _sen)
			
 
				                 attachment_begin_index = len(sentences)
			
 
				-            if len(_sen)>0 and _sen not in sentences_set:
			
 
				+            # if len(_sen)>0 and _sen not in sentences_set:
			
 
				+            if len(_sen)>0 and (len(sentences)>0 and _sen != sentences[-1] or len(sentences)==0):  # 2024/07/25 改为顺序去重
			
 
				                 sentences.append(_sen)
			
 
				                 sentences_set.add(_sen)
			
 
				             # 解析outline大纲分段
			
@@ -3262,6 +3266,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				     # print('all_match:', all_match)
			
 
				     for _match in all_match:
			
 
				         # print('_match: ', _match.group())
			
 
				+        if re.search('^元/1\d{10}，$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
			
 
				+            continue
			
 
				         if len(_match.group()) > 0:
			
 
				             # print("===",_match.group())
			
 
				             # # print(_match.groupdict())
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -28,6 +28,7 @@ import json
 
				 from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
			
 
				 from BiddingKG.dl.ratio.re_ratio import extract_ratio
			
 
				 from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list
			
 
				+from BiddingKG.dl.interface.get_label_dic import get_all_label
			
 
				 
			
 
				 
			
 
				 # 自定义jsonEncoder
			
@@ -251,13 +252,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     '''大纲提取及大纲内容相关提取'''
			
 
				     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
			
 
				-    tt = time.time()
			
 
				     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
			
 
				-    requirement_text, aptitude_text = extract_parameters(parse_document)
			
 
				-    if aptitude_text == '':
			
 
				+    requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
			
 
				+    if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
			
 
				         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
			
 
				-        requirement_text, aptitude_text = extract_parameters(parse_document)
			
 
				-    parse_document.print_tree(parse_document.tree)
			
 
				+        requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
			
 
				 
			
 
				     # 过滤掉Redis里值为0的错误实体
			
 
				     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
			
@@ -428,9 +427,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     log("pb_extract done of doc_id%s"%(doc_id))
			
 
				     cost_time["pb_extract"] = round(time.time() - start_time, 2)
			
 
				 
			
 
				+    '''打标签'''
			
 
				+    label_dic = get_all_label(title, list_articles[0].content)
			
 
				+
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-07-22'}
			
 
				+    version_date = {'version_date': '2024-07-26'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
@@ -474,15 +476,21 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # 资质要求
			
 
				     data_res['aptitude'] = aptitude_text[:1500]
			
 
				-
			
 
				-    for _article in list_articles:
			
 
				-            log(_article.content)
			
 
				-
			
 
				-    for list_entity in list_entitys:
			
 
				-        for _entity in list_entity:
			
 
				-            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
			
 
				-                  (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
			
 
				-                   str(_entity.begin_index),str(_entity.end_index)))
			
 
				+    # 采购内容
			
 
				+    data_res['requirement'] = requirement_text[:1500]
			
 
				+    # 打标签
			
 
				+    data_res['label_dic'] = label_dic
			
 
				+    # 开标地点
			
 
				+    data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
			
 
				+
			
 
				+    # for _article in list_articles:
			
 
				+    #         log(_article.content)
			
 
				+    #
			
 
				+    # for list_entity in list_entitys:
			
 
				+    #     for _entity in list_entity:
			
 
				+    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
			
 
				+    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
			
 
				+    #                str(_entity.begin_index),str(_entity.end_index)))
			
 
				     _extract_json = json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
			
 
				     _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
			
 
				     return _extract_json#, list_articles[0].content, get_ent_context(list_sentences, list_entitys)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -4088,7 +4088,7 @@ def  confirm_prem(prem, channel_dic):
 
				             if prem[k]['roleList'] == []:
			
 
				                 empty_roleList.append(k)
			
 
				             for d in prem[k]['roleList']:
			
 
				-                if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
			
 
				+                if d['role_name'] in ['win_tenderer', 'pre_win_tenderer', 'second_tenderer','third_tenderer']:
			
 
				                     if k == 'Project':
			
 
				                         pro_winner.add(d['role_text'])
			
 
				                     else:
			
--- a/BiddingKG/dl/interface/get_label_dic.py
+++ b/BiddingKG/dl/interface/get_label_dic.py
@@ -0,0 +1,272 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+
			
 
				+"""
			
 
				+@author: bidikeji
			
 
				+@time: 2024/7/23 14:45
			
 
				+"""
			
 
				+
			
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+
			
 
				+"""
			
 
				+@author: bidikeji
			
 
				+@time: 2024/7/11 17:56
			
 
				+"""
			
 
				+from BiddingKG.dl.common.Utils import getUnifyMoney
			
 
				+import re
			
 
				+
			
 
				+def chinese_to_arabic(s):
			
 
				+    # 中文数字到阿拉伯数字的映射
			
 
				+    num_map = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
			
 
				+               '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
			
 
				+    # 单位到倍数的映射
			
 
				+    unit_map = {'十': 10, '百': 100}
			
 
				+
			
 
				+    # 初始化结果和当前数值
			
 
				+    result = 0
			
 
				+    current_num = 0
			
 
				+    has_unit = False
			
 
				+
			
 
				+    if s.startswith('十'):
			
 
				+        result = 10
			
 
				+    # 遍历字符串
			
 
				+    for char in s:
			
 
				+        if char in num_map:
			
 
				+            # 如果是数字，则进行处理
			
 
				+            if has_unit:
			
 
				+                # 如果之前已经有单位了，则需要将当前数字乘以前面的单位
			
 
				+                result += current_num * unit_map[last_unit]
			
 
				+                current_num = num_map[char]
			
 
				+                has_unit = False
			
 
				+            else:
			
 
				+                # 如果之前没有单位，则直接累加
			
 
				+                current_num = current_num * 10 + num_map[char]
			
 
				+        elif char in unit_map:
			
 
				+            # 如果是单位，则标记为已有单位，并保存最后一个单位
			
 
				+            last_unit = char
			
 
				+            has_unit = True
			
 
				+
			
 
				+            # 处理字符串末尾的数字（如果没有单位，则直接加上）
			
 
				+    if current_num != 0:
			
 
				+        if has_unit:
			
 
				+            result += current_num * unit_map[last_unit]
			
 
				+        else:
			
 
				+            result += current_num
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+def get_all_label(title, content):
			
 
				+    def is_direct_procurement():
			
 
				+        # 企业直采
			
 
				+        if re.search('询比价|询比|竞价|竞价|议价|报价', title) or re.search('我要报价|竞价起止时间|报价起止时间', content) or \
			
 
				+                (re.search('公司|集团|企业', content) and re.search('招标|中标|投标', content) == None):
			
 
				+            return 1
			
 
				+        return 0
			
 
				+
			
 
				+    def is_target_small():
			
 
				+        # 专门面向中小企业
			
 
				+        if re.search('专门面向中小微?企业', content) and re.search('(非|不属于|不|是/否)）?专门面向中小微?企业|部分面向中小微?企业', content) == None:
			
 
				+            return 1
			
 
				+        elif re.search('仅面向小微企业|专门面向.{,30}中小企业采购|是否专门面向中小微?企业(采购)?：是|本项目为中小型企业预留项目|专门面向中小微?企业', content):
			
 
				+            return 1
			
 
				+        elif re.search('落实政府采购政策需满足的资格要求.{,30}供应商为中小企业', content) and re.search('(非|不属于|不|是/否)）?专门面向中小微?企业|部分面向中小微?企业',
			
 
				+                                                                                content) == None:
			
 
				+            return 1
			
 
				+        return 0
			
 
				+
			
 
				+    def registered_years():
			
 
				+        # 注册年限
			
 
				+        ser = None
			
 
				+        if re.search('禁止\w{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content):
			
 
				+            ser = re.search('禁止\w{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content)
			
 
				+        elif re.search('(成立|注册)时间：?\w{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content):
			
 
				+            ser = re.search('(成立|注册)时间：?\w{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))',
			
 
				+                            content)
			
 
				+        elif re.search('(成立|注册)时间：?\w{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)', content):
			
 
				+            ser = re.search('(成立|注册)时间：?\w{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)', content)
			
 
				+        elif re.search('(成立|注册)时间：?\w{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)\w{,5}请勿报价)', content):
			
 
				+            ser = re.search('(成立|注册)时间：?\w{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)\w{,5}请勿报价)', content)
			
 
				+        if ser:
			
 
				+            num = ser.group('num')
			
 
				+            unit = ser.group('unit')
			
 
				+            if num.isdigit():
			
 
				+                num = int(num)
			
 
				+            else:
			
 
				+                num = chinese_to_arabic(num)
			
 
				+            if unit == '年':
			
 
				+                num *= 12
			
 
				+            return num
			
 
				+        return 0
			
 
				+
			
 
				+    def registered_capital():
			
 
				+        # 注册资本
			
 
				+        ser = None
			
 
				+        if re.search('注册(资本|资金)：?\w{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))', content):
			
 
				+            ser = re.search('注册(资本|资金)：?\w{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))', content)
			
 
				+        elif re.search('注册(资本|资金)：?\w{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)', content):
			
 
				+            ser = re.search('注册(资本|资金)：?\w{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)', content)
			
 
				+        if ser:
			
 
				+            num = ser.group('num')
			
 
				+            unit = ser.group('unit')
			
 
				+            return float(getUnifyMoney(num + unit))
			
 
				+        return 0
			
 
				+
			
 
				+    def need_qualification():
			
 
				+        # 有资质证书要求
			
 
				+        if re.search('资质要求.{,150}(行业资质|证书|许可证|认证|经营范围|一级|二级|三级|甲级|乙级|丙级|特级|壹级|贰级|叁级)', content):
			
 
				+            return 1
			
 
				+        elif re.search('(提供|有|具备)\w{,50}(资质|认证|证书|许可证)', content):
			
 
				+            return 1
			
 
				+        elif re.search('资格）?要求：?\w{,30}(甲级|丙级|乙级|一级|二级|三级|特级|壹级|贰级|叁级)', content):
			
 
				+            return 1
			
 
				+        elif re.search('认证体系要求', content):
			
 
				+            return 1
			
 
				+        return 0
			
 
				+
			
 
				+    def need_ca():
			
 
				+        # 7 是否需要办CA
			
 
				+        if re.search('需要\w{,20}数字证书|使用\w{,20}签章', content):
			
 
				+            return 1
			
 
				+        elif re.search('办理\w{,20}(数字证书|CA|ca)', content) and re.search('无需\w{,15}办理', content) == None:
			
 
				+            return 1
			
 
				+        elif re.search('(数字证书|CA|ca)\w{,5}办理|是否要求供应商使用(CA|ca)数字证书参与：是', content):
			
 
				+            return 1
			
 
				+        if re.search('(不使用|无需)\w{,20}(数字证书|CA|ca)|是否要求供应商使用(CA|ca)数字证书参与：不要求', content):
			
 
				+            return 0
			
 
				+        return 0
			
 
				+
			
 
				+    def need_performance():
			
 
				+        # 有业绩要求
			
 
				+        if re.search('业绩证明|业绩要求|行业业绩|相关业绩', content):
			
 
				+            return 1
			
 
				+        elif re.search('类似\w{,10}业绩', content) or re.search('业绩.{,5}如有', content) == None:
			
 
				+            return 1
			
 
				+        elif re.search('完成[^，。]{,100}项目', content):
			
 
				+            return 1
			
 
				+        elif re.search('(提供|有|完成).{,100}业绩', content):
			
 
				+            return 1
			
 
				+        return 0
			
 
				+
			
 
				+    def mode_of_partipation():
			
 
				+        # 参与方式 1线上 2线下 0其他
			
 
				+        if re.search('(平台|网站|http|www|官网|网址|网页|网上中介|邮件|邮箱|客户端|采购网|系统|邮寄).{,20}(注册|报名)', content):
			
 
				+            return 1
			
 
				+        elif re.search('现场报名', content):
			
 
				+            if re.search(
			
 
				+                    '非现场报名|(在线报名|邮件|邮寄|邮箱|线上报名|网络报名).{,10}现场报名|现场报名.{,10}(在线报名|邮件|邮寄|邮箱|线上报名|网络报名)', content) == None:
			
 
				+                return 1
			
 
				+            return 2
			
 
				+        elif re.search('(获取采购文件|文件的获取|文件获取|获取竞价文件|获取招标文件|文件的领取|文件领取|获取投标文件).{,200}'
			
 
				+                       '(平台|线上|客户端|邮寄|网上获取|网站|网址|http|www|邮箱|寄送|网络获取|采购网|网络领购|系统|邮件|在线报名|网络报名|非现场报名|线上报名)', content):
			
 
				+            return 1
			
 
				+        elif re.search('(获取采购文件|文件的获取|文件获取|获取竞价文件|获取招标文件|文件的领取|文件领取|获取投标文件).{,200}'
			
 
				+                       '([\d一二三四五六七八九十]号|接待室|开标室|现场领取|会议室|线下购买|现场获取|X[\d一二三四五六七八九十]|办公楼|现场报名)', content):
			
 
				+            return 2
			
 
				+        elif re.search('(报价地址|报价信息|报价请点击|报价方法|报价式|报价方式|报价提交|报价地点).{,30}'
			
 
				+                       '(平台|网站|http|www|官网|网址|网页|网上中介|邮件|邮箱|客户端|采购网|系统|在线报价|线上报价)', content):
			
 
				+            return 1
			
 
				+        elif re.search('线下报价', content) and re.search('不接受线下报价|线下报价无效', content) == None:
			
 
				+            return 2
			
 
				+        elif re.search('(文件提交|文件递交|递交方式|文件的提交|文件送达地点|递交响应文件|证明材料的递交)', content):
			
 
				+            b = re.search('(文件提交|文件递交|递交方式|文件的提交|文件送达地点|递交响应文件|证明材料的递交)', content).end()
			
 
				+            ser = re.search('联系方式|发布公告的媒介', content[b:b + 200])
			
 
				+            text = content[b:b + ser.start()] if ser else content[b:b + 200]
			
 
				+            if re.search('平台|线上|客户端|邮寄|网站|网址|http|www|邮箱|寄送|采购网|系统|邮件|网页', text):
			
 
				+                return 1
			
 
				+            elif re.search('[\d一二三四五六七八九十]号|接待室|现场递交|开标室|会议室|[\d一二三四五六七八九十]楼|线下递交|办公楼', text):
			
 
				+                return 2
			
 
				+        if re.search('(平台|线上|客户端|网站|网址|http|www|采购网|系统|网页).{,10}递交.{,10}文件', content):
			
 
				+            return 1
			
 
				+        if re.search('(开标地点|投标地点|开标时间和地点|开标时间及地点|开标方式)', content):
			
 
				+            b = re.search('(开标地点|投标地点|开标时间和地点|开标时间及地点|开标方式)', content).end()
			
 
				+            ser = re.search('联系方式|发布公告的媒介', content[b:b + 70])
			
 
				+            text = content[b:b + ser.start()] if ser else content[b:b + 70]
			
 
				+            if re.search(
			
 
				+                    '平台|线上|客户端|网站|网址|http|www|线上开标|采购网|非现场开标|不见面开标|远程异地开启|系统|线上观看开标|网上开标|在线直播的方式开标|远程开标|现场开启|电子卖场|电子开标|开标现场电话联系',
			
 
				+                    text):
			
 
				+                return 1
			
 
				+            elif re.search('[\d一二三四五六七八九十]号|线下开标|接待室|现场递交|开标室|现场开标|会议室|[\d一二三四五六七八九十]楼|办公楼|街道', text):
			
 
				+                return 2
			
 
				+        if re.search('(平台|线上|客户端|网站|网址|http|www|采购网|系统|网页).{,10}开标', content):
			
 
				+            return 1
			
 
				+        elif re.search('不见面开标|非现场开标|远程异地开启|线上观看开标|网上开标|在线直播的方式开标|远程开标|现场开启|非公开开启|电子开标|开标现场电话联系', content):
			
 
				+            return 1
			
 
				+        elif re.search('开启.{,20}地点', content):
			
 
				+            b = re.search('开启.{,20}地点', content).end()
			
 
				+            ser = re.search('联系方式|发布公告的媒介', content[b:b + 70])
			
 
				+            text = content[b:b + ser.start()] if ser else content[b:b + 70]
			
 
				+            if re.search(
			
 
				+                    '平台|线上|客户端|网站|网址|http|www|线上开标|采购网|非现场开标|不见面开标|远程异地开启|系统|线上观看开标|网上开标|在线直播的方式开标|远程开标|电子卖场|电子开标|开标现场电话联系',
			
 
				+                    text):
			
 
				+                return 1
			
 
				+            elif re.search('[\d一二三四五六七八九十]号|线下开标|接待室|现场递交|开标室|现场开标|会议室|[\d一二三四五六七八九十]楼|办公楼|街道', text):
			
 
				+                return 2
			
 
				+        return 0
			
 
				+
			
 
				+    def suitable_small():
			
 
				+        # 适合小微企业投标
			
 
				+        if re.search('属于专门面向中小企业|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
			
 
				+            return 1
			
 
				+        elif re.search('属于企业直采|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
			
 
				+            return 2
			
 
				+        elif re.search('有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
			
 
				+            return 3
			
 
				+        return 0
			
 
				+
			
 
				+    label_dic = {}
			
 
				+    is_direct_procurement = is_direct_procurement() # 是否直接采购
			
 
				+    is_target_small = is_target_small() # 是否面向中小企业
			
 
				+    mode_of_partipation = mode_of_partipation() # 参与方式
			
 
				+    need_ca = need_ca() # 是否需要CA
			
 
				+    need_performance = need_performance() # 有业绩要求
			
 
				+    need_qualification = need_qualification() # 资质要求
			
 
				+    registered_capital = registered_capital() # 注册资本
			
 
				+    registered_years = registered_years() # 注册年限
			
 
				+    suitable_small = suitable_small() # 适合小微企业
			
 
				+
			
 
				+    label_dic['is_direct_procurement'] = is_direct_procurement
			
 
				+    label_dic['is_target_small'] = is_target_small
			
 
				+    label_dic['mode_of_partipation'] = mode_of_partipation
			
 
				+    label_dic['need_ca'] = need_ca
			
 
				+    label_dic['need_performance'] = need_performance
			
 
				+    label_dic['need_qualification'] = need_qualification
			
 
				+    label_dic['registered_capital'] = registered_capital
			
 
				+    label_dic['registered_years'] = registered_years
			
 
				+    label_dic['suitable_small'] = suitable_small
			
 
				+
			
 
				+    label_dic = {k: v for k, v in label_dic.items() if v!=0}
			
 
				+
			
 
				+    return label_dic
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
			
 
				+    #     html = f.read()
			
 
				+    # rs = get_all_label('', html)
			
 
				+    # print('rs: ', rs)
			
 
				+
			
 
				+    import pandas as pd
			
 
				+    from bs4 import BeautifulSoup
			
 
				+    import json
			
 
				+
			
 
				+    df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')[:]
			
 
				+    print(df.columns, len(df))
			
 
				+    df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
			
 
				+    print(len(df))
			
 
				+    def get_text(html):
			
 
				+        soup = BeautifulSoup(html, 'lxml')
			
 
				+        text = soup.get_text()
			
 
				+        return text
			
 
				+    df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x))
			
 
				+    df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1)
			
 
				+    df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
			
 
				+    df = df[['docid', 'docchannel', 'web_source_name', 'exist_table', '标签']]
			
 
				+    df.to_excel('E:/公告标签提取结果.xlsx', index=False)
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/BiddingKG/dl/interface/htmlparser.py
+++ b/BiddingKG/dl/interface/htmlparser.py
@@ -205,9 +205,6 @@ class ParseDocument():
 
				         if _html is None:
			
 
				             _html = ""
			
 
				         self.html = _html
			
 
				-
			
 
				-        # self.soup = BeautifulSoup(self.html,"lxml")
			
 
				-        # self.soup = BeautifulSoup(self.html,"html.parser")
			
 
				         self.auto_merge_table = auto_merge_table
			
 
				 
			
 
				         if list_obj:
			
@@ -288,6 +285,9 @@ class ParseDocument():
 
				         _se = re.search(_pattern,_text)
			
 
				         groups = []
			
 
				         if _se is not None:
			
 
				+            e = _se.end()
			
 
				+            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减)[:：]?\d', _se.group(0)) or (re.search('\d[.:：]?$', _se.group(0)) and re.search('^[\d年月日万元天]', _text[e:])):
			
 
				+                return None
			
 
				             _gd = _se.groupdict()
			
 
				             for k,v in _gd.items():
			
 
				                 if v is not None:
			
@@ -827,15 +827,18 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
 
				     return list_result
			
 
				 
			
 
				 
			
 
				-def get_childs(childs):
			
 
				+def get_childs(childs, max_depth=None):
			
 
				     list_data = []
			
 
				     for _child in childs:
			
 
				         list_data.append(_child)
			
 
				         childs2 = _child.get("child_title",[])
			
 
				 
			
 
				-        if len(childs2)>0:
			
 
				+        if len(childs2)>0 and (max_depth==None or max_depth>0):
			
 
				             for _child2 in childs2:
			
 
				-                list_data.extend(get_childs([_child2]))
			
 
				+                if max_depth != None:
			
 
				+                    list_data.extend(get_childs([_child2], max_depth-1))
			
 
				+                else:
			
 
				+                    list_data.extend(get_childs([_child2], None))
			
 
				     return list_data
			
 
				 
			
 
				 def get_range_data_by_childs(list_data,childs):
			
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -27,14 +27,14 @@ def extract_sentence_list(sentence_list):
 
				         sentence_text = sentence.sentence_text
			
 
				         begin_index = 0
			
 
				         end_index = 0
			
 
				-        for it in re.finditer('([\w：][一二三四五六七八九十]{1,3}|[^\d，。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例：289699210 1、招标内容：滑触线及配件2、招标品牌：3、参标供应商经营形式要求：厂家4、参标供应商资质要求：5、
			
 
				+        for it in re.finditer('([^一二三四五六七八九十，。][一二三四五六七八九十]{1,3}|[^\d，。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例：289699210 1、招标内容：滑触线及配件2、招标品牌：3、参标供应商经营形式要求：厂家4、参标供应商资质要求：5、
			
 
				             temp = it.group(0)
			
 
				             sentence_text = sentence_text.replace(temp, temp[0] + '，' + temp[1:])
			
 
				-        for item in re.finditer('[，。；;!！？?]+', sentence_text):
			
 
				+        for item in re.finditer('[，。；;!！？]+', sentence_text): # 20240725去掉英文问号，避免网址被分隔
			
 
				             end_index = item.end()
			
 
				-            if end_index!=len(sentence_text):
			
 
				-                if end_index-begin_index<6 and item.group()[-1] in ['，', ';', '；'] and re.match('[一二三四五六七八九十\d.]+、', item.group())==None:
			
 
				-                    continue
			
 
				+            # if end_index!=len(sentence_text):
			
 
				+            #     # if end_index-begin_index<6 and item.group(0) in ['，', ';', '；'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销，避免标题提取错误
			
 
				+            #     #     continue
			
 
				             new_sentence_text = sentence_text[begin_index:end_index]
			
 
				             sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
			
 
				             if sentence.in_attachment:
			
@@ -53,16 +53,23 @@ def extract_sentence_list(sentence_list):
 
				 
			
 
				     return new_sentence2_list, new_sentence2_list_attach
			
 
				 
			
 
				-requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围)([及与和](其它|\w{,2})要求)?" \
			
 
				-                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([:：]|$)"
			
 
				+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围|信息)([及与和](其它|\w{,2})要求)?" \
			
 
				+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([:：，]|$)"
			
 
				 aptitude_pattern = "(资格要求|资质要求)([:：，]|$)"
			
 
				+addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[)）]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([:：，]|$)|开启([:：，]|$)"
			
 
				+out_lines = []
			
 
				 
			
 
				-# out_lines = []
			
 
				-
			
 
				-def extract_parameters(parse_document):
			
 
				+def extract_parameters(parse_document, content):
			
 
				+    '''
			
 
				+    通过大纲、预处理后文本正则获取需要字段
			
 
				+    :param parse_document: ParseDocument() 方法返回结果
			
 
				+    :param content: 公告预处理后文本
			
 
				+    :return:
			
 
				+    '''
			
 
				     list_data = parse_document.tree
			
 
				     requirement_text = ''
			
 
				     aptitude_text = ''
			
 
				+    addr_bidopen_text = ''
			
 
				 
			
 
				     _find_count = 0
			
 
				     _data_i = -1
			
@@ -74,10 +81,16 @@ def extract_parameters(parse_document):
 
				         # print(_data.keys())
			
 
				         if _type=="sentence":
			
 
				             if _data["sentence_title"] is not None:
			
 
				-                if re.search(requirement_pattern,_text) is not None:
			
 
				+
			
 
				+                outline = re.sub('（?[一二三四五六七八九十\d.]+）?\s*、?', '',
			
 
				+                                 re.split('[：:，]', _text)[0].replace('(', '（').replace(')', '）'))
			
 
				+
			
 
				+                if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求，', _text[:30])==None:
			
 
				+                    out_lines.append(outline)
			
 
				                     childs = get_childs([_data])
			
 
				                     for c in childs:
			
 
				-                        requirement_text += c["text"]+"\n"
			
 
				+                        # requirement_text += c["text"]+"\n"
			
 
				+                        requirement_text += c["text"]
			
 
				                     _data_i += len(childs)
			
 
				                     _data_i -= 1
			
 
				     _data_i = -1
			
@@ -108,7 +121,6 @@ def extract_parameters(parse_document):
 
				 
			
 
				                 # elif re.match('[（(\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', _text) and len(_text)<30 and re.search('资质|资格', _text):
			
 
				                 #     out_lines.append(outline)
			
 
				-
			
 
				         if _type=="table":
			
 
				             list_table = _data["list_table"]
			
 
				             parent_title = _data["parent_title"]
			
@@ -119,18 +131,67 @@ def extract_parameters(parse_document):
 
				                         cell_text = cell[0]
			
 
				                         if len(cell_text)>120 and re.search(aptitude_pattern,cell_text) is not None:
			
 
				                             aptitude_text += cell_text+"\n"
			
 
				-
			
 
				-    return requirement_text,aptitude_text
			
 
				+    _data_i = -1
			
 
				+    while _data_i < len(list_data) - 1:
			
 
				+        _data_i += 1
			
 
				+        _data = list_data[_data_i]
			
 
				+        _type = _data["type"]
			
 
				+        _text = _data["text"].strip()
			
 
				+        # print(_data.keys())
			
 
				+        if _type == "sentence":
			
 
				+            if _data["sentence_title"] is not None:
			
 
				+                if re.search(addr_bidopen_pattern, _text[:20]) is not None:
			
 
				+                    childs = get_childs([_data], max_depth=1)
			
 
				+                    for c in childs:
			
 
				+                        addr_bidopen_text += c["text"]
			
 
				+                    _data_i += len(childs)
			
 
				+                    _data_i -= 1
			
 
				+    if re.search('时间：', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidopen_text):
			
 
				+        for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidopen_text):
			
 
				+            b, e = ser.span()
			
 
				+        addr_bidopen_text = addr_bidopen_text[b:e]
			
 
				+    elif re.search('开启', addr_bidopen_text) and re.search('时间：\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
			
 
				+        addr_bidopen_text = ""
			
 
				+    if addr_bidopen_text == "":
			
 
				+        ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)）?(会议)?地[点址]([(（]网址[)）])?[：为][^，；。]{2,100}[，；。]', content)
			
 
				+        if ser:
			
 
				+            addr_bidopen_text = ser.group(0)
			
 
				+    return requirement_text, aptitude_text, addr_bidopen_text
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
			
 
				     #     html = f.read()
			
 
				     #
			
 
				-    # l = []
			
 
				+    l = []
			
 
				     import pandas as pd
			
 
				-    # from collections import Counter
			
 
				-    # from BiddingKG.dl.interface import Preprocessing
			
 
				+    from collections import Counter
			
 
				+    from BiddingKG.dl.interface import Preprocessing
			
 
				+    from BiddingKG.dl.interface.get_label_dic import get_all_label
			
 
				+    from bs4 import BeautifulSoup
			
 
				+    import json
			
 
				+
			
 
				+    df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
			
 
				+    df['len']= df['招标内容'].apply(lambda x: len(x))
			
 
				+    print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
			
 
				+    print(len([it for it in df['len'] if it>1500]))
			
 
				+
			
 
				     # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
			
 
				+    # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx')
			
 
				+    # df = df[df['docid'].isin(df1['docid'])]
			
 
				+    #
			
 
				+    # df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
			
 
				+    # print(df.columns, len(df))
			
 
				+    #
			
 
				+    #
			
 
				+    # # def get_text(html):
			
 
				+    # #     soup = BeautifulSoup(html, 'lxml')
			
 
				+    # #     text = soup.get_text()
			
 
				+    # #     return text
			
 
				+    # # df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x))
			
 
				+    # # df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1)
			
 
				+    # # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
			
 
				+    # # df1 = df[['docid', '标签']]
			
 
				+    #
			
 
				     # n = 0
			
 
				     # datas = []
			
 
				     # for id,title, html in zip(df['docid'],df['doctitle'], df['dochtmlcon']):
			
@@ -139,8 +200,8 @@ if __name__ == "__main__":
 
				     #     # print(id, type(id))
			
 
				     #     # parse_document = ParseDocument(html, True)
			
 
				     #     # requirement_text, aptitude_text = extract_parameters(parse_document)
			
 
				-    #     if re.search('资\s*[格质]', html)==None:
			
 
				-    #         continue
			
 
				+    #     # if re.search('资\s*[格质]', html)==None:
			
 
				+    #     #     continue
			
 
				     #
			
 
				     #     list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[id,html,"","",title,'', '']],useselffool=True)
			
 
				     #     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
			
@@ -149,12 +210,12 @@ if __name__ == "__main__":
 
				     #
			
 
				     #     parse_document = ParseDocument(html, True, list_obj=sentence2_list)
			
 
				     #     requirement_text, aptitude_text = extract_parameters(parse_document)
			
 
				-    #     if len(aptitude_text)>0:
			
 
				-    #         datas.append((id, aptitude_text[:1500]))
			
 
				-    #         print(id, aptitude_text[:10], aptitude_text[-20:])
			
 
				-    #     else:
			
 
				-    #         parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
			
 
				-    #         requirement_text, aptitude_text = extract_parameters(parse_document)
			
 
				+    #     # if len(aptitude_text)>0:
			
 
				+    #     #     datas.append((id, aptitude_text[:1500]))
			
 
				+    #     #     print(id, aptitude_text[:10], aptitude_text[-20:])
			
 
				+    #     # else:
			
 
				+    #     #     parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
			
 
				+    #     #     requirement_text, aptitude_text = extract_parameters(parse_document)
			
 
				     #
			
 
				     #     # if 0<len(aptitude_text)<20:
			
 
				     #     #     l.append(len(aptitude_text))
			
@@ -163,11 +224,20 @@ if __name__ == "__main__":
 
				     #     #     if n > 5:
			
 
				     #     #         break
			
 
				     #
			
 
				+    #     if len(requirement_text)>0:
			
 
				+    #         label_dic = get_all_label(title, list_articles[0].content)
			
 
				+    #         # datas.append((id, requirement_text))
			
 
				+    #         datas.append((id, requirement_text, label_dic))
			
 
				+    #
			
 
				     # c = Counter(out_lines)
			
 
				     # print(c.most_common(1000))
			
 
				+    # #
			
 
				+    # # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
			
 
				+    # # df.to_excel('E:/公告资质要求提取结果.xlsx')
			
 
				     #
			
 
				-    # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
			
 
				-    # df.to_excel('E:/公告资质要求提取结果.xlsx')
			
 
				+    # df = pd.DataFrame(datas, columns=['docid', '招标内容', '标签'])
			
 
				+    # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
			
 
				+    # df.to_excel('E:/公告招标内容提取结果2.xlsx')
			
 
				 
			
 
				     #     if len(aptitude_text)> 1000:
			
 
				     #         print(id, aptitude_text[:10], aptitude_text[-20:])
			
@@ -193,14 +263,17 @@ if __name__ == "__main__":
 
				     #     new_sentence_text = sentence_text[begin_index:end_index]
			
 
				     #     print(new_sentence_text)
			
 
				 
			
 
				-    df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
			
 
				-    pos = neg = 0
			
 
				-    for docid, text in zip(df['docid'], df['资质要求']):
			
 
				-        if re.match('[（(\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
			
 
				-            pos += 1
			
 
				-            pass
			
 
				-        else:
			
 
				-            neg += 1
			
 
				-            print(docid, text[:50])
			
 
				-    print('异常：%d, 正常：%d'%(neg, pos))
			
 
				+    # df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
			
 
				+    # docids = []
			
 
				+    # pos = neg = 0
			
 
				+    # for docid, text in zip(df['docid'], df['资质要求']):
			
 
				+    #     if re.match('[（(\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
			
 
				+    #         pos += 1
			
 
				+    #         pass
			
 
				+    #     else:
			
 
				+    #         neg += 1
			
 
				+    #         print(docid, text[:50])
			
 
				+    #         docids.append(docid)
			
 
				+    # print('异常：%d, 正常：%d'%(neg, pos))
			
 
				+    # print(docids)