Prechádzať zdrojové kódy

新增采购内容、开标地点、需求标签

lsm 10 mesiacov pred
rodič
commit
665768140e

+ 10 - 4
BiddingKG/dl/interface/Preprocessing.py

@@ -2069,7 +2069,7 @@ def segment(soup,final=True):
             # text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
-    commaList = ["div","br","td","p","li"]
+    commaList = ["div","br","td","p","li","h1","h2","h3","h4","h5","h6"]
     #commaList = []
     spaceList = ["span"]
     tbodies = soup.find_all('tbody')
@@ -2117,11 +2117,13 @@ def segment(soup,final=True):
         for _sent in re.split("。+",text):
             for _sent2 in re.split(',+',_sent):
                 for _sent3 in re.split(":+",_sent2):
+                    pre_t = ''
                     for _t in re.split("\s{4,}",_sent3):
-                        if len(_t)<3:
+                        if len(_t)<3 or len(pre_t)<3 or re.search('[^\w\s]$', pre_t):  # 20240726 前文小于3字或以符合结尾的不加 避免乱加逗号 例:2)    申请人的资格要求
                             _text += _t
                         else:
                             _text += ","+_t
+                        pre_t = _t
                     _text += ":"
                 _text = _text[:-1]
                 _text += ","
@@ -3124,7 +3126,8 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
             sentences_set = set()
             for _iter in re.finditer(split_patten,article_processed):
                 _sen = article_processed[_begin:_iter.span()[1]]
-                if len(_sen)>0 and _sen not in sentences_set:
+                # if len(_sen)>0 and _sen not in sentences_set: # 去重导致内容丢失
+                if len(_sen)>0 and (len(sentences)>0 and _sen != sentences[-1] or len(sentences)==0): # 2024/07/25 改为顺序去重
                     # 标识在附件里的句子
                     if re.search("##attachment##",_sen):
                         attachment_begin_index = len(sentences)
@@ -3136,7 +3139,8 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
             if re.search("##attachment##", _sen):
                 # _sen = re.sub("##attachment##", "", _sen)
                 attachment_begin_index = len(sentences)
-            if len(_sen)>0 and _sen not in sentences_set:
+            # if len(_sen)>0 and _sen not in sentences_set:
+            if len(_sen)>0 and (len(sentences)>0 and _sen != sentences[-1] or len(sentences)==0):  # 2024/07/25 改为顺序去重
                 sentences.append(_sen)
                 sentences_set.add(_sen)
             # 解析outline大纲分段
@@ -3262,6 +3266,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     # print('all_match:', all_match)
     for _match in all_match:
         # print('_match: ', _match.group())
+        if re.search('^元/1\d{10},$', _match.group(0)): # 修复 495042766 现场负责人 姚元 / 13488160460 预测为金额
+            continue
         if len(_match.group()) > 0:
             # print("===",_match.group())
             # # print(_match.groupdict())

+ 23 - 15
BiddingKG/dl/interface/extract.py

@@ -28,6 +28,7 @@ import json
 from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
 from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list
+from BiddingKG.dl.interface.get_label_dic import get_all_label
 
 
 # 自定义jsonEncoder
@@ -251,13 +252,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
-    tt = time.time()
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text = extract_parameters(parse_document)
-    if aptitude_text == '':
+    requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
+    if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text = extract_parameters(parse_document)
-    parse_document.print_tree(parse_document.tree)
+        requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -428,9 +427,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("pb_extract done of doc_id%s"%(doc_id))
     cost_time["pb_extract"] = round(time.time() - start_time, 2)
 
+    '''打标签'''
+    label_dic = get_all_label(title, list_articles[0].content)
+
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-07-22'}
+    version_date = {'version_date': '2024-07-26'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -474,15 +476,21 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # 资质要求
     data_res['aptitude'] = aptitude_text[:1500]
-
-    for _article in list_articles:
-            log(_article.content)
-
-    for list_entity in list_entitys:
-        for _entity in list_entity:
-            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
-                  (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
-                   str(_entity.begin_index),str(_entity.end_index)))
+    # 采购内容
+    data_res['requirement'] = requirement_text[:1500]
+    # 打标签
+    data_res['label_dic'] = label_dic
+    # 开标地点
+    data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
+
+    # for _article in list_articles:
+    #         log(_article.content)
+    #
+    # for list_entity in list_entitys:
+    #     for _entity in list_entity:
+    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
+    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
+    #                str(_entity.begin_index),str(_entity.end_index)))
     _extract_json = json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
     _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "")
     return _extract_json#, list_articles[0].content, get_ent_context(list_sentences, list_entitys)

+ 1 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -4088,7 +4088,7 @@ def  confirm_prem(prem, channel_dic):
             if prem[k]['roleList'] == []:
                 empty_roleList.append(k)
             for d in prem[k]['roleList']:
-                if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
+                if d['role_name'] in ['win_tenderer', 'pre_win_tenderer', 'second_tenderer','third_tenderer']:
                     if k == 'Project':
                         pro_winner.add(d['role_text'])
                     else:

+ 272 - 0
BiddingKG/dl/interface/get_label_dic.py

@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2024/7/23 14:45
+"""
+
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2024/7/11 17:56
+"""
+from BiddingKG.dl.common.Utils import getUnifyMoney
+import re
+
def chinese_to_arabic(s):
    """Convert a short Chinese numeral string (tens/hundreds) to an int.

    Digits are accumulated positionally until a unit character ('十' or
    '百') is met; the unit then multiplies the pending digits (or 1 when no
    digit precedes it, as in '十二' or '百') and the product is added to the
    total. Characters that are neither digits nor units are ignored.

    :param s: Chinese numeral text such as '十二', '二十三' or '一百零五'.
    :return: the integer value; 0 for an empty string.
    """
    # Chinese digit -> value
    digit_map = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
                 '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
    # Unit -> multiplier
    unit_map = {'十': 10, '百': 100}

    total = 0
    pending = 0  # digits read since the last unit character
    for char in s:
        if char in digit_map:
            # Consecutive digits are read positionally ('一二' -> 12).
            pending = pending * 10 + digit_map[char]
        elif char in unit_map:
            # A unit without a preceding digit counts once ('十' -> 10,
            # '一百十二' -> 100 + 10 + 2), so no unit is ever dropped.
            total += (pending or 1) * unit_map[char]
            pending = 0
    # Trailing digits that were not followed by a unit are plain ones.
    return total + pending
+
def get_all_label(title, content):
    """Derive rule-based requirement labels from an announcement.

    Every label is computed by regular-expression heuristics over ``title``
    and ``content``. Labels whose value is 0 are omitted from the result.

    Possible keys of the returned dict:
      is_direct_procurement  1 when the text looks like enterprise direct procurement
      is_target_small        1 when the project is reserved for small/medium enterprises
      mode_of_partipation    1 = online, 2 = offline
      need_ca                1 when a digital certificate (CA) is required
      need_performance       1 when past-performance evidence is required
      need_qualification     1 when a qualification certificate is required
      registered_capital     minimum registered capital (float, via getUnifyMoney)
      registered_years       minimum registration age in months
      suitable_small         1/2/3 depending on which summary phrase is present

    :param title: announcement title.
    :param content: announcement body as plain text.
    :return: dict containing only the non-zero labels.
    """
    # Phrases stating the project is NOT reserved for SMEs. The closing
    # parenthesis after '是/否' is a literal (ASCII or full-width) and is
    # therefore matched with a character class instead of a bare ')'.
    not_small_only = r'(非|不属于|不|是/否)[)\uff09]?专门面向中小微?企业|部分面向中小微?企业'

    def first_match(patterns):
        # Return the match object of the first pattern found in content.
        for pattern in patterns:
            found = re.search(pattern, content)
            if found:
                return found
        return None

    def text_after(found, width):
        # Up to `width` characters following `found`, cut at the next
        # "contact information" / "publication media" heading if present.
        begin = found.end()
        window = content[begin:begin + width]
        stop = re.search('联系方式|发布公告的媒介', window)
        return window[:stop.start()] if stop else window

    def is_direct_procurement():
        # Enterprise direct procurement.
        if re.search('询比价|询比|竞价|竞价|议价|报价', title):
            return 1
        if re.search('我要报价|竞价起止时间|报价起止时间', content):
            return 1
        # A company is mentioned but no tendering vocabulary appears.
        if re.search('公司|集团|企业', content) and re.search('招标|中标|投标', content) is None:
            return 1
        return 0

    def is_target_small():
        # Reserved specifically for small and medium enterprises.
        if re.search('专门面向中小微?企业', content) and re.search(not_small_only, content) is None:
            return 1
        # NOTE(review): the last alternative also matches negated phrases such
        # as "非专门面向中小企业" — kept as in the original, confirm intent.
        if re.search('仅面向小微企业|专门面向.{,30}中小企业采购|是否专门面向中小微?企业(采购)?:是|本项目为中小型企业预留项目|专门面向中小微?企业', content):
            return 1
        if re.search('落实政府采购政策需满足的资格要求.{,30}供应商为中小企业', content) and re.search(not_small_only, content) is None:
            return 1
        return 0

    def registered_years():
        # Minimum registration age, returned in months.
        ser = first_match([
            r'禁止\w{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))',
            r'(成立|注册)时间:?\w{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))',
            r'(成立|注册)时间:?\w{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)',
            r'(成立|注册)时间:?\w{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)\w{,5}请勿报价)',
        ])
        if ser is None:
            return 0
        num = ser.group('num')
        unit = ser.group('unit')
        num = int(num) if num.isdigit() else chinese_to_arabic(num)
        # The last two patterns capture a suffix inside `unit` ('年以上',
        # '年...请勿报价'), so test the prefix rather than equality; otherwise
        # years would be reported as if they were months.
        if unit.startswith('年'):
            num *= 12
        return num

    def registered_capital():
        # Minimum registered capital.
        ser = first_match([
            r'注册(资本|资金):?\w{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))',
            r'注册(资本|资金):?\w{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)',
        ])
        if ser is None:
            return 0
        return float(getUnifyMoney(ser.group('num') + ser.group('unit')))

    def need_qualification():
        # A qualification certificate is required.
        if re.search('资质要求.{,150}(行业资质|证书|许可证|认证|经营范围|一级|二级|三级|甲级|乙级|丙级|特级|壹级|贰级|叁级)', content):
            return 1
        if re.search(r'(提供|有|具备)\w{,50}(资质|认证|证书|许可证)', content):
            return 1
        # Optional literal closing parenthesis after '资格' (ASCII or full-width).
        if re.search(r'资格[)\uff09]?要求:?\w{,30}(甲级|丙级|乙级|一级|二级|三级|特级|壹级|贰级|叁级)', content):
            return 1
        if re.search('认证体系要求', content):
            return 1
        return 0

    def need_ca():
        # Whether a digital certificate (CA) has to be obtained.
        if re.search(r'需要\w{,20}数字证书|使用\w{,20}签章', content):
            return 1
        if re.search(r'办理\w{,20}(数字证书|CA|ca)', content) and re.search(r'无需\w{,15}办理', content) is None:
            return 1
        if re.search(r'(数字证书|CA|ca)\w{,5}办理|是否要求供应商使用(CA|ca)数字证书参与:是', content):
            return 1
        # Explicit "not required" wording and the default both yield 0.
        return 0

    def need_performance():
        # Past-performance evidence is required.
        if re.search('业绩证明|业绩要求|行业业绩|相关业绩', content):
            return 1
        # "Similar performance" counts only when it is not marked as
        # optional ("业绩(如有)"). The original used `or`, which made this
        # branch fire for every text lacking the optional wording.
        if re.search(r'类似\w{,10}业绩', content) and re.search('业绩.{,5}如有', content) is None:
            return 1
        if re.search('完成[^,。]{,100}项目', content):
            return 1
        if re.search('(提供|有|完成).{,100}业绩', content):
            return 1
        return 0

    def mode_of_partipation():
        # Participation mode: 1 online, 2 offline, 0 other/unknown.
        if re.search('(平台|网站|http|www|官网|网址|网页|网上中介|邮件|邮箱|客户端|采购网|系统|邮寄).{,20}(注册|报名)', content):
            return 1
        if re.search('现场报名', content):
            # On-site registration is offline unless the text negates it or
            # offers an online channel next to it.
            if re.search(
                    '非现场报名|(在线报名|邮件|邮寄|邮箱|线上报名|网络报名).{,10}现场报名|现场报名.{,10}(在线报名|邮件|邮寄|邮箱|线上报名|网络报名)', content) is None:
                return 2
            return 1
        if re.search('(获取采购文件|文件的获取|文件获取|获取竞价文件|获取招标文件|文件的领取|文件领取|获取投标文件).{,200}'
                     '(平台|线上|客户端|邮寄|网上获取|网站|网址|http|www|邮箱|寄送|网络获取|采购网|网络领购|系统|邮件|在线报名|网络报名|非现场报名|线上报名)', content):
            return 1
        if re.search('(获取采购文件|文件的获取|文件获取|获取竞价文件|获取招标文件|文件的领取|文件领取|获取投标文件).{,200}'
                     r'([\d一二三四五六七八九十]号|接待室|开标室|现场领取|会议室|线下购买|现场获取|X[\d一二三四五六七八九十]|办公楼|现场报名)', content):
            return 2
        if re.search('(报价地址|报价信息|报价请点击|报价方法|报价式|报价方式|报价提交|报价地点).{,30}'
                     '(平台|网站|http|www|官网|网址|网页|网上中介|邮件|邮箱|客户端|采购网|系统|在线报价|线上报价)', content):
            return 1
        if re.search('线下报价', content) and re.search('不接受线下报价|线下报价无效', content) is None:
            return 2

        # Document submission section: inspect the text right after the heading.
        submit = re.search('(文件提交|文件递交|递交方式|文件的提交|文件送达地点|递交响应文件|证明材料的递交)', content)
        if submit:
            text = text_after(submit, 200)
            if re.search('平台|线上|客户端|邮寄|网站|网址|http|www|邮箱|寄送|采购网|系统|邮件|网页', text):
                return 1
            if re.search(r'[\d一二三四五六七八九十]号|接待室|现场递交|开标室|会议室|[\d一二三四五六七八九十]楼|线下递交|办公楼', text):
                return 2
        if re.search('(平台|线上|客户端|网站|网址|http|www|采购网|系统|网页).{,10}递交.{,10}文件', content):
            return 1

        # Bid-opening place section.
        opening = re.search('(开标地点|投标地点|开标时间和地点|开标时间及地点|开标方式)', content)
        if opening:
            text = text_after(opening, 70)
            if re.search(
                    '平台|线上|客户端|网站|网址|http|www|线上开标|采购网|非现场开标|不见面开标|远程异地开启|系统|线上观看开标|网上开标|在线直播的方式开标|远程开标|现场开启|电子卖场|电子开标|开标现场电话联系',
                    text):
                return 1
            if re.search(r'[\d一二三四五六七八九十]号|线下开标|接待室|现场递交|开标室|现场开标|会议室|[\d一二三四五六七八九十]楼|办公楼|街道', text):
                return 2
        if re.search('(平台|线上|客户端|网站|网址|http|www|采购网|系统|网页).{,10}开标', content):
            return 1
        if re.search('不见面开标|非现场开标|远程异地开启|线上观看开标|网上开标|在线直播的方式开标|远程开标|现场开启|非公开开启|电子开标|开标现场电话联系', content):
            return 1

        # "Opening ... place" wording.
        unsealing = re.search('开启.{,20}地点', content)
        if unsealing:
            text = text_after(unsealing, 70)
            if re.search(
                    '平台|线上|客户端|网站|网址|http|www|线上开标|采购网|非现场开标|不见面开标|远程异地开启|系统|线上观看开标|网上开标|在线直播的方式开标|远程开标|电子卖场|电子开标|开标现场电话联系',
                    text):
                return 1
            if re.search(r'[\d一二三四五六七八九十]号|线下开标|接待室|现场递交|开标室|现场开标|会议室|[\d一二三四五六七八九十]楼|办公楼|街道', text):
                return 2
        return 0

    def suitable_small():
        # Suitable for small/micro enterprises to bid.
        # NOTE(review): the first pattern contains every alternative of the
        # third, so value 3 looks unreachable — kept as in the original.
        if re.search('属于专门面向中小企业|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
            return 1
        if re.search('属于企业直采|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
            return 2
        if re.search('有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
            return 3
        return 0

    # Key order matches the original insertion order.
    label_dic = {
        'is_direct_procurement': is_direct_procurement(),
        'is_target_small': is_target_small(),
        'mode_of_partipation': mode_of_partipation(),
        'need_ca': need_ca(),
        'need_performance': need_performance(),
        'need_qualification': need_qualification(),
        'registered_capital': registered_capital(),
        'registered_years': registered_years(),
        'suitable_small': suitable_small(),
    }

    # Only labels that actually fired are reported.
    return {k: v for k, v in label_dic.items() if v != 0}
+
if __name__ == "__main__":
    # Ad-hoc evaluation run: label a CSV export of announcements and write
    # the labels next to the document metadata into an Excel sheet.
    import json

    import pandas as pd
    from bs4 import BeautifulSoup

    def get_text(html):
        # Plain text of an HTML document, parsed with lxml.
        return BeautifulSoup(html, 'lxml').get_text()

    def label_row(row):
        # Labels for one announcement row (title + extracted text).
        return get_all_label(row['doctitle'], row['content'])

    def to_json(labels):
        # Pretty-printed JSON that keeps non-ASCII characters readable.
        return json.dumps(labels, ensure_ascii=False, indent=2)

    key_columns = ['docchannel', 'web_source_name', 'exist_table']

    df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')[:]
    print(df.columns, len(df))
    # Keep one sample per (channel, source, table-flag) combination.
    df.drop_duplicates(subset=key_columns, inplace=True)
    print(len(df))

    df['content'] = df['dochtmlcon'].apply(get_text)
    df['标签'] = df.apply(label_row, axis=1)
    df['标签'] = df['标签'].apply(to_json)

    df = df[['docid'] + key_columns + ['标签']]
    df.to_excel('E:/公告标签提取结果.xlsx', index=False)
+
+
+
+
+
+
+
+

+ 9 - 6
BiddingKG/dl/interface/htmlparser.py

@@ -205,9 +205,6 @@ class ParseDocument():
         if _html is None:
             _html = ""
         self.html = _html
-
-        # self.soup = BeautifulSoup(self.html,"lxml")
-        # self.soup = BeautifulSoup(self.html,"html.parser")
         self.auto_merge_table = auto_merge_table
 
         if list_obj:
@@ -288,6 +285,9 @@ class ParseDocument():
         _se = re.search(_pattern,_text)
         groups = []
         if _se is not None:
+            e = _se.end()
+            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天]', _text[e:])):
+                return None
             _gd = _se.groupdict()
             for k,v in _gd.items():
                 if v is not None:
@@ -827,15 +827,18 @@ def extract_products(list_data,_product,_param_pattern = "产品名称|设备材
     return list_result
 
 
-def get_childs(childs):
+def get_childs(childs, max_depth=None):
     list_data = []
     for _child in childs:
         list_data.append(_child)
         childs2 = _child.get("child_title",[])
 
-        if len(childs2)>0:
+        if len(childs2)>0 and (max_depth==None or max_depth>0):
             for _child2 in childs2:
-                list_data.extend(get_childs([_child2]))
+                if max_depth != None:
+                    list_data.extend(get_childs([_child2], max_depth-1))
+                else:
+                    list_data.extend(get_childs([_child2], None))
     return list_data
 
 def get_range_data_by_childs(list_data,childs):

+ 111 - 38
BiddingKG/dl/interface/outline_extractor.py

@@ -27,14 +27,14 @@ def extract_sentence_list(sentence_list):
         sentence_text = sentence.sentence_text
         begin_index = 0
         end_index = 0
-        for it in re.finditer('([\w:][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
+        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
             temp = it.group(0)
             sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
-        for item in re.finditer('[,。;;!!??]+', sentence_text):
+        for item in re.finditer('[,。;;!!?]+', sentence_text): # 20240725去掉英文问号,避免网址被分隔
             end_index = item.end()
-            if end_index!=len(sentence_text):
-                if end_index-begin_index<6 and item.group()[-1] in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', item.group())==None:
-                    continue
+            # if end_index!=len(sentence_text):
+            #     # if end_index-begin_index<6 and item.group(0) in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销,避免标题提取错误
+            #     #     continue
             new_sentence_text = sentence_text[begin_index:end_index]
             sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
             if sentence.in_attachment:
@@ -53,16 +53,23 @@ def extract_sentence_list(sentence_list):
 
     return new_sentence2_list, new_sentence2_list_attach
 
-requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围)([及与和](其它|\w{,2})要求)?" \
-                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::]|$)"
+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围|信息)([及与和](其它|\w{,2})要求)?" \
+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::]|$)"
 aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
+addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
+out_lines = []
 
-# out_lines = []
-
-def extract_parameters(parse_document):
+def extract_parameters(parse_document, content):
+    '''
+    通过大纲、预处理后文本正则获取需要字段
+    :param parse_document: ParseDocument() 方法返回结果
+    :param content: 公告预处理后文本
+    :return:
+    '''
     list_data = parse_document.tree
     requirement_text = ''
     aptitude_text = ''
+    addr_bidopen_text = ''
 
     _find_count = 0
     _data_i = -1
@@ -74,10 +81,16 @@ def extract_parameters(parse_document):
         # print(_data.keys())
         if _type=="sentence":
             if _data["sentence_title"] is not None:
-                if re.search(requirement_pattern,_text) is not None:
+
+                outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
+                                 re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
+
+                if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
+                    out_lines.append(outline)
                     childs = get_childs([_data])
                     for c in childs:
-                        requirement_text += c["text"]+"\n"
+                        # requirement_text += c["text"]+"\n"
+                        requirement_text += c["text"]
                     _data_i += len(childs)
                     _data_i -= 1
     _data_i = -1
@@ -108,7 +121,6 @@ def extract_parameters(parse_document):
 
                 # elif re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', _text) and len(_text)<30 and re.search('资质|资格', _text):
                 #     out_lines.append(outline)
-
         if _type=="table":
             list_table = _data["list_table"]
             parent_title = _data["parent_title"]
@@ -119,18 +131,67 @@ def extract_parameters(parse_document):
                         cell_text = cell[0]
                         if len(cell_text)>120 and re.search(aptitude_pattern,cell_text) is not None:
                             aptitude_text += cell_text+"\n"
-
-    return requirement_text,aptitude_text
+    _data_i = -1
+    while _data_i < len(list_data) - 1:
+        _data_i += 1
+        _data = list_data[_data_i]
+        _type = _data["type"]
+        _text = _data["text"].strip()
+        # print(_data.keys())
+        if _type == "sentence":
+            if _data["sentence_title"] is not None:
+                if re.search(addr_bidopen_pattern, _text[:20]) is not None:
+                    childs = get_childs([_data], max_depth=1)
+                    for c in childs:
+                        addr_bidopen_text += c["text"]
+                    _data_i += len(childs)
+                    _data_i -= 1
+    if re.search('时间:', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
+        for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
+            b, e = ser.span()
+        addr_bidopen_text = addr_bidopen_text[b:e]
+    elif re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
+        addr_bidopen_text = ""
+    if addr_bidopen_text == "":
+        ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
+        if ser:
+            addr_bidopen_text = ser.group(0)
+    return requirement_text, aptitude_text, addr_bidopen_text
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
     #     html = f.read()
     #
-    # l = []
+    l = []
     import pandas as pd
-    # from collections import Counter
-    # from BiddingKG.dl.interface import Preprocessing
+    from collections import Counter
+    from BiddingKG.dl.interface import Preprocessing
+    from BiddingKG.dl.interface.get_label_dic import get_all_label
+    from bs4 import BeautifulSoup
+    import json
+
+    df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
+    df['len']= df['招标内容'].apply(lambda x: len(x))
+    print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
+    print(len([it for it in df['len'] if it>1500]))
+
     # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
+    # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx')
+    # df = df[df['docid'].isin(df1['docid'])]
+    #
+    # df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
+    # print(df.columns, len(df))
+    #
+    #
+    # # def get_text(html):
+    # #     soup = BeautifulSoup(html, 'lxml')
+    # #     text = soup.get_text()
+    # #     return text
+    # # df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x))
+    # # df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1)
+    # # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
+    # # df1 = df[['docid', '标签']]
+    #
     # n = 0
     # datas = []
     # for id,title, html in zip(df['docid'],df['doctitle'], df['dochtmlcon']):
@@ -139,8 +200,8 @@ if __name__ == "__main__":
     #     # print(id, type(id))
     #     # parse_document = ParseDocument(html, True)
     #     # requirement_text, aptitude_text = extract_parameters(parse_document)
-    #     if re.search('资\s*[格质]', html)==None:
-    #         continue
+    #     # if re.search('资\s*[格质]', html)==None:
+    #     #     continue
     #
     #     list_articles, list_sentences, list_entitys, list_outlines, _cost_time = Preprocessing.get_preprocessed([[id,html,"","",title,'', '']],useselffool=True)
     #     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
@@ -149,12 +210,12 @@ if __name__ == "__main__":
     #
     #     parse_document = ParseDocument(html, True, list_obj=sentence2_list)
     #     requirement_text, aptitude_text = extract_parameters(parse_document)
-    #     if len(aptitude_text)>0:
-    #         datas.append((id, aptitude_text[:1500]))
-    #         print(id, aptitude_text[:10], aptitude_text[-20:])
-    #     else:
-    #         parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
-    #         requirement_text, aptitude_text = extract_parameters(parse_document)
+    #     # if len(aptitude_text)>0:
+    #     #     datas.append((id, aptitude_text[:1500]))
+    #     #     print(id, aptitude_text[:10], aptitude_text[-20:])
+    #     # else:
+    #     #     parse_document = ParseDocument(html, True, list_obj=sentence2_list_attach)
+    #     #     requirement_text, aptitude_text = extract_parameters(parse_document)
     #
     #     # if 0<len(aptitude_text)<20:
     #     #     l.append(len(aptitude_text))
@@ -163,11 +224,20 @@ if __name__ == "__main__":
     #     #     if n > 5:
     #     #         break
     #
+    #     if len(requirement_text)>0:
+    #         label_dic = get_all_label(title, list_articles[0].content)
+    #         # datas.append((id, requirement_text))
+    #         datas.append((id, requirement_text, label_dic))
+    #
     # c = Counter(out_lines)
     # print(c.most_common(1000))
+    # #
+    # # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
+    # # df.to_excel('E:/公告资质要求提取结果.xlsx')
     #
-    # df = pd.DataFrame(datas, columns=['docid', '资质要求'])
-    # df.to_excel('E:/公告资质要求提取结果.xlsx')
+    # df = pd.DataFrame(datas, columns=['docid', '招标内容', '标签'])
+    # df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
+    # df.to_excel('E:/公告招标内容提取结果2.xlsx')
 
     #     if len(aptitude_text)> 1000:
     #         print(id, aptitude_text[:10], aptitude_text[-20:])
@@ -193,14 +263,17 @@ if __name__ == "__main__":
     #     new_sentence_text = sentence_text[begin_index:end_index]
     #     print(new_sentence_text)
 
-    df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
-    pos = neg = 0
-    for docid, text in zip(df['docid'], df['资质要求']):
-        if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
-            pos += 1
-            pass
-        else:
-            neg += 1
-            print(docid, text[:50])
-    print('异常:%d, 正常:%d'%(neg, pos))
+    # df = pd.read_excel('E:/公告资质要求提取结果.xlsx')
+    # docids = []
+    # pos = neg = 0
+    # for docid, text in zip(df['docid'], df['资质要求']):
+    #     if re.match('[((\s★▲\*]?[一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+', text) and re.search(aptitude_pattern, text[:15]):
+    #         pos += 1
+    #         pass
+    #     else:
+    #         neg += 1
+    #         print(docid, text[:50])
+    #         docids.append(docid)
+    # print('异常:%d, 正常:%d'%(neg, pos))
+    # print(docids)