Browse Source

Merge remote-tracking branch 'origin/master'

fangjiasheng 4 months ago
parent
commit
ab3d4cdcd0

+ 2 - 1
.gitignore

@@ -20,4 +20,5 @@ node_modules
 /BiddingKG/dl/LEGAL_ENTERPRISE.txt
 /BiddingKG/dl_dev/
 BiddingKG.iml
-misc.xml
+misc.xml
+/.scannerwork/css-bundle/

+ 1 - 0
.idea/.gitignore

@@ -6,3 +6,4 @@
 /dataSources.local.xml
 # Editor-based HTTP Client requests
 /httpRequests/
+/sonarlint/issuestore/

+ 1 - 1
BiddingKG.iml

@@ -7,7 +7,7 @@
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (py37)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>

+ 116 - 12
BiddingKG/dl/channel/channel_bert.py

@@ -339,6 +339,10 @@ def text_process(text):
     # text = re.sub("\s+", "", text)
     text = re.sub("\s+", " ", text)
 
+    # 优化部分未识别表达
+    text = re.sub("中止", "终止", text)
+    text = re.sub("遴选", "招标", text)
+
     return text
 
 label2class_dict = {
@@ -413,12 +417,20 @@ def channel_predict(title,text):
     # process text
     if title in text:
         text = text.replace(title, '', 1)
-    text = text_process(text)
+        text = text.lstrip(",")
+        text = text.lstrip("。")
+    if "##attachment##" in text:
+        main_text,attachment_text = text.split("##attachment##",maxsplit=1)
+        # print('main_text',main_text)
+        if len(main_text)>=500: # 正文有足够的内容时不需要使用附件预测
+            text = main_text
     text = re.sub("##attachment##。?","",text)
+    text = text_process(text)
+
     if len(text)<=100:
         # 正文内容过短时,不预测
         return
-    elif len(text)<=200:
+    elif len(text)<=150:
         # 正文内容过短时,重复正文
         text = text * 2
     text = text[:2000]
@@ -426,6 +438,7 @@ def channel_predict(title,text):
     title = title[:100]
     text = "公告标题:" + title + "。" + "公告内容:" + text
     text = text[:2000]
+    # print('predict text:',text)
 
     # to torch data
     text = [text]
@@ -445,8 +458,22 @@ def channel_predict(title,text):
     with torch.no_grad():
         outputs = model(None, text)
         predic = torch.max(outputs.data, 1)[1].cpu().numpy()
-        pred_label = predic[0]
-        pred_class = label2class_dict[pred_label]
+        pred_prob = torch.max(outputs.data, 1)[0].cpu().numpy()
+        # print('pred_prob',pred_prob)
+        if pred_prob>0.5:
+            pred_label = predic[0]
+            pred_class = label2class_dict[pred_label]
+        else:
+            return
+    # print('check rule before',pred_class)
+    # check rule
+    if pred_class==101 and re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示",title): # 纠正部分‘资审结果’模型错误识别为中标
+        pred_class = 105
+    elif pred_class==122 and re.search("验收服务",title):
+        pred_class = None
+    # elif pred_class==118 and re.search("重新招标",title): #重新招标类公告,因之前公告的废标原因而错识别为废标公告
+    #     pred_class = 52
+
 
     return pred_class
 
@@ -477,14 +504,73 @@ class_dict = {51: '公告变更',
        122: '验收合同'
               }
 
+tenderee_type = ['公告变更','招标公告','招标预告','招标答疑','资审结果','采购意向']
+win_type = ['中标信息','废标公告','候选人公示','合同公告','开标记录','验收合同']
+
 def merge_channel(list_articles,channel_dic,original_docchannel):
+
+    def merge_rule(title,text,docchannel,pred_channel,channel_dic,original_docchannel):
+        """
+        Reconcile the predicted channel with the current channel and the original channel.
+        :param title: title string searched by the keyword rules
+        :param text: body string; only a leading slice of it is searched
+        :param docchannel: current channel name, compared against class_dict values
+        :param pred_channel: predicted class id, translated to its name through class_dict
+        :param channel_dic: dict whose 'docchannel' entry is updated in place (the last
+            fallback branch rebinds it to a freshly built dict instead)
+        :param original_docchannel: original class id, looked up in class_dict with
+            '原始类别' as the default name
+        :return: channel_dic after the rules have been applied
+        """
+        # Keyword search window: the first third of text when it is longer than 300
+        # characters, otherwise the first 100 characters.
+        front_text_len = len(text)//3 if len(text)>300 else 100
+        front_text = text[:front_text_len]
+        # From here on pred_channel holds the class name, not the numeric id.
+        pred_channel = class_dict[pred_channel]
+        if pred_channel == docchannel:
+            # Prediction agrees with the current channel: keep it.
+            channel_dic['docchannel']['use_original_docchannel'] = 0
+        else:
+            # Both are one of 采购意向 / 招标预告: the keyword in title or front_text decides.
+            if pred_channel in ['采购意向','招标预告'] and docchannel in ['采购意向','招标预告']:
+                merge_res = '采购意向' if re.search("意向|意愿",title) or re.search("意向|意愿",front_text) else "招标预告"
+                channel_dic['docchannel']['docchannel'] = merge_res
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            # Both are one of 公告变更 / 招标答疑: keep the current channel.
+            elif pred_channel in ['公告变更','招标答疑'] and docchannel in ['公告变更','招标答疑']:
+                channel_dic['docchannel']['docchannel'] = docchannel
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            elif pred_channel=='公告变更' and docchannel in ['中标信息','废标公告','候选人公示','合同公告']: # a change notice of a win-type announcement is still a win-type announcement
+                channel_dic['docchannel']['docchannel'] = docchannel
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            # Mirror of the branch above: the current channel is 公告变更, so take the prediction.
+            elif docchannel=='公告变更' and pred_channel in ['中标信息','废标公告','候选人公示','合同公告']:
+                channel_dic['docchannel']['docchannel'] = pred_channel
+                channel_dic['docchannel']['use_original_docchannel'] = 0
+            # Both are one of 中标信息 / 候选人公示: a candidate/evaluation keyword in the
+            # title forces 候选人公示; otherwise an original id of 101 or 119 wins, and
+            # failing that the prediction is used.
+            elif docchannel in ['中标信息','候选人公示'] and pred_channel in ['中标信息','候选人公示']:
+                if re.search('候选人(变更)?公[告示]|评标(结果)?(公[告示]|报告)|评审结果', title):
+                    channel_dic['docchannel']['docchannel'] = '候选人公示'
+                    channel_dic['docchannel']['use_original_docchannel'] = 0
+                else:
+                    if original_docchannel in [101,119]:
+                        channel_dic['docchannel']['docchannel'] = class_dict.get(original_docchannel, '原始类别')
+                        channel_dic['docchannel']['use_original_docchannel'] = 1
+                    else:
+                        channel_dic['docchannel']['docchannel'] = pred_channel
+                        channel_dic['docchannel']['use_original_docchannel'] = 0
+            # Prediction 开标记录 against a current 中标信息 / 候选人公示: accepted only when
+            # the head of the text (first third, at least 500 characters) contains a
+            # bid-opening keyword; otherwise control falls through to the fallback below.
+            elif docchannel in ['中标信息','候选人公示'] and pred_channel=='开标记录':
+                re_text_len = max(500,len(text)//3)
+                re_text = text[:re_text_len]
+                if re.search('开标记录|截标信息|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果',re_text):
+                    channel_dic['docchannel']['docchannel'] = '开标记录'
+                    channel_dic['docchannel']['use_original_docchannel'] = 0
+
+            # Fallback when no rule above set the flag.
+            # NOTE(review): a flag already present on the incoming dict also skips this fallback.
+            if 'use_original_docchannel' not in channel_dic['docchannel']:
+                original_type = class_dict.get(original_docchannel, '原始类别')
+                if pred_channel in tenderee_type and docchannel in tenderee_type and original_type not in tenderee_type:
+                    # pred_channel and docchannel are in the same (tender / win) family while original_docchannel is not: do not use the original-site type
+                    channel_dic['docchannel']['use_original_docchannel'] = 0
+                elif pred_channel in win_type and docchannel in win_type and original_type not in win_type:
+                    # pred_channel and docchannel are in the same (tender / win) family while original_docchannel is not: do not use the original-site type
+                    channel_dic['docchannel']['use_original_docchannel'] = 0
+                else:
+                    # Otherwise replace the result with the original type.
+                    channel_dic = {'docchannel': {'doctype': '采招数据',
+                                                  'docchannel': original_type,
+                                                  'life_docchannel': original_type}}
+                    channel_dic['docchannel']['use_original_docchannel'] = 1
+
+        return channel_dic
+
+
     article = list_articles[0]
     title = article.title
     text = article.content
 
     doctype = channel_dic['docchannel']['doctype']
     docchannel = channel_dic['docchannel']['docchannel']
-    # print('doctype',doctype,'docchannel',docchannel)
+    # print('doctype',doctype,'docchannel',docchannel,'original_docchannel',original_docchannel)
     compare_type = ['公告变更','招标公告','中标信息','招标预告','招标答疑','资审结果','采购意向','废标公告','候选人公示',
                       '合同公告','开标记录','验收合同']
     # 仅比较部分数据
@@ -493,13 +579,31 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             pred = channel_predict(title, text)
             # print('pred_res', pred)
             if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
-                if class_dict[pred] == docchannel:
-                    channel_dic['docchannel']['use_original_docchannel'] = 0
-                else:
-                    channel_dic = {'docchannel': {'docchannel': '采招数据',
-                                                  'doctype': class_dict.get(original_docchannel, '原始类别'),
-                                                  'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
-                    channel_dic['docchannel']['use_original_docchannel'] = 1
+                channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
+
+    elif doctype=='采招数据' and docchannel=="":
+        pred = channel_predict(title, text)
+        if pred is not None:
+            pred = class_dict[pred]
+            channel_dic['docchannel']['docchannel'] = pred
+            channel_dic['docchannel']['use_original_docchannel'] = 0
+
+    # '招标预告'类 规则纠正
+    if channel_dic['docchannel']['doctype']=='采招数据' and channel_dic['docchannel']['docchannel']=="招标公告":
+        if "##attachment##" in text:
+            main_text, attachment_text = text.split("##attachment##", maxsplit=1)
+        else:
+            main_text = text
+        main_text = text_process(main_text)
+        # if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text[:max(500,len(main_text)//2)]):
+        if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text):
+            front_text_len = len(main_text) // 3 if len(main_text) > 300 else 100
+            front_text = main_text[:front_text_len]
+            if re.search("意向|意愿",title) or re.search("意向|意愿",front_text):
+                channel_dic['docchannel']['docchannel'] = "采购意向"
+            else:
+                channel_dic['docchannel']['docchannel'] = "招标预告"
+            channel_dic['docchannel']['use_original_docchannel'] = 0
 
     return channel_dic
 

+ 11 - 11
BiddingKG/dl/common/Utils.py

@@ -984,7 +984,7 @@ package_number_pattern = re.compile(
 |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
 |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
 |((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
-|([,;。、:(]|^)(标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
+|([,;。、:(]|^)(标的?|(招标|采购)?项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
 |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
 |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
 filter_package_pattern =  'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
@@ -1139,7 +1139,7 @@ def is_all_winner(title):
     '''
     if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
         return 1
-    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商', title):
+    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商|集中采购', title):
         return 2
     return False
 
@@ -1171,15 +1171,15 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
     # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价(元):3年投标报价(元)含税6299700.00 3年作为金额
 
-    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
-        found_yeji += 1
-    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
-        all_match = []
-    else:
-        ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
-        if ser:
-            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
-        all_match = re.finditer(pattern_money, sentence_text)
+    # if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
+    #     found_yeji += 1
+    # if found_yeji >= 2:  # 过滤掉业绩后面的所有金额 # 20250210修复逻辑错误,中标金额被前面句子业绩表达过滤 评分因素:业绩(9分),评分标准:提供2021年1月1日以来类似项目业绩, 589003579
+    #     all_match = []
+    # else:
+    ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # 过滤掉收费标准里面的金额
+    if ser:
+        sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
+    all_match = re.finditer(pattern_money, sentence_text)
     # print('all_match:', all_match)
     for _match in all_match:
         # print('_match: ', _match.group())

+ 33 - 11
BiddingKG/dl/interface/Preprocessing.py

@@ -115,9 +115,15 @@ def tableToText(soup, docid=None, return_kv=False):
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
             if len(tds)==0:
-                tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
+                if return_kv:
+                    tr_line.append([re.sub('\xa0','',tr.get_text()),0])
+                else:
+                    tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
             for td in tds:
-                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
+                if return_kv:
+                    tr_line.append([re.sub('\xa0','',td.get_text()),0])
+                else:
+                    tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
             inner_table.append(tr_line)
         return inner_table                          
@@ -1681,11 +1687,11 @@ def tableToText(soup, docid=None, return_kv=False):
         table2list = TableTag2List()
         return_html_table = True if return_kv else False
         if return_html_table:
-            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table)
+            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table,return_kv=return_kv)
             inner_table = fixTable(inner_table)
             html_table = fixTable(html_table, "")
         else:
-            inner_table = table2list.table2list(tbody, segment)
+            inner_table = table2list.table2list(tbody, segment,return_kv=return_kv)
             inner_table = fixTable(inner_table)
 
         if inner_table == []:
@@ -1778,6 +1784,8 @@ def tableToText(soup, docid=None, return_kv=False):
     # 遍历表格中的每个tbody
     tbodies = []
     in_attachment = False
+    if soup.name=="table":
+        tbodies.append((soup,in_attachment))
     for _part in soup.find_all():
         if _part.name=='table':
             tbodies.append((_part,in_attachment))
@@ -1803,7 +1811,8 @@ def tableToText(soup, docid=None, return_kv=False):
                 else:
                     tbodies[tbody_index - 1][0].append(row)
             inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
-            list_innerTable.append(inner_table)
+            if inner_table:
+                list_innerTable.append(inner_table)
             tbody_index += 2
             continue
         inner_table = trunTable(tbody,_in_attachment)
@@ -1814,6 +1823,8 @@ def tableToText(soup, docid=None, return_kv=False):
     # 遍历表格中的每个tbody
     tbodies = []
     in_attachment = False
+    if soup.name=="tbody":
+        tbodies.append((soup,in_attachment))
     for _part in soup.find_all():
         if _part.name == 'tbody':
             tbodies.append((_part, in_attachment))
@@ -1838,7 +1849,8 @@ def tableToText(soup, docid=None, return_kv=False):
                 else:
                     tbodies[tbody_index - 1][0].append(row)
             inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
-            list_innerTable.append(inner_table)
+            if inner_table:
+                list_innerTable.append(inner_table)
             tbody_index += 2
             continue
         inner_table = trunTable(tbody,_in_attachment)
@@ -1846,9 +1858,14 @@ def tableToText(soup, docid=None, return_kv=False):
         tbody_index += 1
 
     if return_kv:
-        kv_list = [x[1] for x in list_innerTable]
-        text = [x[2] for x in list_innerTable]
-        list_innerTable = [x[0] for x in list_innerTable]
+        kv_list = []
+        for x in list_innerTable:
+            if x[1] is not None:
+                kv_list.extend(x[1])
+        text = ""
+        for x in list_innerTable:
+            if x[2] is not None:
+                text += x[2]
         return soup, kv_list, text
     return soup
     # return list_innerTable
@@ -2261,7 +2278,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
     def repair_by_summation(inner_table):
         # 修复合计在中间的特殊情况
         if len(inner_table) >= 3 and len(inner_table[1]) == 2 \
-                and inner_table[1][0][0] == '合计' and inner_table[1][1][0][-1] == '%':
+                and inner_table[1][0][0] == '合计' and inner_table[1][1][0].endswith('%'):
             inner_table[1][0][1] = 0
             inner_table[1][1][1] = 0
         return inner_table
@@ -3347,6 +3364,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
         article_processed = article_processed.replace('侯选人', '候选人')  #2024/09/03 修复错别字避免预测错误。
         article_processed = article_processed.replace('人选人', '入选人')  #2024/09/03 修复错别字避免预测错误。
+        article_processed = article_processed.replace('⺠', '民')  # 2025/02/17 修复错别字 例:连云港市第一人⺠医院。
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
         article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", article_processed)
         article_processed = article_processed.replace('.','.').replace('-', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
@@ -3397,6 +3415,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed = article_processed.replace(ser.group(0), '项目代码:%s,项目名称:%s' % (
             ser.group('code'), ser.group('name')))
         article_processed = re.sub('四舍五入至', '', article_processed) # 修复 533537050 ,中标价(四舍五入至万元):6468万元
+        if re.search('推荐供应商:', article_processed) and re.search('入围供应商:', article_processed): # 修复 中国工商银行 类网站 入围的才算中标
+            article_processed = article_processed.replace('推荐供应商:', '公司名称:')
+        if web_source_no.startswith('DX016489') and re.search('排名', article_processed) and re.search('成交供应商单位名称', article_processed): # 20250219 处理特殊站源有排名却叫成交供应商
+            article_processed = article_processed.replace('成交供应商单位名称', '成交候选人单位名称')
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3634,7 +3656,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
     list_entitys = []
-    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司', '中标(成交)公司'] # 需要过滤掉的企业单位
+    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司', '中标(成交)公司', '贵州茅台', '贵州茅台酒', '陕西省省级国'] # 需要过滤掉的企业单位
     for list_sentence in list_sentences:
         sentences = []
         list_entitys_temp = []

+ 35 - 8
BiddingKG/dl/interface/extract.py

@@ -30,6 +30,7 @@ from BiddingKG.dl.ratio.re_ratio import extract_ratio
 from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
 from BiddingKG.dl.interface.get_label_dic import get_all_label
 from BiddingKG.dl.channel.channel_bert import merge_channel
+from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
 
 
 # 自定义jsonEncoder
@@ -162,7 +163,8 @@ def extractCount(extract_dict,page_attachments,web_source_name):
                     if str(classification)=='采购清单':
                         has_qingdan = True
 
-                extract_count += 2
+
+                extract_count += len(_attachments)//2+1
             if has_zhaobiao:
                 extract_count += 2
             if has_qingdan:
@@ -186,6 +188,9 @@ def extractCount(extract_dict,page_attachments,web_source_name):
     if web_source_name in set_login_web:
         extract_count -= 3
 
+    product = _extract.get("product","")
+    extract_count += len(str(product).split(","))//5
+
     return extract_count
 
 # 字符编码标准化
@@ -266,12 +271,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time.update(_cost_time)
 
     '''大纲提取及大纲内容相关提取'''
+    start_time = time.time()
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
     requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
         requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
+    # print('out_lines',out_lines)
     # if addr_bidopen_text == '':
     #     addr_bidopen_text = extract_addr(list_articles[0].content)
     addr_dic, time_dic, code_investment = predictor.getPredictor('entity_type_rule').predict(list_entitys, list_sentences, list_articles)
@@ -279,6 +286,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         addr_dic['addr_bidopen'] = addr_bidopen_text
     if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
         addr_dic['addr_bidsend'] = addr_bidsend_text
+    log("get outline done of doc_id%s"%(doc_id))
+    cost_time["outline"] = round(time.time()-start_time,2)
+
+    '''从 kvtree 正则匹配要素'''
+    start_time = time.time()
+    kv_single_dic, kv_addr_dic = get_kvtree_value(text)
+    log("get kvtree done of doc_id%s"%(doc_id))
+    cost_time["kvtree"] = round(time.time()-start_time,2)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -402,8 +417,6 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         channel_dic = {"docchannel":
              { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
         }
-        prem[0]['prem'] = {}  # 审批项目不要这项
-
     else:
         channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
     # print('msc', msc)
@@ -429,11 +442,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
 
     '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
-    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)
+    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem, product_attrs=product_attrs)
 
     '''地区获取'''
     start_time = time.time()
-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
+    # district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
+    district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic, list_entity=list_entitys[0])
     cost_time["district"] = round(time.time() - start_time, 2)
 
     '''根据district提取结果修复实体'''
@@ -471,12 +485,15 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-12-12'}
+    version_date = {'version_date': '2025-02-19'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
-        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text)
-        data_res['approval'] = approval
+        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text, nlp_enterprise=nlp_enterprise+nlp_enterprise_attachment)
+        approval = predictor.getPredictor("approval").add_ree2approval(approval , prem[0]['prem'])
+        approval = predictor.getPredictor("approval").add_codename2approval(approval , codeName)
+        data_res['prem'] = {}  # 审批项目不要这项
+        data_res['approval'] = approval[:100] # 20250217 限制获取最多100个项目
 
     if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取
         start_time = time.time() #失信数据要素提取
@@ -501,6 +518,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
 
+    '''利用采购意向需求信息补充项目'''
+    if channel_dic['docchannel']['docchannel'] == '采购意向':
+        getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
+
     data_res["project_label"] = project_label
     data_res["property_label"] = property_label
     data_res["doctitle_refine"] = doctitle_refine
@@ -545,6 +566,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res['bid_score'] = bid_score # 评标得分
     data_res['time_planned'] = time_dic.get('time_planned', '') # 预计招标时间
     data_res['code_investment'] = code_investment # 投资项目编号
+    for k, v in kv_single_dic.items(): # 没获取到的用kv_tree补充
+        if data_res.get(k, '') == '':
+            data_res[k] = v
+    for k, v in kv_addr_dic.items(): # 没获取到地址的用kv_tree补充
+        if data_res['addr_dic'].get(k, '') == '' or re.search('时间:', data_res['addr_dic'][k]):
+            data_res['addr_dic'][k] = v
 
     # for _article in list_articles:
     #         log(_article.content)

+ 28 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -3638,7 +3638,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
             #                     content_text += c["text"] + ""
             #                 print('concat_text', content_text)
 
-            if re.search(",(完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0:
+            if re.search("[;](完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0:
                 if entity.sentence_index == last_sentence_index:
                     time_type = last_time_index.get(last_time_type)
                     if time_type:
@@ -3704,7 +3704,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
                         last_time_type = ''
                 elif entity.label==6 and label_prob>0.5:
                     if len(extract_time)==1:
-                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                        if (re.search("前|截?",entity_right) and re.search("前|截?(?!时间|日期)",entity_right2[:len(entity_right)+3])) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
                             dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment))
                             last_time_type = 'time_getFileEnd'
                         else:
@@ -4895,6 +4895,32 @@ def fix_single_source(prem, channel_dic, original_docchannel):
                 if d['role_name'] == "win_tenderer":
                     d['role_name'] = 'pre_win_tenderer'
 
+def demand_to_prem(demand, prem):
+    '''
+    Add a prem entry for every demand item that has both a project name and a budget.
+    Nothing happens unless demand['data'] holds more items than prem has entries.
+    Each item of demand['data'] is stamped with a 1-based 'demand_id'.
+    :param demand: dict; its 'data' list (default empty) holds the demand item dicts
+    :param prem: dict keyed by project name, modified in place
+    :return: None
+    '''
+    demand_items = demand.get('data', [])
+    if len(demand_items) <= len(prem):
+        return
+    for demand_id, item in enumerate(demand_items, 1):
+        item['demand_id'] = demand_id
+        project_name = item.get('project_name', '')
+        budget = item.get('budget', '')
+        if project_name == '' or budget == '':
+            continue
+        # A name that is already a prem key gets the id appended, so the
+        # existing entry is not overwritten.
+        if project_name in prem:
+            prem_key = project_name + '_%d' % demand_id
+        else:
+            prem_key = project_name
+        prem[prem_key] = {
+            'demand_id': demand_id,
+            'code': '',
+            'name': project_name,
+            'roleList': [],
+            'tendereeMoney': budget,
+            'tendereeMoneyUnit': ""
+        }
+
 if __name__=="__main__":
     '''
     conn = getConnection()

File diff suppressed because it is too large
+ 1618 - 0
BiddingKG/dl/interface/html_2_kvtree.py


+ 1 - 1
BiddingKG/dl/interface/htmlparser.py

@@ -297,7 +297,7 @@ class ParseDocument():
                 if v is not None:
                     groups.append((k,v))
         if len(groups):
-            # groups.sort(key=lambda x:x[0])
+            groups.sort(key=lambda x:x[0])
             return groups
         return None
 

+ 66 - 0
BiddingKG/dl/interface/kvtree_search.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2024/12/26 10:31
+"""
+from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
+import re
+
+# Key patterns used by get_kvtree_value: each one is handed to Html2KVTree.extract_kv
+# to select key/value pairs whose key matches.
+# NOTE(review): requirement_pattern and aptitude_pattern look like near-copies of the
+# same-named constants in outline_extractor.py -- confirm whether they must stay in sync.
+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
+aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求|项目要求"
+
+pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
+addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)"
+addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
+
+# Output field name -> key pattern, for fields returned in kv_single_dic.
+pattern_dic_single = {'requirement': requirement_pattern,
+               'aptitude': aptitude_pattern,
+               'pinmu_name': pinmu_name_pattern}
+# Output field name -> key pattern, for fields returned in kv_addr_dic.
+pattern_dic_addr = {'addr_bidopen': addr_bidopen_pattern,
+                    'addr_bidsend': addr_bidsend_pattern}
+
+def get_kvtree_value(html):
+    '''
+    Build a KV tree from html and collect the values whose key matches one of the
+    module-level patterns.
+    :param html: html text passed to Html2KVTree
+    :return: (kv_single_dic, kv_addr_dic): stand-alone fields and address-dict fields
+    '''
+    kv_tree = Html2KVTree(html)
+
+    # Fields that are reported on their own.
+    kv_single_dic = {}
+    for field, pattern in pattern_dic_single.items():
+        chosen = ''
+        for pair in kv_tree.extract_kv(pattern):
+            key_text = pair.get('key', '')
+            hit = re.search(pattern, key_text)
+            # Doc 571545382: skip keys where the match ends inside the first half, e.g.
+            # "供应商要求澄清采购文件的截止时间" would otherwise yield aptitude via "供应商要求".
+            if hit and hit.end() / len(key_text) < 0.5:
+                continue
+            # Doc 571425136: skip pairs whose value only repeats the key ("1.供应商资质").
+            if key_text.strip() == pair.get('value', '').strip():
+                continue
+            candidate = pair.get('value', '').strip()
+            if candidate != '':
+                chosen = candidate
+                break
+        # Keep the value only when it contains at least two consecutive Chinese characters.
+        if chosen != '' and re.search('[\u4e00-\u9fa5]{2,}', chosen):
+            kv_single_dic[field] = chosen
+
+    # Fields that go into the address dict.
+    kv_addr_dic = {}
+    for field, pattern in pattern_dic_addr.items():
+        chosen = ''
+        for pair in kv_tree.extract_kv(pattern):
+            candidate = pair.get('value', '').strip()
+            if candidate == '':
+                continue
+            chosen = candidate
+            # The value also carries a time: keep only the captured address part.
+            if re.search('时间:', chosen):
+                addr_hit = re.search('地[点址]:(?P<addr>[\w()()【】-]{5,50})[,。]', chosen)
+                if addr_hit:
+                    chosen = addr_hit.group('addr')
+            break
+        # Keep only values that look like an address (region suffix, street number,
+        # purchasing site, platform or company); avoids placeholders such as
+        # "文件获取地点:--" (doc 571236792).
+        if chosen != '' and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', chosen):
+            kv_addr_dic[field] = chosen
+    return kv_single_dic, kv_addr_dic
+
+if __name__ == "__main__":
+    # Manual check against a local sample page.
+    sample_path = 'd:/html/2.html'
+    with open(sample_path, encoding='utf-8') as sample_file:
+        print(get_kvtree_value(sample_file.read()))

+ 9 - 9
BiddingKG/dl/interface/outline_extractor.py

@@ -57,10 +57,11 @@ def extract_sentence_list(sentence_list):
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
                           "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
-aptitude_pattern = "((资格|资质)[的及]?(要求|条件)|竞买资格及要求|供应商报价须知)([::,]|$)|(竞买|竞买人|竞投人)?资格(条件)?:|按以下要求参与竞买|(报名|竞买)(条件|资格)"
+aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求"
+
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
-pinmu_name_pattern = "采购品目名称([::,]|$)"
+pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
 out_lines = []
 policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
 not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
@@ -119,7 +120,7 @@ def extract_parameters(parse_document):
                 # outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
                 #                  re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
 
-                if re.search(aptitude_pattern,_text[:30]) is not None:
+                if re.search(aptitude_pattern,_text[:15]) is not None:
                     childs = get_childs([_data])
                     for c in childs:
                         aptitude_text += c["text"]
@@ -181,12 +182,11 @@ def extract_parameters(parse_document):
                 if it not in list_policy:
                     list_policy.append(it.group(0))
 
-    ser = re.search('地[址点][:为](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidopen_text) or re.search('[:,](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidopen_text)
-    if ser:
-        addr_bidopen_text = ser.group('addr')
-    ser = re.search('地[址点][:为](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidsend_text) or re.search('[:,](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidsend_text)
-    if ser:
-        addr_bidsend_text = ser.group('addr')
+    ser = re.search('地[址点][:为](?P<addr>([\w()()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w()()【】-]{,60}))[,。]', addr_bidopen_text)
+    addr_bidopen_text = ser.group('addr') if ser else ''
+
+    ser = re.search('地[址点][:为](?P<addr>([\w()()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w()()【】-]{,60}))[,。]', addr_bidsend_text)
+    addr_bidsend_text = ser.group('addr') if ser else ''
     if re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
         addr_bidopen_text = ""
     ser = re.search(pinmu_name_pattern, pinmu_name)

File diff suppressed because it is too large
+ 391 - 314
BiddingKG/dl/interface/predictor.py


Some files were not shown because too many files changed in this diff