4 months ago · ab3d4cdcd0
--- a/.gitignore
+++ b/.gitignore
@@ -20,4 +20,5 @@ node_modules
 
				 /BiddingKG/dl/LEGAL_ENTERPRISE.txt
			
 
				 /BiddingKG/dl_dev/
			
 
				 BiddingKG.iml
			
 
				-misc.xml
			
 
				+misc.xml
			
 
				+/.scannerwork/css-bundle/
			
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -6,3 +6,4 @@
 
				 /dataSources.local.xml
			
 
				 # Editor-based HTTP Client requests
			
 
				 /httpRequests/
			
 
				+/sonarlint/issuestore/
			
--- a/BiddingKG.iml
+++ b/BiddingKG.iml
@@ -7,7 +7,7 @@
 
				   </component>
			
 
				   <component name="NewModuleRootManager">
			
 
				     <content url="file://$MODULE_DIR$" />
			
 
				-    <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
			
 
				+    <orderEntry type="jdk" jdkName="Python 3.7 (py37)" jdkType="Python SDK" />
			
 
				     <orderEntry type="sourceFolder" forTests="false" />
			
 
				     <orderEntry type="library" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
			
 
				   </component>
			
--- a/BiddingKG/dl/channel/channel_bert.py
+++ b/BiddingKG/dl/channel/channel_bert.py
@@ -339,6 +339,10 @@ def text_process(text):
 
				     # text = re.sub("\s+", "", text)
			
 
				     text = re.sub("\s+", " ", text)
			
 
				 
			
 
				+    # 优化部分未识别表达
			
 
				+    text = re.sub("中止", "终止", text)
			
 
				+    text = re.sub("遴选", "招标", text)
			
 
				+
			
 
				     return text
			
 
				 
			
 
				 label2class_dict = {
			
@@ -413,12 +417,20 @@ def channel_predict(title,text):
 
				     # process text
			
 
				     if title in text:
			
 
				         text = text.replace(title, '', 1)
			
 
				-    text = text_process(text)
			
 
				+        text = text.lstrip("，")
			
 
				+        text = text.lstrip("。")
			
 
				+    if "##attachment##" in text:
			
 
				+        main_text,attachment_text = text.split("##attachment##",maxsplit=1)
			
 
				+        # print('main_text',main_text)
			
 
				+        if len(main_text)>=500: # 正文有足够的内容时不需要使用附件预测
			
 
				+            text = main_text
			
 
				     text = re.sub("##attachment##。?","",text)
			
 
				+    text = text_process(text)
			
 
				+
			
 
				     if len(text)<=100:
			
 
				         # 正文内容过短时，不预测
			
 
				         return
			
 
				-    elif len(text)<=200:
			
 
				+    elif len(text)<=150:
			
 
				         # 正文内容过短时，重复正文
			
 
				         text = text * 2
			
 
				     text = text[:2000]
			
@@ -426,6 +438,7 @@ def channel_predict(title,text):
 
				     title = title[:100]
			
 
				     text = "公告标题：" + title + "。" + "公告内容：" + text
			
 
				     text = text[:2000]
			
 
				+    # print('predict text:',text)
			
 
				 
			
 
				     # to torch data
			
 
				     text = [text]
			
@@ -445,8 +458,22 @@ def channel_predict(title,text):
 
				     with torch.no_grad():
			
 
				         outputs = model(None, text)
			
 
				         predic = torch.max(outputs.data, 1)[1].cpu().numpy()
			
 
				-        pred_label = predic[0]
			
 
				-        pred_class = label2class_dict[pred_label]
			
 
				+        pred_prob = torch.max(outputs.data, 1)[0].cpu().numpy()
			
 
				+        # print('pred_prob',pred_prob)
			
 
				+        if pred_prob>0.5:
			
 
				+            pred_label = predic[0]
			
 
				+            pred_class = label2class_dict[pred_label]
			
 
				+        else:
			
 
				+            return
			
 
				+    # print('check rule before',pred_class)
			
 
				+    # check rule
			
 
				+    if pred_class==101 and re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示",title): # 纠正部分‘资审结果’模型错误识别为中标
			
 
				+        pred_class = 105
			
 
				+    elif pred_class==122 and re.search("验收服务",title):
			
 
				+        pred_class = None
			
 
				+    # elif pred_class==118 and re.search("重新招标",title): #重新招标类公告，因之前公告的废标原因而错识别为废标公告
			
 
				+    #     pred_class = 52
			
 
				+
			
 
				 
			
 
				     return pred_class
			
 
				 
			
@@ -477,14 +504,73 @@ class_dict = {51: '公告变更',
 
				        122: '验收合同'
			
 
				               }
			
 
				 
			
 
				+tenderee_type = ['公告变更','招标公告','招标预告','招标答疑','资审结果','采购意向']  # channel names merge_rule treats as the tender-side (招标) family
			
 
				+win_type = ['中标信息','废标公告','候选人公示','合同公告','开标记录','验收合同']  # channel names merge_rule treats as the award-side (中标) family
			
 
				+
			
 
				 def merge_channel(list_articles,channel_dic,original_docchannel):
			
 
				+
			
 
				+    def merge_rule(title,text,docchannel,pred_channel,channel_dic,original_docchannel):  # reconcile the model-predicted channel with the current docchannel and the source site's original_docchannel; returns channel_dic (updated in place or rebuilt)
			
 
				+        front_text_len = len(text)//3 if len(text)>300 else 100  # leading slice length: a third of text when len(text)>300, else 100
			
 
				+        front_text = text[:front_text_len]
			
 
				+        pred_channel = class_dict[pred_channel]  # class id -> channel name; pred_channel is a name from here on
			
 
				+        if pred_channel == docchannel:  # prediction agrees with the current channel
			
 
				+            channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+        else:
			
 
				+            if pred_channel in ['采购意向','招标预告'] and docchannel in ['采购意向','招标预告']:  # both sides are intention / pre-announcement: decide by keyword in title or leading text
			
 
				+                merge_res = '采购意向' if re.search("意向|意愿",title) or re.search("意向|意愿",front_text) else "招标预告"
			
 
				+                channel_dic['docchannel']['docchannel'] = merge_res
			
 
				+                channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+            elif pred_channel in ['公告变更','招标答疑'] and docchannel in ['公告变更','招标答疑']:  # both sides are change / Q&A: keep the current channel
			
 
				+                channel_dic['docchannel']['docchannel'] = docchannel
			
 
				+                channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+            elif pred_channel=='公告变更' and docchannel in ['中标信息','废标公告','候选人公示','合同公告']: # a change notice on an award-type announcement is still an award-type announcement
			
 
				+                channel_dic['docchannel']['docchannel'] = docchannel
			
 
				+                channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+            elif docchannel=='公告变更' and pred_channel in ['中标信息','废标公告','候选人公示','合同公告']:  # mirror case: current channel is a change notice, prediction is award-type -> take the prediction
			
 
				+                channel_dic['docchannel']['docchannel'] = pred_channel
			
 
				+                channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+            elif docchannel in ['中标信息','候选人公示'] and pred_channel in ['中标信息','候选人公示']:  # both sides are award / candidate: decide by title pattern
			
 
				+                if re.search('候选人(变更)?公[告示]|评标(结果)?(公[告示]|报告)|评审结果', title):
			
 
				+                    channel_dic['docchannel']['docchannel'] = '候选人公示'
			
 
				+                    channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+                else:
			
 
				+                    if original_docchannel in [101,119]:  # title did not match and the original channel id is 101 or 119 -> use the source-site channel
			
 
				+                        channel_dic['docchannel']['docchannel'] = class_dict.get(original_docchannel, '原始类别')
			
 
				+                        channel_dic['docchannel']['use_original_docchannel'] = 1
			
 
				+                    else:
			
 
				+                        channel_dic['docchannel']['docchannel'] = pred_channel
			
 
				+                        channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+            elif docchannel in ['中标信息','候选人公示'] and pred_channel=='开标记录':  # bid-opening record predicted: accepted only if the leading text mentions one
			
 
				+                re_text_len = max(500,len(text)//3)  # search window: first third of text, at least 500 chars
			
 
				+                re_text = text[:re_text_len]
			
 
				+                if re.search('开标记录|截标信息|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果',re_text):
			
 
				+                    channel_dic['docchannel']['docchannel'] = '开标记录'
			
 
				+                    channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+
			
 
				+            if 'use_original_docchannel' not in channel_dic['docchannel']:  # no rule above set the flag: fall back on channel families
			
 
				+                original_type = class_dict.get(original_docchannel, '原始类别')
			
 
				+                if pred_channel in tenderee_type and docchannel in tenderee_type and original_type not in tenderee_type:
			
 
				+                    # pred_channel and docchannel belong to the same family (tender/award) while original_docchannel does not: do not use the source-site channel
			
 
				+                    channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+                elif pred_channel in win_type and docchannel in win_type and original_type not in win_type:
			
 
				+                    # pred_channel and docchannel belong to the same family (tender/award) while original_docchannel does not: do not use the source-site channel
			
 
				+                    channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+                else:  # otherwise rebuild channel_dic from the source-site channel
			
 
				+                    channel_dic = {'docchannel': {'doctype': '采招数据',
			
 
				+                                                  'docchannel': original_type,
			
 
				+                                                  'life_docchannel': original_type}}
			
 
				+                    channel_dic['docchannel']['use_original_docchannel'] = 1
			
 
				+
			
 
				+        return channel_dic
			
 
				+
			
 
				+
			
 
				     article = list_articles[0]
			
 
				     title = article.title
			
 
				     text = article.content
			
 
				 
			
 
				     doctype = channel_dic['docchannel']['doctype']
			
 
				     docchannel = channel_dic['docchannel']['docchannel']
			
 
				-    # print('doctype',doctype,'docchannel',docchannel)
			
 
				+    # print('doctype',doctype,'docchannel',docchannel,'original_docchannel',original_docchannel)
			
 
				     compare_type = ['公告变更','招标公告','中标信息','招标预告','招标答疑','资审结果','采购意向','废标公告','候选人公示',
			
 
				                       '合同公告','开标记录','验收合同']
			
 
				     # 仅比较部分数据
			
@@ -493,13 +579,31 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
 
				             pred = channel_predict(title, text)
			
 
				             # print('pred_res', pred)
			
 
				             if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
			
 
				-                if class_dict[pred] == docchannel:
			
 
				-                    channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				-                else:
			
 
				-                    channel_dic = {'docchannel': {'docchannel': '采招数据',
			
 
				-                                                  'doctype': class_dict.get(original_docchannel, '原始类别'),
			
 
				-                                                  'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
			
 
				-                    channel_dic['docchannel']['use_original_docchannel'] = 1
			
 
				+                channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
			
 
				+
			
 
				+    elif doctype=='采招数据' and docchannel=="":
			
 
				+        pred = channel_predict(title, text)
			
 
				+        if pred is not None:
			
 
				+            pred = class_dict[pred]
			
 
				+            channel_dic['docchannel']['docchannel'] = pred
			
 
				+            channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				+
			
 
				+    # '招标预告'类 规则纠正
			
 
				+    if channel_dic['docchannel']['doctype']=='采招数据' and channel_dic['docchannel']['docchannel']=="招标公告":
			
 
				+        if "##attachment##" in text:
			
 
				+            main_text, attachment_text = text.split("##attachment##", maxsplit=1)
			
 
				+        else:
			
 
				+            main_text = text
			
 
				+        main_text = text_process(main_text)
			
 
				+        # if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text[:max(500,len(main_text)//2)]):
			
 
				+        if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text):
			
 
				+            front_text_len = len(main_text) // 3 if len(main_text) > 300 else 100
			
 
				+            front_text = main_text[:front_text_len]
			
 
				+            if re.search("意向|意愿",title) or re.search("意向|意愿",front_text):
			
 
				+                channel_dic['docchannel']['docchannel'] = "采购意向"
			
 
				+            else:
			
 
				+                channel_dic['docchannel']['docchannel'] = "招标预告"
			
 
				+            channel_dic['docchannel']['use_original_docchannel'] = 0
			
 
				 
			
 
				     return channel_dic
			
 
				 
			
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -984,7 +984,7 @@ package_number_pattern = re.compile(
 
				 |(([a-zA-Z]包[：（）]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
			
 
				 |(([，；。、：（]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
			
 
				 |((标[段包项]|品目|标段（包）|包[组件标]|[标分子（]包)(\[|【)?：?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
			
 
				-|([，；。、：（]|^)(标的?|项目|子项目?)(\[|【)?：?([一二三四五六七八九十]+|[0-9]{1,9})\
			
 
				+|([，；。、：（]|^)(标的?|(招标|采购)?项目|子项目?)(\[|【)?：?([一二三四五六七八九十]+|[0-9]{1,9})\
			
 
				 |((([标分子（]|合同|项目|采购)包|[，。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[:：]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
			
 
				 |[，；。、：（]?(合同|分|子)?包：?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
			
 
				 filter_package_pattern =  'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
			
@@ -1139,7 +1139,7 @@ def is_all_winner(title):
 
				     '''
			
 
				     if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
			
 
				         return 1
			
 
				-    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商', title):
			
 
				+    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商|集中采购', title):
			
 
				         return 2
			
 
				     return False
			
 
				 
			
@@ -1171,15 +1171,15 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
				 
			
 
				     # sentence_text = re.sub('\d+[年月日]', '', sentence_text) # 修复560180018 中标价（元）：3年投标报价（元）含税6299700.00 3年作为金额
			
 
				 
			
 
				-    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
 
				-        found_yeji += 1
			
 
				-    if found_yeji >= 2:  # 过滤掉业绩后面的所有金额
			
 
				-        all_match = []
			
 
				-    else:
			
 
				-        ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
			
 
				-        if ser:
			
 
				-            sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
			
 
				-        all_match = re.finditer(pattern_money, sentence_text)
			
 
				+    # if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
			
 
				+    #     found_yeji += 1
			
 
				+    # if found_yeji >= 2:  # 过滤掉业绩后面的所有金额 # 20250210修复逻辑错误，中标金额被前面句子业绩表达过滤 评分因素：业绩（9分），评分标准：提供2021年1月1日以来类似项目业绩， 589003579
			
 
				+    #     all_match = []
			
 
				+    # else:
			
 
				+    ser = re.search('((收费标准|计算[方公]?式)：|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s（）()\[\]【】\d\.%％‰\+\-*×/]{20,}[，。]?', sentence_text)  # 过滤掉收费标准里面的金额
			
 
				+    if ser:
			
 
				+        sentence_text = sentence_text.replace(ser.group(0), ' ' * len(ser.group(0)))
			
 
				+    all_match = re.finditer(pattern_money, sentence_text)
			
 
				     # print('all_match:', all_match)
			
 
				     for _match in all_match:
			
 
				         # print('_match: ', _match.group())
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -115,9 +115,15 @@ def tableToText(soup, docid=None, return_kv=False):
 
				             tr_line = []
			
 
				             tds = tr.findChildren(['td','th'], recursive=False)
			
 
				             if len(tds)==0:
			
 
				-                tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
			
 
				+                if return_kv:
			
 
				+                    tr_line.append([re.sub('\xa0','',tr.get_text()),0])
			
 
				+                else:
			
 
				+                    tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
			
 
				             for td in tds:
			
 
				-                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
			
 
				+                if return_kv:
			
 
				+                    tr_line.append([re.sub('\xa0','',td.get_text()),0])
			
 
				+                else:
			
 
				+                    tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
			
 
				                 #tr_line.append([td.get_text(),0])
			
 
				             inner_table.append(tr_line)
			
 
				         return inner_table                          
			
@@ -1681,11 +1687,11 @@ def tableToText(soup, docid=None, return_kv=False):
 
				         table2list = TableTag2List()
			
 
				         return_html_table = True if return_kv else False
			
 
				         if return_html_table:
			
 
				-            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table)
			
 
				+            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table,return_kv=return_kv)
			
 
				             inner_table = fixTable(inner_table)
			
 
				             html_table = fixTable(html_table, "")
			
 
				         else:
			
 
				-            inner_table = table2list.table2list(tbody, segment)
			
 
				+            inner_table = table2list.table2list(tbody, segment,return_kv=return_kv)
			
 
				             inner_table = fixTable(inner_table)
			
 
				 
			
 
				         if inner_table == []:
			
@@ -1778,6 +1784,8 @@ def tableToText(soup, docid=None, return_kv=False):
 
				     # 遍历表格中的每个tbody
			
 
				     tbodies = []
			
 
				     in_attachment = False
			
 
				+    if soup.name=="table":
			
 
				+        tbodies.append((soup,in_attachment))
			
 
				     for _part in soup.find_all():
			
 
				         if _part.name=='table':
			
 
				             tbodies.append((_part,in_attachment))
			
@@ -1803,7 +1811,8 @@ def tableToText(soup, docid=None, return_kv=False):
 
				                 else:
			
 
				                     tbodies[tbody_index - 1][0].append(row)
			
 
				             inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
			
 
				-            list_innerTable.append(inner_table)
			
 
				+            if inner_table:
			
 
				+                list_innerTable.append(inner_table)
			
 
				             tbody_index += 2
			
 
				             continue
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
@@ -1814,6 +1823,8 @@ def tableToText(soup, docid=None, return_kv=False):
 
				     # 遍历表格中的每个tbody
			
 
				     tbodies = []
			
 
				     in_attachment = False
			
 
				+    if soup.name=="tbody":
			
 
				+        tbodies.append((soup,in_attachment))
			
 
				     for _part in soup.find_all():
			
 
				         if _part.name == 'tbody':
			
 
				             tbodies.append((_part, in_attachment))
			
@@ -1838,7 +1849,8 @@ def tableToText(soup, docid=None, return_kv=False):
 
				                 else:
			
 
				                     tbodies[tbody_index - 1][0].append(row)
			
 
				             inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
			
 
				-            list_innerTable.append(inner_table)
			
 
				+            if inner_table:
			
 
				+                list_innerTable.append(inner_table)
			
 
				             tbody_index += 2
			
 
				             continue
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
@@ -1846,9 +1858,14 @@ def tableToText(soup, docid=None, return_kv=False):
 
				         tbody_index += 1
			
 
				 
			
 
				     if return_kv:
			
 
				-        kv_list = [x[1] for x in list_innerTable]
			
 
				-        text = [x[2] for x in list_innerTable]
			
 
				-        list_innerTable = [x[0] for x in list_innerTable]
			
 
				+        kv_list = []
			
 
				+        for x in list_innerTable:
			
 
				+            if x[1] is not None:
			
 
				+                kv_list.extend(x[1])
			
 
				+        text = ""
			
 
				+        for x in list_innerTable:
			
 
				+            if x[2] is not None:
			
 
				+                text += x[2]
			
 
				         return soup, kv_list, text
			
 
				     return soup
			
 
				     # return list_innerTable
			
@@ -2261,7 +2278,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
 
				     def repair_by_summation(inner_table):
			
 
				         # 修复合计在中间的特殊情况
			
 
				         if len(inner_table) >= 3 and len(inner_table[1]) == 2 \
			
 
				-                and inner_table[1][0][0] == '合计' and inner_table[1][1][0][-1] == '%':
			
 
				+                and inner_table[1][0][0] == '合计' and inner_table[1][1][0].endswith('%'):
			
 
				             inner_table[1][0][1] = 0
			
 
				             inner_table[1][1][1] = 0
			
 
				         return inner_table
			
@@ -3347,6 +3364,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = article_processed.replace('(', '（').replace(')', '）')  #2022/8/10 统一为中文括号
			
 
				         article_processed = article_processed.replace('侯选人', '候选人')  #2024/09/03 修复错别字避免预测错误。
			
 
				         article_processed = article_processed.replace('人选人', '入选人')  #2024/09/03 修复错别字避免预测错误。
			
 
				+        article_processed = article_processed.replace('⺠', '民')  # 2025/02/17 修复错别字 例：连云港市第一人⺠医院。
			
 
				         # article_processed = article_processed.replace(':', '：')  #2023/1/5 统一为中文冒号
			
 
				         article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", "：", article_processed)
			
 
				         article_processed = article_processed.replace('．','.').replace('－', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
			
@@ -3397,6 +3415,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				             article_processed = article_processed.replace(ser.group(0), '项目代码：%s，项目名称：%s' % (
			
 
				             ser.group('code'), ser.group('name')))
			
 
				         article_processed = re.sub('四舍五入至', '', article_processed) # 修复 533537050 ，中标价（四舍五入至万元）：6468万元
			
 
				+        if re.search('推荐供应商：', article_processed) and re.search('入围供应商：', article_processed): # 修复 中国工商银行 类网站 入围的才算中标
			
 
				+            article_processed = article_processed.replace('推荐供应商：', '公司名称：')
			
 
				+        if web_source_no.startswith('DX016489') and re.search('排名', article_processed) and re.search('成交供应商单位名称', article_processed): # 20250219 处理特殊站源有排名却叫成交供应商
			
 
				+            article_processed = article_processed.replace('成交供应商单位名称', '成交候选人单位名称')
			
 
				 
			
 
				         '''去除业绩内容'''
			
 
				         article_processed = del_achievement(article_processed)
			
@@ -3634,7 +3656,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				     '''
			
 
				 
			
 
				     list_entitys = []
			
 
				-    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司', '中标（成交）公司'] # 需要过滤掉的企业单位
			
 
				+    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司', '中标（成交）公司', '贵州茅台', '贵州茅台酒', '陕西省省级国'] # 需要过滤掉的企业单位
			
 
				     for list_sentence in list_sentences:
			
 
				         sentences = []
			
 
				         list_entitys_temp = []
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -30,6 +30,7 @@ from BiddingKG.dl.ratio.re_ratio import extract_ratio
 
				 from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
			
 
				 from BiddingKG.dl.interface.get_label_dic import get_all_label
			
 
				 from BiddingKG.dl.channel.channel_bert import merge_channel
			
 
				+from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
			
 
				 
			
 
				 
			
 
				 # 自定义jsonEncoder
			
@@ -162,7 +163,8 @@ def extractCount(extract_dict,page_attachments,web_source_name):
 
				                     if str(classification)=='采购清单':
			
 
				                         has_qingdan = True
			
 
				 
			
 
				-                extract_count += 2
			
 
				+
			
 
				+                extract_count += len(_attachments)//2+1
			
 
				             if has_zhaobiao:
			
 
				                 extract_count += 2
			
 
				             if has_qingdan:
			
@@ -186,6 +188,9 @@ def extractCount(extract_dict,page_attachments,web_source_name):
 
				     if web_source_name in set_login_web:
			
 
				         extract_count -= 3
			
 
				 
			
 
				+    product = _extract.get("product","")
			
 
				+    extract_count += len(str(product).split(","))//5
			
 
				+
			
 
				     return extract_count
			
 
				 
			
 
				 # 字符编码标准化
			
@@ -266,12 +271,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     cost_time.update(_cost_time)
			
 
				 
			
 
				     '''大纲提取及大纲内容相关提取'''
			
 
				+    start_time = time.time()
			
 
				     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
			
 
				     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
			
 
				     requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
			
 
				     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
			
 
				         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
			
 
				         requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
			
 
				+    # print('out_lines',out_lines)
			
 
				     # if addr_bidopen_text == '':
			
 
				     #     addr_bidopen_text = extract_addr(list_articles[0].content)
			
 
				     addr_dic, time_dic, code_investment = predictor.getPredictor('entity_type_rule').predict(list_entitys, list_sentences, list_articles)
			
@@ -279,6 +286,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				         addr_dic['addr_bidopen'] = addr_bidopen_text
			
 
				     if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
			
 
				         addr_dic['addr_bidsend'] = addr_bidsend_text
			
 
				+    log("get outline done of doc_id%s"%(doc_id))
			
 
				+    cost_time["outline"] = round(time.time()-start_time,2)
			
 
				+
			
 
				+    '''从 kvtree 正则匹配要素'''
			
 
				+    start_time = time.time()
			
 
				+    kv_single_dic, kv_addr_dic = get_kvtree_value(text)
			
 
				+    log("get kvtree done of doc_id%s"%(doc_id))
			
 
				+    cost_time["kvtree"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     # 过滤掉Redis里值为0的错误实体
			
 
				     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
			
@@ -402,8 +417,6 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				         channel_dic = {"docchannel":
			
 
				              { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
			
 
				         }
			
 
				-        prem[0]['prem'] = {}  # 审批项目不要这项
			
 
				-
			
 
				     else:
			
 
				         channel_dic, msc = predictor.getPredictor("channel").final_change(channel_dic, prem[0], original_docchannel, msc)
			
 
				     # print('msc', msc)
			
@@ -429,11 +442,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
			
 
				 
			
 
				     '''行业分类提取，需要用标题、项目名称、产品、及prem 里面的角色'''
			
 
				-    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)
			
 
				+    industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem, product_attrs=product_attrs)
			
 
				 
			
 
				     '''地区获取'''
			
 
				     start_time = time.time()
			
 
				-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
			
 
				+    # district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
			
 
				+    district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic, list_entity=list_entitys[0])
			
 
				     cost_time["district"] = round(time.time() - start_time, 2)
			
 
				 
			
 
				     '''根据district提取结果修复实体'''
			
@@ -471,12 +485,15 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-12-12'}
			
 
				+    version_date = {'version_date': '2025-02-19'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
 
				-        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text)
			
 
				-        data_res['approval'] = approval
			
 
				+        approval = predictor.getPredictor("approval").predict(list_sentences, list_entitys, text, nlp_enterprise=nlp_enterprise+nlp_enterprise_attachment)
			
 
				+        approval = predictor.getPredictor("approval").add_ree2approval(approval , prem[0]['prem'])
			
 
				+        approval = predictor.getPredictor("approval").add_codename2approval(approval , codeName)
			
 
				+        data_res['prem'] = {}  # 审批项目不要这项
			
 
				+        data_res['approval'] = approval[:100] # 20250217 限制获取最多100个项目
			
 
				 
			
 
				     if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取
			
 
				         start_time = time.time() #失信数据要素提取
			
@@ -501,6 +518,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     '''最终检查修正招标、中标金额'''
			
 
				     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
			
 
				 
			
 
				+    '''利用采购意向需求信息补充项目'''
			
 
				+    if channel_dic['docchannel']['docchannel'] == '采购意向':
			
 
				+        getAttributes.demand_to_prem(data_res.get('demand_info', {}), prem[0]['prem'])
			
 
				+
			
 
				     data_res["project_label"] = project_label
			
 
				     data_res["property_label"] = property_label
			
 
				     data_res["doctitle_refine"] = doctitle_refine
			
@@ -545,6 +566,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     data_res['bid_score'] = bid_score # 评标得分
			
 
				     data_res['time_planned'] = time_dic.get('time_planned', '') # 预计招标时间
			
 
				     data_res['code_investment'] = code_investment # 投资项目编号
			
 
				+    for k, v in kv_single_dic.items(): # 没获取到的用kv_tree补充
			
 
				+        if data_res.get(k, '') == '':
			
 
				+            data_res[k] = v
			
 
				+    for k, v in kv_addr_dic.items(): # 没获取到地址的用kv_tree补充
			
 
				+        if data_res['addr_dic'].get(k, '') == '' or re.search('时间：', data_res['addr_dic'][k]):
			
 
				+            data_res['addr_dic'][k] = v
			
 
				 
			
 
				     # for _article in list_articles:
			
 
				     #         log(_article.content)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -3638,7 +3638,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
 
				             #                     content_text += c["text"] + ""
			
 
				             #                 print('concat_text', content_text)
			
 
				 
			
 
				-            if re.search("，(完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0:
			
 
				+            if re.search("[，；](完成|截止|结束)(时间|日期)", entity_left2[-8:]) and entity.label==0:
			
 
				                 if entity.sentence_index == last_sentence_index:
			
 
				                     time_type = last_time_index.get(last_time_type)
			
 
				                     if time_type:
			
@@ -3704,7 +3704,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
 
				                         last_time_type = ''
			
 
				                 elif entity.label==6 and label_prob>0.5:
			
 
				                     if len(extract_time)==1:
			
 
				-                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
			
 
				+                        if (re.search("前|截?止",entity_right) and re.search("前|截?止(?!时间|日期)",entity_right2[:len(entity_right)+3])) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
			
 
				                             dict_time['time_getFileEnd'].append((extract_time[0], label_prob,in_attachment))
			
 
				                             last_time_type = 'time_getFileEnd'
			
 
				                         else:
			
@@ -4895,6 +4895,32 @@ def fix_single_source(prem, channel_dic, original_docchannel):
 
				                 if d['role_name'] == "win_tenderer":
			
 
				                     d['role_name'] = 'pre_win_tenderer'
			
 
				 
			
 
				+def demand_to_prem(demand, prem):
			
 
				+    if len(demand.get('data', [])) > len(prem):
			
 
				+        i = 1
			
 
				+        for d in demand.get('data', []):
			
 
				+            d['demand_id'] = i
			
 
				+            if d.get('project_name', '') != '' and d.get('budget', '') != '':
			
 
				+                if d.get('project_name', '') not in prem:
			
 
				+                    prem[d.get('project_name', '')] = {
			
 
				+                        'demand_id': i,
			
 
				+                        'code': '',
			
 
				+                        'name': d.get('project_name', ''),
			
 
				+                        'roleList': [],
			
 
				+                        'tendereeMoney': d.get('budget', ''),
			
 
				+                        'tendereeMoneyUnit': ""
			
 
				+                    }
			
 
				+                else:
			
 
				+                    prem[d.get('project_name', '')+'_%d'%i] = {
			
 
				+                        'demand_id': i,
			
 
				+                        'code': '',
			
 
				+                        'name': d.get('project_name', ''),
			
 
				+                        'roleList': [],
			
 
				+                        'tendereeMoney': d.get('budget', ''),
			
 
				+                        'tendereeMoneyUnit': ""
			
 
				+                    }
			
 
				+            i += 1
			
 
				+
			
 
				 if __name__=="__main__":
			
 
				     '''
			
 
				     conn = getConnection()
			
--- a/BiddingKG/dl/interface/html_2_kvtree.py
+++ b/BiddingKG/dl/interface/html_2_kvtree.py
--- a/BiddingKG/dl/interface/htmlparser.py
+++ b/BiddingKG/dl/interface/htmlparser.py
@@ -297,7 +297,7 @@ class ParseDocument():
 
				                 if v is not None:
			
 
				                     groups.append((k,v))
			
 
				         if len(groups):
			
 
				-            # groups.sort(key=lambda x:x[0])
			
 
				+            groups.sort(key=lambda x:x[0])
			
 
				             return groups
			
 
				         return None
			
 
				 
			
--- a/BiddingKG/dl/interface/kvtree_search.py
+++ b/BiddingKG/dl/interface/kvtree_search.py
@@ -0,0 +1,66 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+
			
 
				+"""
			
 
				+@author: bidikeji
			
 
				+@time: 2024/12/26 10:31
			
 
				+"""
			
 
				+from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
			
 
				+import re
			
 
				+
			
 
				+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
			
 
				+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
			
 
				+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([:：，]|$)"
			
 
				+aptitude_pattern = "资质（资格）要求|资格（资质）要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求|项目要求"
			
 
				+
			
 
				+pinmu_name_pattern = "采购品目(名称)?([:：，]|$)"
			
 
				+addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[)）]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([:：，]|$)"
			
 
				+addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([:：，]|$)"
			
 
				+
			
 
				+pattern_dic_single = {'requirement': requirement_pattern,
			
 
				+               'aptitude': aptitude_pattern,
			
 
				+               'pinmu_name': pinmu_name_pattern}
			
 
				+pattern_dic_addr = {'addr_bidopen': addr_bidopen_pattern,
			
 
				+                    'addr_bidsend': addr_bidsend_pattern}
			
 
				+
			
 
				+def get_kvtree_value(html):
			
 
				+    '''
			
 
				+    通过kv数解析，正则匹配 k 值获取内容
			
 
				+    :param html:
			
 
				+    :return:
			
 
				+    '''
			
 
				+    _pd = Html2KVTree(html)
			
 
				+    kv_single_dic = {} # 单独放在外面的字段
			
 
				+    kv_addr_dic = {} # 放在地址字典的字段
			
 
				+    for k, v in pattern_dic_single.items():
			
 
				+        kv_l = _pd.extract_kv(v)
			
 
				+        value = ''
			
 
				+        for d in kv_l:
			
 
				+            ser = re.search(v, d.get('key', ''))
			
 
				+            if ser and ser.end()/len(d.get('key', ''))<0.5: # 571545382 过滤错误提取，例 供应商要求澄清采购文件的截止时间 提取到 供应商要求 aptitude
			
 
				+                continue
			
 
				+            elif d.get('key', '').strip() == d.get('value', '').strip(): # 修复 571425136 k: aptitude， key: 1．供应商资质, value: 1．供应商资质
			
 
				+                continue
			
 
				+            if d.get('value', '').strip() != '':
			
 
				+                value = d['value'].strip()
			
 
				+                break
			
 
				+        if value != '' and re.search('[\u4e00-\u9fa5]{2,}', value): # 包含两个中文以上的才要
			
 
				+            kv_single_dic[k] = value
			
 
				+    for k, v in pattern_dic_addr.items():
			
 
				+        kv_l = _pd.extract_kv(v)
			
 
				+        value = ''
			
 
				+        for d in kv_l:
			
 
				+            if d.get('value', '').strip() != '':
			
 
				+                value = d['value'].strip()
			
 
				+                if re.search('时间：', value) and re.search('地[点址]：(?P<addr>[\w（）()【】-]{5,50})[，。]', value):
			
 
				+                    value = re.search('地[点址]：(?P<addr>[\w（）()【】-]{5,50})[，。]', value).group('addr')
			
 
				+                break
			
 
				+        if value != '' and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', value): # 包含两个中文以上的才要 避免 571236792 文件获取地点:-- 这种也提取
			
 
				+            kv_addr_dic[k] = value
			
 
				+    return kv_single_dic, kv_addr_dic
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    with open('d:/html/2.html', encoding='utf-8') as f:
			
 
				+        html = f.read()
			
 
				+        rs = get_kvtree_value(html)
			
 
				+        print(rs)
			
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -57,10 +57,11 @@ def extract_sentence_list(sentence_list):
 
				 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
			
 
				                           "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
			
 
				                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([:：，]|$)"
			
 
				-aptitude_pattern = "((资格|资质)[的及]?(要求|条件)|竞买资格及要求|供应商报价须知)([:：，]|$)|(竞买|竞买人|竞投人)?资格(条件)?：|按以下要求参与竞买|(报名|竞买)(条件|资格)"
			
 
				+aptitude_pattern = "资质（资格）要求|资格（资质）要求|单位要求|资质及业绩要求|((资格|资质|准入)[的及]?(要求|条件|标准|限定|门槛)|竞买资格及要求|供应商报价须知)|按以下要求参与竞买|((报名|应征|竞买|投标|竞投|受让|报价|竞价|竞包|竞租|承租|申请|参与|参选|遴选)的?(人|方|单位|企业|客户|机构)?|供应商|受让方)((必?须|需|应[该当]?)(具备|满足|符合|提供)+以?下?)?的?(一般|基本|主要)?(条件|要求|资格(能力)?|资质)+|乙方应当符合下列要求|参与比选条件|合格的投标人|询价要求"
			
 
				+
			
 
				 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[)）]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([:：，]|$)|开启([:：，]|$)"
			
 
				 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([:：，]|$)"
			
 
				-pinmu_name_pattern = "采购品目名称([:：，]|$)"
			
 
				+pinmu_name_pattern = "采购品目(名称)?([:：，]|$)"
			
 
				 out_lines = []
			
 
				 policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
			
 
				 not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
			
@@ -119,7 +120,7 @@ def extract_parameters(parse_document):
 
				                 # outline = re.sub('（?[一二三四五六七八九十\d.]+）?\s*、?', '',
			
 
				                 #                  re.split('[：:，]', _text)[0].replace('(', '（').replace(')', '）'))
			
 
				 
			
 
				-                if re.search(aptitude_pattern,_text[:30]) is not None:
			
 
				+                if re.search(aptitude_pattern,_text[:15]) is not None:
			
 
				                     childs = get_childs([_data])
			
 
				                     for c in childs:
			
 
				                         aptitude_text += c["text"]
			
@@ -181,12 +182,11 @@ def extract_parameters(parse_document):
 
				                 if it not in list_policy:
			
 
				                     list_policy.append(it.group(0))
			
 
				 
			
 
				-    ser = re.search('地[址点][：为](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidopen_text) or re.search('[：，](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidopen_text)
			
 
				-    if ser:
			
 
				-        addr_bidopen_text = ser.group('addr')
			
 
				-    ser = re.search('地[址点][：为](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidsend_text) or re.search('[：，](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidsend_text)
			
 
				-    if ser:
			
 
				-        addr_bidsend_text = ser.group('addr')
			
 
				+    ser = re.search('地[址点][：为](?P<addr>([\w（）()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w（）()【】-]{,60}))[，。]', addr_bidopen_text)
			
 
				+    addr_bidopen_text = ser.group('addr') if ser else ''
			
 
				+
			
 
				+    ser = re.search('地[址点][：为](?P<addr>([\w（）()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w（）()【】-]{,60}))[，。]', addr_bidsend_text)
			
 
				+    addr_bidsend_text = ser.group('addr') if ser else ''
			
 
				     if re.search('开启', addr_bidopen_text) and re.search('时间：\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
			
 
				         addr_bidopen_text = ""
			
 
				     ser = re.search(pinmu_name_pattern, pinmu_name)
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py