6 mesiacov pred · 375d7f8698
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@
 
				 /BiddingKG/dl/product/data/
			
 
				 /BiddingKG/dl/channel/data/
			
 
				 /BiddingKG/dl_dev/test
			
 
				+/BiddingKG/dl_dev/test2
			
 
				 /BiddingKG/dl/test
			
 
				 node_modules
			
 
				 /BiddingKG/dl/table_head/train_data/
			
@@ -16,3 +17,7 @@ node_modules
 
				 /BiddingKG/dl/table_head/checkpoints/
			
 
				 /BiddingKG/dl/table_head/data_new.csv
			
 
				 /BiddingKG/dl/table_head/has_table_no_attach.xlsx
			
 
				+/BiddingKG/dl/LEGAL_ENTERPRISE.txt
			
 
				+/BiddingKG/dl_dev/
			
 
				+BiddingKG.iml
			
 
				+misc.xml
			
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -1009,7 +1009,7 @@ def find_package(content):
 
				             '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
			
 
				             # print('过滤掉错误包：', iter.group())
			
 
				             continue
			
 
				-        elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书',
			
 
				+        elif iter.end() + 2 < len(content) and re.search('标的物|包装|划分|标(书|准|志|记|识|签|贴|帜|本|底|价|量)',
			
 
				                                                          content[iter.start():iter.end() + 2]):
			
 
				             # print('过滤掉错误包：', iter.group())
			
 
				             continue
			
@@ -1122,6 +1122,30 @@ def del_tabel_achievement(soup):
 
				             del_tag = tr.extract()
			
 
				             # print('删除表格业绩内容', del_tag.text)
			
 
				 
			
 
				+def is_all_winner(title):
			
 
				+    '''
			
 
				+    是否提取所有投标人作为中标人，存管类不分排名都作中标人；入围类按排名，无排名都做中标人
			
 
				+    :param title: 标题
			
 
				+    :return:
			
 
				+    '''
			
 
				+    if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
			
 
				+        return 1
			
 
				+    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商', title):
			
 
				+        return 2
			
 
				+    return False
			
 
				+
			
 
				+def is_deposit_project(title, name, requirement):
			
 
				+    '''
			
 
				+    通过正则判断项目是否为银行存款类项目
			
 
				+    :param title: 标题
			
 
				+    :param name: 项目名称
			
 
				+    :param requirement: 采购内容
			
 
				+    :return:
			
 
				+    '''
			
 
				+    if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)(（.{2,10}）)?存放|存放银行|存款(服务|业务|项目)|国库现金管理|存款账户开户|(管理|存款|合作)(定点|专户)?银行|贷款合作银行|资金监管账户|开户银行项目|专户开户银行|银行专户选择|定期存[款放]|专项债券?专用账户', title+name+requirement):
			
 
				+        return True
			
 
				+    return False
			
 
				+
			
 
				 def recall(y_true, y_pred):
			
 
				     '''
			
 
				     计算召回率
			
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -179,7 +179,7 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
 
				                     if have_bus:
			
 
				                         lb, prob = get_role(dic)
			
 
				                         bus_dic[_entity.entity_text] = (lb, prob)
			
 
				-                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
			
 
				+                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
			
 
				                             bus_tenderee.append(_entity)
			
 
				                     elif re.search('^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text): # 2024/05/22 补充某些支行没收集到工商数据
			
 
				                         have_bus = True
			
@@ -236,6 +236,8 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
 
				                         _ent = long_entity[second_i]
			
 
				                         if _ent.label in [0,1,5]:
			
 
				                             if len(_entity.entity_text)<len(_ent.entity_text) and is_short(_entity.entity_text, _ent.entity_text):  # 简称顺序包含在工商名称内的替换
			
 
				+                                if _entity.entity_text.endswith('大学'): # 修复 533357339 东北大学 替换为 中国银行沈阳东北大学支行
			
 
				+                                    continue
			
 
				                                 _entity.entity_text = _ent.entity_text
			
 
				                                 lb, prob = bus_dic[_entity.entity_text]
			
 
				                                 if lb in [0, 1] and prob > 0.9 and _entity.values[
			
--- a/BiddingKG/dl/interface/Entitys.py
+++ b/BiddingKG/dl/interface/Entitys.py
@@ -342,7 +342,7 @@ class Role():
 
				         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
			
 
				                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
			
 
				                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}
			
 
				-        if result['role_name'] == 'tenderee':
			
 
				+        if result['role_name'] in ['tenderee', 'win_tenderer']:
			
 
				             result['role_prob'] = self.role_prob
			
 
				         if result['role_name'] == 'win_tenderer' and self.multi_winner != set():
			
 
				             self.multi_winner.add(result['role_text'])
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -8,7 +8,7 @@ import time
 
				 import codecs
			
 
				 
			
 
				 from BiddingKG.dl.ratio.re_ratio import extract_ratio
			
 
				-from BiddingKG.dl.table_head.predict import predict
			
 
				+from BiddingKG.dl.table_head.predict_torch import predict
			
 
				 
			
 
				 sys.setrecursionlimit(1000000)
			
 
				 sys.path.append(os.path.abspath("../.."))
			
@@ -708,6 +708,8 @@ def tableToText(soup, docid=None):
 
				         for i in range(len(inner_table)):
			
 
				             for j in range(len(inner_table[i])):
			
 
				                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
			
 
				+                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式'] and predict_list[i][j]!=1:
			
 
				+                    inner_table[i][j] = [origin_inner_table[i][j][0], 1]
			
 
				 
			
 
				         if show:
			
 
				             print(inner_table)
			
@@ -1402,10 +1404,25 @@ def tableToText(soup, docid=None):
 
				                 in_attachment = True
			
 
				     #逆序处理嵌套表格
			
 
				     # print('len(tbodies)1', len(tbodies))
			
 
				-    for tbody_index in range(1,len(tbodies)+1):
			
 
				+    # for tbody_index in range(1,len(tbodies)+1):
			
 
				+    tbody_index = 1
			
 
				+    while tbody_index < len(tbodies)+1:
			
 
				         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
			
 
				+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
			
 
				+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
			
 
				+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况；例：526321576
			
 
				+            for row in tbody.find_all('tr'):
			
 
				+                if tbodies[tbody_index - 1][0].tbody:
			
 
				+                    tbodies[tbody_index - 1][0].tbody.append(row)
			
 
				+                else:
			
 
				+                    tbodies[tbody_index - 1][0].append(row)
			
 
				+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
			
 
				+            list_innerTable.append(inner_table)
			
 
				+            tbody_index += 2
			
 
				+            continue
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
 
				         list_innerTable.append(inner_table)
			
 
				+        tbody_index += 1
			
 
				 
			
 
				     # tbodies = soup.find_all('tbody')
			
 
				     # 遍历表格中的每个tbody
			
@@ -1418,11 +1435,25 @@ def tableToText(soup, docid=None):
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				     #逆序处理嵌套表格
			
 
				-    # print('len(tbodies)2', len(tbodies))
			
 
				-    for tbody_index in range(1,len(tbodies)+1):
			
 
				+    tbody_index = 1
			
 
				+    # for tbody_index in range(1,len(tbodies)+1):
			
 
				+    while tbody_index < len(tbodies) + 1:
			
 
				         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
			
 
				+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
			
 
				+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
			
 
				+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况；例：526321576
			
 
				+            for row in tbody.find_all('tr'):
			
 
				+                if tbodies[tbody_index - 1][0].tbody:
			
 
				+                    tbodies[tbody_index - 1][0].tbody.append(row)
			
 
				+                else:
			
 
				+                    tbodies[tbody_index - 1][0].append(row)
			
 
				+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
			
 
				+            list_innerTable.append(inner_table)
			
 
				+            tbody_index += 2
			
 
				+            continue
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
 
				         list_innerTable.append(inner_table)
			
 
				+        tbody_index += 1
			
 
				 
			
 
				     return soup
			
 
				     # return list_innerTable
			
@@ -2085,8 +2116,8 @@ def segment(soup,final=True):
 
				             child.insert_after("。")
			
 
				         if child.name in commaList:
			
 
				             child.insert_after("，")
			
 
				-            if child.name != "td" and re.match('[（(][一二三四五六七八九十]+[)）]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
			
 
				-                child.insert_before("。")
			
 
				+            # if child.name != "td" and re.match('[（(][一二三四五六七八九十]+[)）]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割 20240930 注销，修复529501491关键词 三、中标供应商：（一）单位名称被分句
			
 
				+            #     child.insert_before("。")
			
 
				         # if child.name == 'div' and 'class' in child.attrs:
			
 
				         #     # 添加附件"attachment"标识
			
 
				         #     if "richTextFetch" in child['class']:
			
@@ -2919,6 +2950,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = segment(article_processed)
			
 
				 
			
 
				         article_processed = article_processed.replace('(', '（').replace(')', '）')  #2022/8/10 统一为中文括号
			
 
				+        article_processed = article_processed.replace('侯选人', '候选人')  #2024/09/03 修复错别字避免预测错误。
			
 
				+        article_processed = article_processed.replace('人选人', '入选人')  #2024/09/03 修复错别字避免预测错误。
			
 
				         # article_processed = article_processed.replace(':', '：')  #2023/1/5 统一为中文冒号
			
 
				         article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", "：", article_processed)
			
 
				         article_processed = article_processed.replace('．','.').replace('－', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
			
@@ -2929,7 +2962,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
			
 
				         article_processed = re.sub('(招标|采购)人(概况|信息)：?[，。]', '采购人信息：', article_processed)  # 2022/8/10统一表达
			
 
				         article_processed = article_processed.replace('\（%）', '')    # 中标（成交）金额（元）\（%）：498888.00， 处理 江西省政府采购网  金额特殊问题
			
 
				-        article_processed = re.sub('金额：?（(可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元（）\d]{8,20})）：?', '金额：', article_processed)    # 中标（成交）金额：（可填写下浮率、折扣率或费率）：29.3万元  金额特殊问题
			
 
				+        article_processed = re.sub('金额：?（(可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元（）\d]{8,20}）)）?：?', '金额：', article_processed)    # 中标（成交）金额：（可填写下浮率、折扣率或费率）：29.3万元  金额特殊问题 例：530377517
			
 
				         article_processed = re.sub('（不?含(可抵扣增值|\w{,8})税）', '', article_processed)    # 120637247 投标报价（元），（含可抵扣增值税）：277,560.00。
			
 
				         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式：名称)', '供应商名称', article_processed)  # 18889217, 84422177
			
 
				         article_processed = re.sub('，最高有效报价者：', '，中标人名称：', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
			
@@ -2937,6 +2970,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
			
 
				         article_processed = re.sub('例：建设银行（甲方全称）', ' ', article_processed)  # 2024/06/12 特殊表达修改 修改 481513912 金采网 附件模板导致错误提取招标人
			
 
				         article_processed = re.sub('^[,，.。；;、]+', '', article_processed)
			
 
				+        article_processed = re.sub('资，金', '资金', article_processed)
			
 
				+        article_processed = re.sub('金，额', '金额', article_processed)
			
 
				+        article_processed = re.sub('存，款', '存款', article_processed)
			
 
				         if web_source_no.startswith('DX002756-'):
			
 
				             article_processed = re.sub('状态：(进行中|已结束)单位', '，项目单位', article_processed)  # 376225646
			
 
				         if web_source_no.startswith('DX006116-') and re.search('结果公告如下：.{5,50}，单位名称：', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况
			
@@ -2961,6 +2997,11 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				             article_processed = article_processed[:idx]
			
 
				         for it in re.finditer('[一二三四五六七八九十\d]、中标候选人名称，', article_processed): # 修复大纲类标点导致提取不到，例：515521734
			
 
				             article_processed = re.sub(it.group(0), it.group(0)[:-1]+'：', article_processed)
			
 
				+        ser = re.search('项目[编代][码号]/项目名称：(?P<code>(\[审批\])?[\d\-]{10,30})/?(?P<name>[\u4e00-\u9fa5（）]{4,35}[，。])', article_processed) # 优化项目编号名称一起写的情况 spxm-53340116.html
			
 
				+        if ser:
			
 
				+            article_processed = article_processed.replace(ser.group(0), '项目代码：%s，项目名称：%s' % (
			
 
				+            ser.group('code'), ser.group('name')))
			
 
				+        article_processed = re.sub('四舍五入至', '', article_processed) # 修复 533537050 ，中标价（四舍五入至万元）：6468万元
			
 
				 
			
 
				         '''去除业绩内容'''
			
 
				         article_processed = del_achievement(article_processed)
			
@@ -3053,6 +3094,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
				         cost_time[key_preprocess] += time.time()-start_time
			
 
				 
			
 
				         #nlp处理
			
 
				+        outline_list = [] # 20240906 修复下面条件不成立时，后面 list_outlines.append(outline_list) 名称未定义报错
			
 
				         if article_processed is not None and len(article_processed)!=0:
			
 
				             split_patten = "。"
			
 
				             sentences = []
			
@@ -3180,7 +3222,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				     # 使用正则识别金额
			
 
				     entity_type = "money"
			
 
				     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				-                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				+                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				                           "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				     # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
			
@@ -3230,7 +3272,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				                         if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾，应该小数点识别错，直接去掉
			
 
				                             entity_text = entity_text[:-3]
			
 
				                     if k.split("_")[0] == "unit":
			
 
				-                        if v == '万元' or unit == "":  # 处理  预算金额(元)：160万元 这种出现前后单位不一致情况
			
 
				+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元)：160万元  总价（万元）：最终报价：695000.00（元）
			
 
				                             unit = v
			
 
				                     if k.split("_")[0] == "text":
			
 
				                         # print('text_before: ', _match.group(k))
			
@@ -3286,6 +3328,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				                     unit = '万元'
			
 
				                 elif re.search('^，?(价格币种：\w{2,3}，)?价格单位：万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格：39501.094425，价格币种：人民币，价格单位：万元，
			
 
				                     unit = '万元'
			
 
				+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
			
 
				+                    unit = '万元'
			
 
				                 elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
			
 
				                     if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
			
 
				                         unit = '万元'
			
@@ -3320,16 +3364,16 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
				 
			
 
				             entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
			
 
				             # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
			
 
				-            if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额',
			
 
				-                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
			
 
				+            if re.search('总投资|投资总额|总预算|总概算|(投资|招标|资金|存放|操作|融资)规模|批复概算|投资额|总规模|工程造价|总金额',
			
 
				+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额  20241031工程造价作总投资
			
 
				                 # print('总投资金额: ', _match.group(0))
			
 
				                 notes = '总投资'
			
 
				             elif re.search('投资|概算|建安费|其他费用|基本预备费',
			
 
				                            sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
			
 
				                 notes = '投资'
			
 
				-            elif re.search('工程造价',
			
 
				-                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
			
 
				-                notes = '工程造价'
			
 
				+            # elif re.search('工程造价',
			
 
				+            #                sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
			
 
				+            #     notes = '工程造价'
			
 
				             elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
			
 
				                   or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\(（]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\)）]*[:：为]',
			
 
				                                sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
			
@@ -3507,11 +3551,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                     re.search('\d[楼层号]', entity_text)==None: # 2024/06/07 修改错误地址实体为角色
			
 
				                     entity_type = 'org'
			
 
				 
			
 
				-                if entity_text.startswith('石山县'): # 2024/04/24 修复实体识别积石山县 识别少字问题
			
 
				-                    entity_text = '积' + entity_text
			
 
				-                    if 0<=begin_index_temp-1<len(sentence_text) and sentence_text[begin_index_temp-1] == '积':
			
 
				-                        begin_index_temp -= 1
			
 
				-                        ner_entity = (begin_index_temp, end_index_temp, entity_type, entity_text)
			
 
				+                if begin_index_temp>0 and '县' in entity_text and re.match('前郭尔罗斯蒙古族自治县|积石山县', sentence_text[begin_index_temp-1:end_index_temp]): #20240905 修复实体识别少字问题
			
 
				+                    entity_text = sentence_text[begin_index_temp-1] + entity_text
			
 
				+                    begin_index_temp -= 1
			
 
				+                    ner_entity = (begin_index_temp, end_index_temp, entity_type, entity_text)
			
 
				                 elif entity_text == '中华人民共和国' and re.search('^\w{2,4}海关', sentence_text[end_index_temp: end_index_temp+6]):  # 2024/04/24 修复 采购单位：中华人民共和国汕尾海关， 识别不到海关
			
 
				                     ser = re.search('^\w{2,4}海关', sentence_text[end_index_temp: end_index_temp+6])
			
 
				                     entity_text += ser.group(0)
			
@@ -4037,11 +4080,17 @@ if __name__=="__main__":
 
				     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
			
 
				     # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
			
 
				     # getPredictTable()
			
 
				-    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
			
 
				-        sourceContent = f.read()
			
 
				-        # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
			
 
				-        # print(article_processed)
			
 
				 
			
 
				-        list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
			
 
				-        for entity in list_entitys[0]:
			
 
				-            print(entity.entity_type, entity.entity_text)
			
 
				+    text = '是否拟中标人：是，评标排名：1，价格类型：（万元）报价：13311.1582，得分：84.46，项目负责人：邓焱文'
			
 
				+    text = '，采购包1：采购包预算金额（元：1,500000.00，采购包最高限价（元：1,430600.00，'
			
 
				+    text = '成交人：中坤电力有限公司，成交价格：11493,603.52元，质量：合格，项目工期：117天，'
			
 
				+    # text = '数量及单位1：65台，单价2：800，投标报价3=1×2：52000。'
			
 
				+    print(get_money_entity(text, found_yeji=0))
			
 
				+    # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
			
 
				+    #     sourceContent = f.read()
			
 
				+    #     # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
			
 
				+    #     # print(article_processed)
			
 
				+    #
			
 
				+    #     list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
			
 
				+    #     for entity in list_entitys[0]:
			
 
				+    #         print(entity.entity_type, entity.entity_text)
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -464,6 +464,12 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				     win_tenderer_set = set() # 记录所有预测为中标的实体集合
			
 
				   # print(PackageList)
			
 
				     #拿到各个实体的packageName,packageCode
			
 
				+    main_contain_winner = False # 2024/10/11 判断正文是否包含中标人
			
 
				+    for entity in list_entity:
			
 
				+        if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False:
			
 
				+            main_contain_winner = True
			
 
				+            break
			
 
				+
			
 
				     for entity in list_entity:
			
 
				         if entity.entity_type in ['org','company']:
			
 
				             #限制附件里角色values[label]最大概率prob
			
@@ -477,6 +483,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				             values = entity.values
			
 
				             role_prob = float(values[int(entity.label)])
			
 
				             if role_prob>=on_value and str(entity.label)!="5":
			
 
				+                if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人，不再提取附件中标人 避免 例：504046747 附件角色OCR错字变两个标段
			
 
				+                    continue
			
 
				                 if str(entity.label) in ["0","1"]:
			
 
				                     packageName = "Project"
			
 
				                 else:
			
@@ -583,7 +591,7 @@ def getPackageScopePattern():
 
				     for item in df["list_word"]:
			
 
				         item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
			
 
				         pattern += item+"|"
			
 
				-    pattern = pattern[:-1]+")[:：是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
			
 
				+    pattern = pattern[:-1]+")[:：是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##"
			
 
				     return pattern
			
 
				         
			
 
				 pattern_packageScope = getPackageScopePattern()   
			
@@ -824,11 +832,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
 
				                         scope_begin = [PackageList_scope[j]["sentence_index"],
			
 
				                                        PackageList_scope[j]["offsetWords_begin"]]
			
 
				                     else:
			
 
				-                        if j == 0:
			
 
				-                            scope_begin = [0, 0]
			
 
				-                        else:
			
 
				-                            scope_begin = [PackageList_scope[j - 1]["sentence_index"],
			
 
				-                                           PackageList_scope[j - 1]["offsetWords_begin"]]
			
 
				+                        scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头
			
 
				+                        # if j == 0:
			
 
				+                        #     scope_begin = [0, 0]
			
 
				+                        # else:
			
 
				+                        #     scope_begin = [PackageList_scope[j - 1]["sentence_index"],
			
 
				+                        #                    PackageList_scope[j - 1]["offsetWords_begin"]]
			
 
				+
			
 
				                     if j == len(PackageList_scope) - 1:
			
 
				                         scope_end = [list_sentence[-1].sentence_index,
			
 
				                                      changeIndexFromWordToWords(list_sentence[-1].tokens,
			
@@ -943,7 +953,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                 packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
			
 
				     def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
			
 
				         for i in range(len(packDict[packageName]["roleList"])):
			
 
				-            if packDict[packageName]["roleList"][i].entity_text==entity:
			
 
				+            if packDict[packageName]["roleList"][i].entity_text==entity and not packDict[packageName]["roleList"][i].serviceTime:
			
 
				                 # packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
			
 
				                 packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"")
			
 
				 
			
@@ -1591,7 +1601,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
			
 
				                             continue
			
 
				                         # 角色为中标候选人，排除"质疑|投诉|监督|受理"相关的联系人
			
 
				-                        if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
			
 
				+                        if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
			
 
				+                            continue
			
 
				+                        # 角色为招标/代理人，排除"纪检|监察"相关的联系人
			
 
				+                        if _subject.label in [0,1] and re.search("纪检|监察",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
			
 
				                             continue
			
 
				                         if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
			
 
				                             if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
			
@@ -2024,7 +2037,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                                 if entity.label in [2, 3, 4] and distance>=20:
			
 
				                                     break
			
 
				                                 # 角色为中标候选人，排除"质疑|投诉|监督|受理"相关的联系人
			
 
				-                                if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
			
 
				+                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
			
 
				+                                    break
			
 
				+                                # 角色为招标/代理人，排除"纪检|监察"相关的联系人
			
 
				+                                if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
			
 
				                                     break
			
 
				                                 if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
			
 
				                                     if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
			
@@ -2109,7 +2125,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                                             new_split_list[split_index][1]:
			
 
				                                         mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace("，", "")
			
 
				                                         if re.search(key_phone, mid_sentence):
			
 
				-                                            if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
			
 
				+                                            if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
			
 
				                                                 pass
			
 
				                                             else:
			
 
				                                                 distance = 1
			
@@ -2162,7 +2178,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                                         p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
			
 
				                                         if next_entity.entity_type == 'person' and _phone in p_phone:
			
 
				                                             pass
			
 
				-                                        elif entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
			
 
				+                                        elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
			
 
				                                             pass
			
 
				                                         else:
			
 
				                                             distance = (tokens_num_dict[
			
@@ -2913,6 +2929,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                             if get_tenderee_contacts:
			
 
				                                 break
			
 
				 
			
 
				+    # 如果同一个电话连到了不同的单位就直接去掉(2024-09-03 新增)
			
 
				+    get_phone_dict = dict()
			
 
				+    for k in PackDict.keys():
			
 
				+        for i in range(len(PackDict[k]["roleList"])):
			
 
				+            for item in PackDict[k]["roleList"][i].linklist:
			
 
				+                if item[1]:
			
 
				+                    if item[1] not in get_phone_dict:
			
 
				+                        get_phone_dict[item[1]] = set()
			
 
				+                    get_phone_dict[item[1]].add(PackDict[k]["roleList"][i].entity_text)
			
 
				+    # print(get_phone_dict)
			
 
				+    remove_phone = []
			
 
				+    for phone,role_list in get_phone_dict.items():
			
 
				+        if len(role_list)>1:
			
 
				+            remove_phone.append(phone)
			
 
				+    for k in PackDict.keys():
			
 
				+        for i in range(len(PackDict[k]["roleList"])):
			
 
				+            remove_list = []
			
 
				+            for item in PackDict[k]["roleList"][i].linklist:
			
 
				+                if item[1] and item[1] in remove_phone:
			
 
				+                    remove_list.append(item)
			
 
				+            for _item in remove_list:
			
 
				+                PackDict[k]["roleList"][i].linklist.remove(_item)
			
 
				+
			
 
				     for pack in PackDict.keys():
			
 
				         for i in range(len(PackDict[pack]["roleList"])):
			
 
				             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
			
@@ -4223,9 +4262,9 @@ def limit_maximum_amount(dic, list_entity):
 
				                     if l["role_money"]['money_unit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
			
 
				                         # print('单位元小金额且格式类似万元的乘以万倍')
			
 
				                         l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
			
 
				-                    else:
			
 
				-                        # print('中标金额小于限额：%d元 去除' % minximum_amount)
			
 
				-                        l["role_money"]['money'] = 0
			
 
				+                    # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例：520248605
			
 
				+                    #     # print('中标金额小于限额：%d元 去除' % minximum_amount)
			
 
				+                    #     l["role_money"]['money'] = 0
			
 
				 
			
 
				             if float(value['tendereeMoney']) > maximum_amount:
			
 
				                 flag = 1
			
@@ -4246,9 +4285,9 @@ def limit_maximum_amount(dic, list_entity):
 
				                 if value['tendereeMoneyUnit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
			
 
				                     # print('单位元小金额且格式类似万元的乘以万倍')
			
 
				                     value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
			
 
				-                else:
			
 
				-                    # print('招标金额小于限额：%d元 去除' % minximum_amount)
			
 
				-                    value['tendereeMoney'] = 0
			
 
				+                # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例：520248605
			
 
				+                #     # print('招标金额小于限额：%d元 去除' % minximum_amount)
			
 
				+                #     value['tendereeMoney'] = 0
			
 
				 
			
 
				 
			
 
				 def limit_maximum_amount_backup(prem, industry):
			
@@ -4296,69 +4335,66 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
 
				     :return:
			
 
				     '''
			
 
				     try:
			
 
				-        if 'win_tenderer' in str(prem) and re.search('联合(体|方|投标人)：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|（联合(体|投标人)）|（联合体(成员|单位)方?[12345一二三四五]?）|（(联合体)?成员单位[12345一二三四五]?）|（特殊普通合伙|成员?）|[，；]成：|（成[），]|与[^，。]{6,100}联合体', list_articles[0].content):
			
 
				+        if 'win_tenderer' in str(prem[0]['prem']) and re.search('联合(体|方|投标人)：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|（联合(体|投标人)）|（联合体(成员|单位)方?[12345一二三四五]?）|（(联合体)?成员单位[12345一二三四五]?）|（特殊普通合伙|成员?）|[，；]成：|（成[），]|与[^，。]{6,100}联合体', list_articles[0].content):
			
 
				             sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
			
 
				-            for project in prem[0].values():
			
 
				-                if not isinstance(project, dict):
			
 
				-                    continue
			
 
				-                for v in project.values():
			
 
				-                    for d in v['roleList']:
			
 
				-                        if d.get('role_name', '') == 'win_tenderer':
			
 
				-                            winner = d.get('role_text')
			
 
				-                            join_l = [winner]
			
 
				-                            for list_entity in list_entitys:
			
 
				-                                for i in range(len(list_entity)-1):
			
 
				-                                    _entity = list_entity[i]
			
 
				-                                    b = _entity.wordOffset_begin
			
 
				-                                    e = _entity.wordOffset_end
			
 
				-                                    if _entity.entity_type in ['org', 'company'] and _entity.label==2\
			
 
				-                                            and _entity.entity_text==winner:
			
 
				-                                        s = sentences[_entity.sentence_index].sentence_text
			
 
				-                                        find_joint = 0 # 是否包含联合体
			
 
				-                                        for j in range(i+1, len(list_entity)):
			
 
				-                                            behind_entity = list_entity[j]
			
 
				-                                            b2 = behind_entity.wordOffset_begin
			
 
				-                                            e2 = behind_entity.wordOffset_end
			
 
				-                                            if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
			
 
				-                                                    and b2-e<13 and re.search('联合(体|方|投标人)：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|[，；]成：|（成）$', s[e:b2]) or \
			
 
				-                                                re.search('（联合(体|方|投标人)）|（联合体(成员|单位)方?[12345一二三四五]?）|（(联合体)?成员单位[12345一二三四五]?）|（特殊普通合伙|成员?）|^（成[），]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
			
 
				-                                                join_l.append(behind_entity.entity_text)
			
 
				-                                                b = b2
			
 
				-                                                e = e2
			
 
				-                                                find_joint = 1
			
 
				-                                            elif (find_joint or re.search('与[^，。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与','；','、','&','，','/','//'] and (len(s)==e2 or s[e2] in ['；','、','&','，','/','//', '。'] or s[e2:e2+3]=='联合体'):
			
 
				-                                                join_l.append(behind_entity.entity_text)
			
 
				-                                                b = b2
			
 
				-                                                e = e2
			
 
				-                                            elif e == e2: # 修复重复实体导致中断情况
			
 
				-                                                continue
			
 
				-                                            else:
			
 
				-                                                break
			
 
				-                                        if len(join_l)>1:
			
 
				-                                            d['win_tenderer_joint'] = ','.join(set(join_l))
			
 
				-
			
 
				-
			
 
				-
			
 
				-                                            # behind_entity = list_entity[i + 1]
			
 
				-                                    # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
			
 
				-                                    #         and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
			
 
				-                                    #     s = sentences[_entity.sentence_index].sentence_text
			
 
				-                                    #     b = _entity.wordOffset_begin
			
 
				-                                    #     e = _entity.wordOffset_end
			
 
				-                                    #     b2 = behind_entity.wordOffset_begin
			
 
				-                                    #     e2 = behind_entity.wordOffset_end
			
 
				-                                        # if re.search('（联合体）', s[e2:e2+6]) and b2-e<3:
			
 
				-                                        #     print('联合体：', s[max(0, b-10):e2+10])
			
 
				-                                        #     d['win_tenderer_joint'] = '%s，%s'%(_entity.entity_text, behind_entity.entity_text)
			
 
				-                                        #     break
			
 
				-                                        # elif re.search('（联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位)）|(联合体)?成员：|特殊普通合伙：', s[e:b2]) and b2-e<10:
			
 
				-                                        #     d['win_tenderer_joint'] = '%s，%s' % (_entity.entity_text, behind_entity.entity_text)
			
 
				-                                        #     print('联合体：', s[max(0, b - 10):e2 + 10])
			
 
				-                                        #     break
			
 
				+            for v in prem[0]['prem'].values():
			
 
				+                for d in v['roleList']:
			
 
				+                    if d.get('role_name', '') == 'win_tenderer':
			
 
				+                        winner = d.get('role_text')
			
 
				+                        join_l = [winner]
			
 
				+                        for list_entity in list_entitys:
			
 
				+                            for i in range(len(list_entity)-1):
			
 
				+                                _entity = list_entity[i]
			
 
				+                                b = _entity.wordOffset_begin
			
 
				+                                e = _entity.wordOffset_end
			
 
				+                                if _entity.entity_type in ['org', 'company'] and _entity.label==2\
			
 
				+                                        and _entity.entity_text==winner:
			
 
				+                                    s = sentences[_entity.sentence_index].sentence_text
			
 
				+                                    find_joint = 0 # 是否包含联合体
			
 
				+                                    for j in range(i+1, len(list_entity)):
			
 
				+                                        behind_entity = list_entity[j]
			
 
				+                                        b2 = behind_entity.wordOffset_begin
			
 
				+                                        e2 = behind_entity.wordOffset_end
			
 
				+                                        if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
			
 
				+                                                and b2-e<13 and re.search('联合(体|方|投标人)：|联合体(成员|单位)[12345一二三四五]?：|(联合体)?成员单位[12345一二三四五]?：|特殊普通合伙：|[，；]成：|（成）$', s[e:b2]) or \
			
 
				+                                            re.search('（联合(体|方|投标人)）|（联合体(成员|单位)方?[12345一二三四五]?）|（(联合体)?成员单位[12345一二三四五]?）|（特殊普通合伙|成员?）|^（成[），]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
			
 
				+                                            join_l.append(behind_entity.entity_text)
			
 
				+                                            b = b2
			
 
				+                                            e = e2
			
 
				+                                            find_joint = 1
			
 
				+                                        elif (find_joint or re.search('与[^，。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与','；','、','&','，','/','//'] and (len(s)==e2 or s[e2] in ['；','、','&','，','/','//', '。'] or s[e2:e2+3]=='联合体'):
			
 
				+                                            join_l.append(behind_entity.entity_text)
			
 
				+                                            b = b2
			
 
				+                                            e = e2
			
 
				+                                        elif e == e2: # 修复重复实体导致中断情况
			
 
				+                                            continue
			
 
				+                                        else:
			
 
				+                                            break
			
 
				+                                    if len(join_l)>1:
			
 
				+                                        d['win_tenderer_joint'] = ','.join(set(join_l))
			
 
				+
			
 
				+
			
 
				+
			
 
				+                                        # behind_entity = list_entity[i + 1]
			
 
				+                                # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
			
 
				+                                #         and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
			
 
				+                                #     s = sentences[_entity.sentence_index].sentence_text
			
 
				+                                #     b = _entity.wordOffset_begin
			
 
				+                                #     e = _entity.wordOffset_end
			
 
				+                                #     b2 = behind_entity.wordOffset_begin
			
 
				+                                #     e2 = behind_entity.wordOffset_end
			
 
				+                                    # if re.search('（联合体）', s[e2:e2+6]) and b2-e<3:
			
 
				+                                    #     print('联合体：', s[max(0, b-10):e2+10])
			
 
				+                                    #     d['win_tenderer_joint'] = '%s，%s'%(_entity.entity_text, behind_entity.entity_text)
			
 
				+                                    #     break
			
 
				+                                    # elif re.search('（联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位)）|(联合体)?成员：|特殊普通合伙：', s[e:b2]) and b2-e<10:
			
 
				+                                    #     d['win_tenderer_joint'] = '%s，%s' % (_entity.entity_text, behind_entity.entity_text)
			
 
				+                                    #     print('联合体：', s[max(0, b - 10):e2 + 10])
			
 
				+                                    #     break
			
 
				     except Exception as e:
			
 
				         print('获取联合体抛出异常', e)
			
 
				 
			
 
				-def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
			
 
				+def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
			
 
				     '''
			
 
				     获取多中标人及正文、附件所有金额，多中标人multi_winner写入prem，返回金额列表
			
 
				     :param channel_dic:
			
@@ -4369,7 +4405,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				     '''
			
 
				 
			
 
				     def add_multi_winner(pack_l, winner_l):
			
 
				-        if len(prem[0]) > 1 and len(set([it[0] for it in pack_l])) > 1:  # 多标段多中标人处理
			
 
				+        if len(prem[0]['prem']) > 1 and len(set([it[0] for it in pack_l])) > 1:  # 多标段多中标人处理
			
 
				             pk_dic = {}
			
 
				             for ent in winner_l:
			
 
				                 for i in range(len(pack_l)):
			
@@ -4395,40 +4431,33 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                 multi_winner = multi_winner - tenderee_or_agency
			
 
				                 if len(multi_winner) < 2:
			
 
				                     continue
			
 
				-                for project in prem[0].values():
			
 
				-                    if not isinstance(project, dict):
			
 
				-                        continue
			
 
				-                    for k, v in project.items():
			
 
				-                        if pk == k:
			
 
				-                            for d in v['roleList']:
			
 
				-                                if d.get('role_name', '') == 'win_tenderer':
			
 
				-                                    if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
			
 
				-                                        d['multi_winner'] = ','.join(set(multi_winner))
			
 
				-        else:
			
 
				-            multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
			
 
				-            if len(multi_winner) > 1:
			
 
				-                for project in prem[0].values():
			
 
				-                    if not isinstance(project, dict):
			
 
				-                        continue
			
 
				-                    for v in project.values():
			
 
				+                for k, v in prem[0]['prem'].items():
			
 
				+                    if pk == k:
			
 
				                         for d in v['roleList']:
			
 
				                             if d.get('role_name', '') == 'win_tenderer':
			
 
				                                 if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
			
 
				                                     d['multi_winner'] = ','.join(set(multi_winner))
			
 
				-                                break
			
 
				+        elif 0 < len(prem[0]['prem']) < 3: # 修复 单包多中标人 例：285780273
			
 
				+            multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
			
 
				+            if len(multi_winner) > 1:
			
 
				+                for v in prem[0]['prem'].values():
			
 
				+                    for d in v['roleList']:
			
 
				+                        if d.get('role_name', '') == 'win_tenderer':
			
 
				+                            if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
			
 
				+                                d['multi_winner'] = ','.join(set(multi_winner))
			
 
				+                            break
			
 
				 
			
 
				     moneys = []
			
 
				     moneys_attachment = []
			
 
				-    if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
			
 
				+    if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
			
 
				         sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
			
 
				-        entitys = sorted(list_entitys[0], key=lambda x: x.sentence_index)
			
 
				         finalists = [] # 入围供应商
			
 
				         multi_winner_l = [] # 保存中标人名称列表
			
 
				         tenderee_or_agency = set()
			
 
				         package_l = []
			
 
				         i = 0
			
 
				-        while i < len(entitys)-1:
			
 
				-            ent = entitys[i]
			
 
				+        while i < len(list_entitys[0])-1:
			
 
				+            ent = list_entitys[0][i]
			
 
				             b_idx_fr = ent.wordOffset_begin
			
 
				             e_idx_fr = ent.wordOffset_end
			
 
				             i += 1
			
@@ -4440,19 +4469,18 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                     moneys.append(money)
			
 
				             elif ent.entity_type in ['package']:
			
 
				                 package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
			
 
				-            elif ent.entity_type in ['org', 'company'] and ent.label in [0,1] and ent.values[ent.label] > 0.8:
			
 
				-                tenderee_or_agency.add(ent.entity_text)
			
 
				-            elif ent.entity_type in ['org', 'company'] and ent.label == 2:
			
 
				+            elif ent.entity_type in ['org', 'company']:
			
 
				                 sentence_text = sentences[ent.sentence_index].sentence_text
			
 
				                 pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
			
 
				-                if ent.values[ent.label] > 0.8:
			
 
				+                if ent.label in [0,1] and ent.values[ent.label] > 0.8:
			
 
				+                    tenderee_or_agency.add(ent.entity_text)
			
 
				+                elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
			
 
				                     multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
			
 
				                     for j in range(i, len(list_entitys[0])):
			
 
				                         ent_bh = list_entitys[0][j]
			
 
				                         b_idx_bh = ent_bh.wordOffset_begin
			
 
				                         e_idx_bh = ent_bh.wordOffset_end
			
 
				-                        if ent_bh.entity_type in ['org','company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
			
 
				-                            sentence_text = sentences[ent_bh.sentence_index].sentence_text
			
 
				+                        if ent_bh.entity_type in ['org','company'] and ent_bh.label in [2,5] and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
			
 
				                             if sentence_text[e_idx_fr:b_idx_bh] in ['；', '、', '&', '，', '/', '//'] and (
			
 
				                                     len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in ['；', '、', '&', '，','/', '//','。']):  # 修复多中标人刚好在文末index超出报错，例子 407126558
			
 
				                                 multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
			
@@ -4460,7 +4488,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                                 i = j + 1
			
 
				                             else:
			
 
				                                 break
			
 
				-                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
			
 
				+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况
			
 
				                             multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
			
 
				                             e_idx_fr = e_idx_bh
			
 
				                             i = j + 1
			
@@ -4470,6 +4498,8 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
 
				                             break
			
 
				                     if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
			
 
				                         finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
			
 
				+                elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
			
 
				+                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
			
 
				 
			
 
				         if len(multi_winner_l)>=2:
			
 
				             winner_main = [it for it in multi_winner_l if not it[3]]
			
@@ -4532,6 +4562,10 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				             k = list(old_prem.keys()-set(['Project']))[0]
			
 
				             k_new = list(new_prem.keys())[0]
			
 
				             new_prem[k] = new_prem.pop(k_new)
			
 
				+        elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set(): # 如果表格提取包与非表格提取都是一个包且不同，把表格提取包名替换为非表格包名
			
 
				+            k = list(old_prem.keys()-set(['Project']))[0]
			
 
				+            k_new = list(new_prem.keys())[0]
			
 
				+            new_prem[k] = new_prem.pop(k_new)
			
 
				 
			
 
				         if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号，非表格没提取到，合并到Project
			
 
				             k = list(new_prem.keys())[0]
			
@@ -4552,6 +4586,8 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				                                 tmp_l.append(d2)
			
 
				                                 if d2['role_text'] != "":
			
 
				                                     d['role_text'] = d2['role_text']
			
 
				+                                if d2['serviceTime'] != "":
			
 
				+                                    d['serviceTime'] = d2['serviceTime']
			
 
				                                 if float(d2['role_money']['money']) != 0:  # 如果表格提取的金额不为0才替换
			
 
				                                     d['role_money']['money'] = d2['role_money']['money']
			
 
				                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
			
@@ -4585,12 +4621,14 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				                                 tmp_l.append(d2)
			
 
				                                 if d2['role_text'] != "":
			
 
				                                     d['role_text'] = d2['role_text']
			
 
				+                                if d2['serviceTime'] != "":
			
 
				+                                    d['serviceTime'] = d2['serviceTime']
			
 
				                                 if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
			
 
				                                     d['role_money']['money'] = d2['role_money']['money']
			
 
				                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
			
 
				-                                for k in set(d2)-set(d): # 把表格提取加的属性补充过来，比如：multi_winner other_winner_dic等
			
 
				-                                    if d2[k]:
			
 
				-                                        d[k] = d2[k]
			
 
				+                                for k2 in set(d2)-set(d): # 把表格提取加的属性补充过来，比如：multi_winner other_winner_dic等
			
 
				+                                    if d2[k2]:
			
 
				+                                        d[k2] = d2[k2]
			
 
				                     for d2 in v['roleList']:
			
 
				                         if d2 not in tmp_l: # 把新预测有，旧没有的角色添加上去
			
 
				                             old_prem[k]['roleList'].append(d2)
			
@@ -4601,7 +4639,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				 
			
 
				     # return old_prem
			
 
				 
			
 
				-def  confirm_prem(prem, channel_dic):
			
 
				+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
			
 
				     '''
			
 
				     规则检查纠正prem，如果Project包中标人在其他包中标人，去掉project包中标角色；如果有其他包中标人，去掉roleList为空的包；
			
 
				     :param prem: prem 字段字典
			
@@ -4610,6 +4648,8 @@ def  confirm_prem(prem, channel_dic):
 
				     if len(prem) > 1:  # 表格提取到中标人的，去掉project包中标人
			
 
				         pro_winner = set()
			
 
				         other_winner = set()
			
 
				+        other_winner_prob = 0
			
 
				+        pro_winner_prob = 0
			
 
				         empty_roleList = []
			
 
				         for k in prem:
			
 
				             prem[k]['uuid'] = str(uuid.uuid4()) # 20240627 每个包都添加uuid
			
@@ -4623,21 +4663,33 @@ def  confirm_prem(prem, channel_dic):
 
				                             pro_winner.update(set(d['win_tenderer_joint'].split(',')))
			
 
				                         if 'multi_winner' in d:
			
 
				                             pro_winner.update(set(d['multi_winner'].split(',')))
			
 
				+                        if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
			
 
				+                            pro_winner_prob = d.get('role_prob', 0)
			
 
				                     else:
			
 
				                         other_winner.add(d['role_text'])
			
 
				                         if 'win_tenderer_joint' in d:
			
 
				                             other_winner.update(set(d['win_tenderer_joint'].split(',')))
			
 
				                         if 'multi_winner' in d:
			
 
				                             other_winner.update(set(d['multi_winner'].split(',')))
			
 
				-        if pro_winner & other_winner != set():
			
 
				+                        if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
			
 
				+                            other_winner_prob = d.get('role_prob', 0)
			
 
				+        if pro_winner!=set() and (pro_winner & other_winner != set() or other_winner_prob>pro_winner_prob): # 如果默认包与其他包中标人重复或其他包中标人概率比默认包大，删除默认包中标人
			
 
				             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
			
 
				                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',
			
 
				                                                                       'third_tenderer']]
			
 
				+        elif other_winner_prob<pro_winner_prob and len(prem)==2: # 两个包情况，如果默认包中标人概率比其他包大，删除其他包
			
 
				+            rm_k = [k for k in prem if k != 'Project']
			
 
				+            for k in rm_k:
			
 
				+                prem.pop(k)
			
 
				         if other_winner and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
			
 
				             for k in empty_roleList:
			
 
				                 prem.pop(k)
			
 
				     elif "Project" in prem:
			
 
				         prem['Project']['uuid'] = str(uuid.uuid4())
			
 
				+    if is_deposit_project and float(total_tendereeMoney)!=0 and len(prem)==1: #20241107 存款类项目有总投资没招标金额且只有一个标段，把总投资作招标金额
			
 
				+        for k in prem:
			
 
				+            if float(prem[k]['tendereeMoney'])==0:
			
 
				+                prem[k]['tendereeMoney'] = total_tendereeMoney
			
 
				 
			
 
				 
			
 
				 def fix_single_source(prem, channel_dic, original_docchannel):
			
--- a/BiddingKG/dl/interface/header_set.pkl
+++ b/BiddingKG/dl/interface/header_set.pkl
--- a/BiddingKG/dl/interface/htmlparser.py
+++ b/BiddingKG/dl/interface/htmlparser.py
@@ -534,16 +534,17 @@ class ParseDocument():
 
				                                 has_product = True
			
 
				                                 break
			
 
				 
			
 
				-            if _type=="sentence":
			
 
				-                if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
			
 
				-                    list_data[-1]["text"] += _text
			
 
				-                    list_data[-1]["line_width"] = len(_text)
			
 
				-                    _append = True
			
 
				-                elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
			
 
				-                    if list_data[-1]["line_width"]>=max_length*0.7:
			
 
				-                        list_data[-1]["text"] += _text
			
 
				-                        list_data[-1]["line_width"] = len(_text)
			
 
				-                        _append = True
			
 
				+            # 合并两个非标题句子 20241106 注销，由于 485441521 招标内容结束位置不对
			
 
				+            # if _type=="sentence":
			
 
				+            #     if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
			
 
				+            #         list_data[-1]["text"] += _text
			
 
				+            #         list_data[-1]["line_width"] = len(_text)
			
 
				+            #         _append = True
			
 
				+            #     elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
			
 
				+            #         if list_data[-1]["line_width"]>=max_length*0.7:
			
 
				+            #             list_data[-1]["text"] += _text
			
 
				+            #             list_data[-1]["line_width"] = len(_text)
			
 
				+            #             _append = True
			
 
				 
			
 
				             if _type=="table":
			
 
				                 _soup = BeautifulSoup(_text,"lxml")
			
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -104,6 +104,9 @@ class Model_role_classify_word():
 
				             text = re.sub('(最终)?排名：', '    ', text)
			
 
				         text = re.sub('交易单位', '发布单位', text)
			
 
				         text = re.sub('[，：]各种数据：', '：', text) # 20240620优化 478331984 山东省交通运输厅站源提取不到 各种数据：中标单位，各种数据：济南金曰公路工程有限公司，
			
 
				+        text = re.sub('电子签章', '', text) # 20240924 修复 529923459 电子签名：投标人名称（电子签章：西君兰信息科技有限公司，2024年9月7日 预测为中标
			
 
				+        text = re.sub('采购方式', 'xxxx', text) # 修复 499096797 招标人预测错误
			
 
				+        text = re.sub('中标人\d名称', '中标人名称', text) # 修复 499096797 中标人预测错误
			
 
				         return text.replace('(', '（').replace(')', '）').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
			
 
				 
			
 
				     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
			
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -52,7 +52,6 @@ def extract_sentence_list(sentence_list):
 
				                 new_sentence2_list_attach.append(sentence2)
			
 
				             else:
			
 
				                 new_sentence2_list.append(sentence2)
			
 
				-
			
 
				     return new_sentence2_list, new_sentence2_list_attach
			
 
				 
			
 
				 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
			
@@ -63,18 +62,18 @@ addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|
 
				 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([:：，]|$)"
			
 
				 out_lines = []
			
 
				 
			
 
				-def extract_parameters(parse_document, content):
			
 
				+def extract_parameters(parse_document):
			
 
				     '''
			
 
				     通过大纲、预处理后文本正则获取需要字段
			
 
				     :param parse_document: ParseDocument() 方法返回结果
			
 
				-    :param content: 公告预处理后文本
			
 
				     :return:
			
 
				     '''
			
 
				     list_data = parse_document.tree
			
 
				-    requirement_text = ''
			
 
				-    aptitude_text = ''
			
 
				-    addr_bidopen_text = ''
			
 
				-    addr_bidsend_text = ''
			
 
				+    requirement_text = '' # 采购内容
			
 
				+    aptitude_text = '' # 资质要求
			
 
				+    addr_bidopen_text = '' # 开标地址
			
 
				+    addr_bidsend_text = '' # 投标地址
			
 
				+    requirement_scope = [] # 采购内容始末位置
			
 
				 
			
 
				     _find_count = 0
			
 
				     _data_i = -1
			
@@ -86,14 +85,18 @@ def extract_parameters(parse_document, content):
 
				         # print(_data.keys())
			
 
				         if _type=="sentence":
			
 
				             if _data["sentence_title"] is not None:
			
 
				-                if re.search('[（(][一二三四五六七八九十]+[)）]|[一二三四五六七八九十]+\s*、', _text[:10]):
			
 
				+                if re.search('[（(][一二三四五六七八九十}]+[)）]|[一二三四五六七八九十]+\s*、|^\d{1,2}[.、][\u4e00-\u9fa5]', _text[:10]):
			
 
				                     out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
			
 
				 
			
 
				                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求，', _text[:30])==None:
			
 
				+                    b = (_data['sentence_index'], _data['wordOffset_begin'])
			
 
				                     childs = get_childs([_data])
			
 
				                     for c in childs:
			
 
				                         # requirement_text += c["text"]+"\n"
			
 
				                         requirement_text += c["text"]
			
 
				+                    e = (c['sentence_index'], c["wordOffset_end"]) if len(childs)>0 else (_data['sentence_index'], _data['wordOffset_end'])
			
 
				+                    requirement_scope.append(b)
			
 
				+                    requirement_scope.append(e)
			
 
				                     _data_i += len(childs)
			
 
				                     _data_i -= 1
			
 
				     _data_i = -1
			
@@ -161,15 +164,23 @@ def extract_parameters(parse_document, content):
 
				         addr_bidopen_text = addr_bidopen_text[b:e]
			
 
				     elif re.search('开启', addr_bidopen_text) and re.search('时间：\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
			
 
				         addr_bidopen_text = ""
			
 
				-    if addr_bidopen_text == "":
			
 
				-        ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)）?(会议)?地[点址]([(（]网址[)）])?[：为][^，；。]{2,100}[，；。]', content)
			
 
				-        if ser:
			
 
				-            addr_bidopen_text = ser.group(0)
			
 
				     if re.search('时间：', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidsend_text):
			
 
				         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([(（]网址[)）])?：[^，；。]{2,100}[，；。]', addr_bidsend_text):
			
 
				             b, e = ser.span()
			
 
				         addr_bidsend_text = addr_bidsend_text[b:e]
			
 
				-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines
			
 
				+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope
			
 
				+
			
 
				+def extract_addr(content):
			
 
				+    '''
			
 
				+    通过正则提取地址
			
 
				+    :param content:  公告预处理后文本
			
 
				+    :return:
			
 
				+    '''
			
 
				+    addr_bidopen_text = ''
			
 
				+    ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)）?(会议)?地[点址]([(（]网址[)）])?[：为][^，；。]{2,100}[，；。]', content)
			
 
				+    if ser:
			
 
				+        addr_bidopen_text = ser.group(0)
			
 
				+    return addr_bidopen_text
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
--- a/BiddingKG/dl/table_head/model_40_0.951.pth
+++ b/BiddingKG/dl/table_head/model_40_0.951.pth
--- a/BiddingKG/dl/table_head/model_40_0.959.pth
+++ b/BiddingKG/dl/table_head/model_40_0.959.pth
--- a/BiddingKG/dl/table_head/models/model_torch.py
+++ b/BiddingKG/dl/table_head/models/model_torch.py
@@ -0,0 +1,83 @@
 
				+import torch.nn as nn
			
 
				+import torch
			
 
				+
			
 
				+
			
 
				+class TableHeadModel(nn.Module):
			
 
				+    def __init__(self):
			
 
				+        super(TableHeadModel, self).__init__()
			
 
				+        self.char_num = 20
			
 
				+        self.char_embed = 60
			
 
				+        self.char_embed_expand = 128
			
 
				+
			
 
				+        self.dense0 = nn.Linear(self.char_embed, self.char_embed_expand)
			
 
				+
			
 
				+        self.dense3 = nn.Linear(self.char_num * self.char_embed_expand, 64)
			
 
				+        self.dense4 = nn.Linear(64, 1)
			
 
				+
			
 
				+        self.sigmoid = nn.Sigmoid()
			
 
				+
			
 
				+        self.ln_dnn_2 = nn.LayerNorm([64])
			
 
				+
			
 
				+        self.device = torch.device("cpu")
			
 
				+
			
 
				+        self.relu = nn.LeakyReLU()
			
 
				+        self.dropout = nn.Dropout(0.3)
			
 
				+
			
 
				+        self.cnn1d_0 = nn.Conv1d(self.char_embed_expand,
			
 
				+                                 self.char_embed_expand,
			
 
				+                                 (3,), padding=self.get_padding(3))
			
 
				+        self.cnn1d_1 = nn.Conv1d(self.char_embed_expand,
			
 
				+                                 self.char_embed_expand,
			
 
				+                                 (3,), padding=self.get_padding(3))
			
 
				+
			
 
				+        self.cnn3d_0 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
			
 
				+                                 (3, 3, 3), padding=self.get_padding(3))
			
 
				+        self.cnn3d_1 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
			
 
				+                                 (3, 3, 3), padding=self.get_padding(3))
			
 
				+
			
 
				+    def get_padding(self, kernel_size, stride=1):
			
 
				+        return (kernel_size - 1) // 2 * stride
			
 
				+
			
 
				+    def forward(self, x):
			
 
				+        batch, row, col, char_num, char_embed = x.shape
			
 
				+
			
 
				+        # cnn 1d
			
 
				+        cnn1d_x = torch.squeeze(x, 0)
			
 
				+        cnn1d_x = cnn1d_x.view([row*col, char_num, char_embed])
			
 
				+
			
 
				+        cnn1d_x = self.dense0(cnn1d_x)
			
 
				+
			
 
				+        cnn1d_x = torch.permute(cnn1d_x, [0, 2, 1])
			
 
				+        cnn1d_x = self.cnn1d_0(cnn1d_x)
			
 
				+        cnn1d_x = self.relu(cnn1d_x)
			
 
				+        cnn1d_x = self.dropout(cnn1d_x)
			
 
				+        cnn1d_x = self.cnn1d_1(cnn1d_x)
			
 
				+        cnn1d_x = self.relu(cnn1d_x)
			
 
				+        cnn1d_x = self.dropout(cnn1d_x)
			
 
				+
			
 
				+        cnn1d_x = torch.permute(cnn1d_x, [0, 2, 1])
			
 
				+        cnn1d_x = cnn1d_x.contiguous().view(row, col, char_num, self.char_embed_expand)
			
 
				+        cnn1d_x = torch.unsqueeze(cnn1d_x, 0)
			
 
				+        # print(cnn1d_x.shape)
			
 
				+
			
 
				+        # cnn 3d
			
 
				+        cnn3d_x = torch.permute(cnn1d_x, [0, 4, 3, 1, 2])
			
 
				+        cnn3d_x = self.cnn3d_0(cnn3d_x)
			
 
				+        cnn3d_x = self.relu(cnn3d_x)
			
 
				+        cnn3d_x = self.dropout(cnn3d_x)
			
 
				+        cnn3d_x = self.cnn3d_1(cnn3d_x)
			
 
				+        cnn3d_x = self.relu(cnn3d_x)
			
 
				+        cnn3d_x = self.dropout(cnn3d_x)
			
 
				+
			
 
				+        cnn3d_x = torch.squeeze(cnn3d_x, 0)
			
 
				+        cnn3d_x = torch.permute(cnn3d_x, [2, 3, 1, 0])
			
 
				+        cnn3d_x = cnn3d_x.contiguous().view(row, col, char_num * self.char_embed_expand)
			
 
				+
			
 
				+        # dnn
			
 
				+        x = self.dense3(cnn3d_x)
			
 
				+        x = self.ln_dnn_2(x)
			
 
				+        x = self.relu(x)
			
 
				+        x = self.dense4(x)
			
 
				+        x = self.sigmoid(x)
			
 
				+        x = torch.squeeze(x, -1)
			
 
				+        return x
			
--- a/BiddingKG/dl/table_head/pre_process_torch.py
+++ b/BiddingKG/dl/table_head/pre_process_torch.py
@@ -0,0 +1,132 @@
 
				+#coding=utf-8
			
 
				+import os
			
 
				+import sys
			
 
				+import numpy as np
			
 
				+import torch
			
 
				+from torch.utils.data import Dataset
			
 
				+
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
			
 
				+from BiddingKG.dl.common.Utils import embedding_word, embedding_word_forward
			
 
				+
			
 
				+
			
 
				+def set_label(row, row_label):
			
 
				+    if len(row) == 1:
			
 
				+        row_label = [0 for x in row]
			
 
				+    elif len(set(row)) == 1:
			
 
				+        row_label = [0 for x in row]
			
 
				+    else:
			
 
				+        row_label = [0 if x in ["", " ", "/", '无', '-', '~~'] else row_label[i] for i, x in enumerate(row)]
			
 
				+    return row_label
			
 
				+
			
 
				+
			
 
				+def set_same_table_head(inputs, y_pred1):
			
 
				+    inputs = torch.squeeze(inputs, 0)
			
 
				+
			
 
				+    for i in range(inputs.shape[0]):
			
 
				+        for j in range(inputs.shape[1]-1):
			
 
				+            col1 = inputs[i, j, :, :]
			
 
				+            col2 = inputs[i, j+1, :, :]
			
 
				+            if (torch.abs(col1 - col2) < 1e-4).all():
			
 
				+                # print('same value', col1[abs(col1) > 0.], col2[abs(col1) > 0.])
			
 
				+                if (y_pred1[i, j] <= 0.5 and y_pred1[i, j+1] <= 0.5) or (y_pred1[i, j] > 0.5 and y_pred1[i, j+1] > 0.5):
			
 
				+                    continue
			
 
				+                else:
			
 
				+                    # print('differ label', y_pred[i, j], y_pred[i, j+1])
			
 
				+                    y_pred1[i, j+1] = y_pred1[i, j]
			
 
				+
			
 
				+    for i in range(inputs.shape[1]):
			
 
				+        for j in range(inputs.shape[0]-1):
			
 
				+            row1 = inputs[j, i, :, :]
			
 
				+            row2 = inputs[j+1, i, :, :]
			
 
				+            if (torch.abs(row1 - row2) < 1e-4).all():
			
 
				+                if (y_pred1[j, i] <= 0.5 and y_pred1[j+1, i] <= 0.5) or (y_pred1[j, i] > 0.5 and y_pred1[j+1, i] > 0.5):
			
 
				+                    continue
			
 
				+                else:
			
 
				+                    # print('same value', row1[abs(row1) > 0.], row2[abs(row2) > 0.])
			
 
				+                    # print('differ label', y_pred[i, j], y_pred[i, j+1])
			
 
				+                    # print('before', x11[0, j, i], x11[0, j+1, i])
			
 
				+                    y_pred1[j+1, i] = y_pred1[j, i]
			
 
				+                    # print('after', x1[0, j, i],  x1[0, j+1, i])
			
 
				+    return y_pred1
			
 
				+
			
 
				+
			
 
				+def data_to_numpy29(data_list, data_label_list):
			
 
				+    """
			
 
				+    输出表格 (table_cnt, row, col, 20, 60)
			
 
				+
			
 
				+    :param data_list:
			
 
				+    :param data_label_list:
			
 
				+    :return:
			
 
				+    """
			
 
				+    data_num = len(data_list)
			
 
				+
			
 
				+    new_data_list = []
			
 
				+    new_label_list = []
			
 
				+    mask_list = []
			
 
				+    for i in range(len(data_list)):
			
 
				+        table = data_list[i]
			
 
				+        table_label = []
			
 
				+        if data_label_list:
			
 
				+            table_label = data_label_list[i]
			
 
				+        embed_list = []
			
 
				+        label_list = []
			
 
				+        mask = []
			
 
				+        for j in range(len(table)):
			
 
				+            row = table[j]
			
 
				+            blank_list = [0 if x in ["", " ", "/"] else 1 for x in row]
			
 
				+            mask.append(blank_list)
			
 
				+            row = embedding_word_forward(row, shape=(len(row), 20, 60))
			
 
				+            embed_list.append(row)
			
 
				+            if data_label_list:
			
 
				+                row_label = table_label[j]
			
 
				+                # print(j, row_label)
			
 
				+                row_label = [int(x) for x in row_label]
			
 
				+                row_label = set_label(table[j], row_label)
			
 
				+                label_list.append(row_label)
			
 
				+        embed_list = np.array(embed_list, dtype=np.float32)
			
 
				+        label_list = np.array(label_list, dtype=np.float32)
			
 
				+        mask = np.array(mask, dtype=np.float32)
			
 
				+        # print('embed_list.shape', embed_list.shape)
			
 
				+        # print('label_list.shape', label_list.shape)
			
 
				+        new_data_list.append(embed_list)
			
 
				+        new_label_list.append(label_list)
			
 
				+        mask_list.append(mask)
			
 
				+
			
 
				+    new_data_list = np.array(new_data_list, dtype=np.float32)
			
 
				+    new_label_list = np.array(new_label_list, dtype=np.float32)
			
 
				+    mask_list = np.array(mask_list, dtype=np.float32)
			
 
				+    # print(new_data_list.shape)
			
 
				+
			
 
				+    return new_data_list, new_label_list, mask_list
			
 
				+
			
 
				+
			
 
				+class CustomDatasetTiny40(Dataset):
			
 
				+    def __init__(self, data_x, data_y, mode=0):
			
 
				+        if mode in [0, 1]:
			
 
				+            # Split -> Train, Test
			
 
				+            split_size = int(len(data_x)*0.1)
			
 
				+            test_x, test_y = data_x[:split_size], data_y[:split_size]
			
 
				+            train_x, train_y = data_x[split_size:], data_y[split_size:]
			
 
				+
			
 
				+            if mode == 0:
			
 
				+                self.data = train_x
			
 
				+                self.targets = train_y
			
 
				+            else:
			
 
				+                self.data = test_x
			
 
				+                self.targets = test_y
			
 
				+        else:
			
 
				+            pass
			
 
				+
			
 
				+        # self.data = data
			
 
				+        # self.targets = targets
			
 
				+
			
 
				+    def __len__(self):
			
 
				+        return len(self.data)
			
 
				+
			
 
				+    def __getitem__(self, idx):
			
 
				+        # x, y = data_to_numpy12([self.data[idx]], [self.targets[idx]])
			
 
				+        x, y, mask = data_to_numpy29([self.data[idx]], [self.targets[idx]])
			
 
				+        x = x[0]
			
 
				+        y = y[0]
			
 
				+        mask = mask[0]
			
 
				+        return x, y, mask
			
--- a/BiddingKG/dl/table_head/predict_torch.py
+++ b/BiddingKG/dl/table_head/predict_torch.py
@@ -0,0 +1,68 @@
 
				+import copy
			
 
				+import os
			
 
				+import sys
			
 
				+import torch
			
 
				+from torch.utils.data import DataLoader
			
 
				+
			
 
				+sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
			
 
				+from BiddingKG.dl.table_head.models.model_torch import TableHeadModel
			
 
				+from BiddingKG.dl.table_head.pre_process_torch import CustomDatasetTiny40, set_same_table_head, set_label
			
 
				+
			
 
				+device = torch.device("cpu")
			
 
				+model_path = os.path.abspath(os.path.dirname(__file__)) + '/model_40_0.951.pth'
			
 
				+batch_size = 1
			
 
				+
			
 
				+
			
 
				+def predict(table_text_list):
			
 
				+    if globals().get("model") is None:
			
 
				+        print("="*15, "init table_head model", "="*15)
			
 
				+        # 实例化模型
			
 
				+        model = TableHeadModel()
			
 
				+        model.to(device)
			
 
				+        model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))
			
 
				+        # 将模型设置为评估模式
			
 
				+        model.eval()
			
 
				+        globals()["model"] = model
			
 
				+    else:
			
 
				+        model = globals().get("model")
			
 
				+
			
 
				+    if len(table_text_list) <= 0:
			
 
				+        return []
			
 
				+
			
 
				+    data_x = copy.deepcopy(table_text_list)
			
 
				+    data_y = [[0 for col in row] for row in data_x]
			
 
				+
			
 
				+    row_len = len(data_x)
			
 
				+    col_len = len(data_x[0])
			
 
				+
			
 
				+    if col_len >= 50:
			
 
				+        return data_y
			
 
				+
			
 
				+    if col_len >= 20:
			
 
				+        batch_row_len = 50
			
 
				+    else:
			
 
				+        batch_row_len = 100
			
 
				+
			
 
				+    result_list = []
			
 
				+    for i in range(0, row_len, batch_row_len):
			
 
				+        batch_data_x = data_x[i:i+batch_row_len]
			
 
				+        dataset = CustomDatasetTiny40([batch_data_x], [data_y], mode=0)
			
 
				+        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
			
 
				+        # 存储预测结果
			
 
				+        with torch.no_grad():
			
 
				+            for data, targets, _ in data_loader:
			
 
				+                data = data.to(device)
			
 
				+                outputs = model(data)
			
 
				+                outputs = set_same_table_head(data, outputs)
			
 
				+                result = torch.zeros_like(outputs)
			
 
				+                result[outputs >= 0.5] = 1
			
 
				+                result = result.numpy().tolist()
			
 
				+        result_list += result
			
 
				+
			
 
				+    # 设置一些特定的表头
			
 
				+    for i in range(len(result_list)):
			
 
				+        row = table_text_list[i]
			
 
				+        row_label = result_list[i]
			
 
				+        result_list[i] = set_label(row, row_label)
			
 
				+
			
 
				+    return result_list