1 month ago · 342bcc2e00
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -1160,7 +1160,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
															     # 使用正则识别金额
														
 
															     entity_type = "money"
														
 
															     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
														
 
															-                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
														
 
															+                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：?)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
														
 
															                           "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
														
 
															                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
														
 
															     # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
														
@@ -1269,10 +1269,12 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
															                     unit = '万元'
														
 
															                 elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
														
 
															                     unit = '万元'
														
 
															-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
														
 
															+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本|报价（单位）)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
														
 
															                     if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
														
 
															                         unit = '万元'
														
 
															                         # print('金额较小且句子中有万元的，补充单位为万元')
														
 
															+                    elif re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<100 and re.search('单位：%', sentence_text):
														
 
															+                        continue
														
 
															                     elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
														
 
															                         unit = '万元'
														
 
															                     else:
														
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -574,12 +574,11 @@ def match_enterprise_max_first(sentence):
 
															             else:
														
 
															                 break
														
 
															     # print("======",list_match)
														
 
															-    long_names = ['乌鲁木齐经济技术开发区（乌鲁木齐市头屯河区）市场监督管理局（区知识产权局、区市场监管综合行政执法队）']
														
 
															-    if len(sentence) > MAX_ENTERPRISE_LEN:
														
 
															-        for name in long_names: # 规则补充超长实体
														
 
															-            for it in re.finditer(name, sentence):
														
 
															-                match_item = {"entity_text": "%s" % (name), "begin_index": it.start(), "end_index": it.end()}
														
 
															-                list_match.append(match_item)
														
 
															+    not_match_names = ['乌鲁木齐经济技术开发区（乌鲁木齐市头屯河区）市场监督管理局（区知识产权局、区市场监管综合行政执法队）', '政采云有限公司'] # 字典匹配不到的名称列表
														
 
															+    pattern = re.compile('|'.join(not_match_names))
														
 
															+    for it in re.finditer(pattern, sentence):
														
 
															+        match_item = {"entity_text": "%s" % (it.group(0)), "begin_index": it.start(), "end_index": it.end()}
														
 
															+        list_match.append(match_item)
														
 
															     return list_match
														
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -3763,6 +3763,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															                 if entity_type == 'location' and re.search('^\w{2,4}[市县]\w{2,15}(中心|监狱|殡仪馆|水利站)$', entity_text) and \
														
 
															                     re.search('\d[楼层号]', entity_text)==None: # 2024/06/07 修改错误地址实体为角色
														
 
															                     entity_type = 'org'
														
 
															+                elif entity_type in ["org", "company"] and re.search('地址：$', sentence_text[:begin_index_temp]): # 20250421 修复地址识别错为角色 地址：新疆阿拉尔幸福镇十三团，2、运维公司名称：政采云有限公司
														
 
															+                    entity_type = 'location'
														
 
															                 if begin_index_temp>0 and '县' in entity_text and re.match('前郭尔罗斯蒙古族自治县|积石山县', sentence_text[begin_index_temp-1:end_index_temp]): #20240905 修复实体识别少字问题
														
 
															                     entity_text = sentence_text[begin_index_temp-1] + entity_text
														
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -512,7 +512,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
														
 
															-    version_date = {'version_date': '2025-04-02'}
														
 
															+    version_date = {'version_date': '2025-04-22'}
														
 
															     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
														
 
															     if original_docchannel == 302:
														
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -95,7 +95,8 @@ class Model_role_classify_word():
 
															         text = re.sub('[【（\[][0-9]{2,}[\]）】]|\d+([：:.-]\d+)+', 'd', text)
														
 
															         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
														
 
															         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
														
 
															-        text = re.sub('序号：\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、|([^\w]|^)序：?\d+', '  d', text) # ，序：1，单位名称：
														
 
															+        text = re.sub('序号：\d+', '序号：d', text)
														
 
															+        text = re.sub('第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、|([^\w]|^)序：?\d+', '  d', text) # ，序：1，单位名称：
														
 
															         text = re.sub('(中标|成交|中选|入围)(工程|项目)', '工程', text)  # 修复易错分为中标人
														
 
															         text = re.sub('约定|(盖章|签名)：?', '  ', text) # 修复 233233636 错分为中标人 国有产权网上竞价有关约定 辽阳市公共资源交易中心 ，标  修复 273505905 乡镇签名：盖章： 次村产权交易服务中心 预测为中标
														
 
															         text = re.sub('中介机构', '投标机构', text) # 251058999 错分为中标人 序号：2，中介机构名称：
														
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -62,7 +62,6 @@ aptitude_pattern = "资质（资格）要求|资格（资质）要求|单位要
 
															 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[)）]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([:：，]|$)|开启([:：，]|$)"
														
 
															 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([:：，]|$)"
														
 
															 pinmu_name_pattern = "采购品目(名称)?([:：，]|$)"
														
 
															-out_lines = []
														
 
															 policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
														
 
															 not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
														
@@ -80,6 +79,7 @@ def extract_parameters(parse_document):
 
															     requirement_scope = [] # 采购内容始末位置
														
 
															     pinmu_name = '' # 品目名称
														
 
															     list_policy = [] # 政策法规
														
 
															+    out_lines = []
														
 
															     _find_count = 0
														
 
															     _data_i = -1
														
@@ -92,7 +92,9 @@ def extract_parameters(parse_document):
 
															         if _type=="sentence":
														
 
															             if _data["sentence_title"] is not None:
														
 
															                 if re.search('[（(][一二三四五六七八九十}]+[)）]|[一二三四五六七八九十]+\s*、|^\d{1,2}[.、][\u4e00-\u9fa5]', _text[:10]):
														
 
															-                    out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
														
 
															+                    idx = _text.replace(':', '：').find('：')
														
 
															+                    outline_text = _text[:idx] if idx >= 4 else _text
														
 
															+                    out_lines.append((outline_text, _data['sentence_index'], _data['wordOffset_begin']))
														
 
															                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求，', _text[:30])==None:
														
 
															                     b = (_data['sentence_index'], _data['wordOffset_begin'])
														
@@ -220,10 +222,10 @@ if __name__ == "__main__":
 
															     from bs4 import BeautifulSoup
														
 
															     import json
														
 
															-    df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
														
 
															-    df['len']= df['招标内容'].apply(lambda x: len(x))
														
 
															-    print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
														
 
															-    print(len([it for it in df['len'] if it>1500]))
														
 
															+    # df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
														
 
															+    # df['len']= df['招标内容'].apply(lambda x: len(x))
														
 
															+    # print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
														
 
															+    # print(len([it for it in df['len'] if it>1500]))
														
 
															     # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
														
 
															     # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx')
														
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -916,7 +916,7 @@ class PREMPredict():
 
															                 elif re.search('^，?(投标报价|(资格性审查：|符合性审查：)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人：单位名称： 上海理想信息产业（集团）有限公司 ，投标报价：
														
 
															                     values[2] = 0.5
														
 
															                     label = 5
														
 
															-                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单)：$|确定为标的的受让方，$|[主次出]入口?，?$|确定(项目|\w{,2})成交供应商，$|，承刻单位：$|乙方接受为$|丙方：$', front):  # 234501112 民币元，序号：1，债务人： 东营市海宁工贸有限责任公司 ，债权本金： 262414286 八、中标后签约单位，合同签约单位： 241929628 1月9，承刻单位： 肃宁县超凡网络光敏印章刻印部 ，印章预留印模
														
 
															+                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单)：$|确定为标的的受让方，$|[主次出]入口?，?$|确定(项目|\w{,2})成交供应商，$|，承刻单位：$|乙方接受为$|丙方：$|来源名称：$', front):  # 234501112 民币元，序号：1，债务人： 东营市海宁工贸有限责任公司 ，债权本金： 262414286 八、中标后签约单位，合同签约单位： 241929628 1月9，承刻单位： 肃宁县超凡网络光敏印章刻印部 ，印章预留印模  600825761 来源名称：红星集团，评标时间：
														
 
															                     label = 5
														
 
															                 elif re.search('，来源：$', front) and re.search('^，', behind): # 修复 472062585 项目采购-关于定制手机询比价采购中标公告，来源：深圳市网联安瑞网络科技有限公司 预测为中标
														
 
															                     label = 0
														
@@ -1487,7 +1487,7 @@ class RoleRulePredictor():
 
															                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书，致|征集结果|选择中介|选择结果|成交对象|勘察人|(，|审计|处置|勘察|设计)服务单位|受托[人方])[：:是为]+$" \
														
 
															                "|((评审结果|名次|排名|中标结果)[:：]*第?[一1]名?)[：:是为]+$|成交供应商信息[，：]?(序号1)?：?|供应商名称$|竞争性选择申请人名称：$" \
														
 
															                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[：:是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格)，$|合作伙伴名称：$|供应商（乙方）-?$" \
														
 
															-               "|现(公布|宣布|公示)中标单位如下：$|现将中标单位(公布|公示)如下：$|现宣布以下(企业|单位|公司)中标：$|经讨论，决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人：$)"  # 承办单位：不作为中标 83914772  |施工 单位不作为中标人 例：386692187
														
 
															+               "|现(公布|宣布|公示)中标单位如下：$|现将中标单位(公布|公示)如下：$|现宣布以下(企业|单位|公司)中标：$|经讨论，决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人：$|入围供应商如下（排名不分先后）[，：]$)"  # 承办单位：不作为中标 83914772  |施工 单位不作为中标人 例：386692187
														
 
															         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
														
 
															                                            "(，|。|：|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
														
 
															                                            "(：?单位名称|：?名称|盖章)?[,，]?([(（]按综合排名排序[)）]|：择优选取)?[：:,，]$|选取(情况|说明)：中选，中介机构名称：$|排名如下：1、$|第[一1]名，?投标(人|单位|银行|公司)：$)"  # 解决表头识别不到加逗号情况，需前面为，。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
														
@@ -7817,7 +7817,7 @@ class WebsourceTenderee():
 
															             web_ree = '中国人民解放军总医院'
														
 
															         elif web_source_no.startswith('Y00484-') and web_ree == "":
														
 
															             web_ree = '航空总医院'
														
 
															-        if web_ree == "" and re.search('\w{2,8}(大学|医院)$', web_source_name): # 20240524 大学、医院类站源没唯一招标人默认为站源名称
														
 
															+        if web_ree == "" and re.search('\w{2,8}(大学|医院|妇幼保健院)$', web_source_name): # 20240524 大学、医院类站源没唯一招标人默认为站源名称
														
 
															             web_ree = web_source_name
														
 
															         if web_ree != '':
														
 
															             if 'Project' in prem[0]['prem']: