Просмотр исходного кода

新增投标地点;优化多中标人、联合体;属性补充产品名称等

lsm 10 месяцев назад
Родитель
Коммит
35a5bdee5e

+ 7 - 3
BiddingKG/dl/interface/extract.py

@@ -253,10 +253,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -432,7 +432,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-07-26'}
+    version_date = {'version_date': '2024-07-30'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -452,6 +452,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
             if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']: # 删除空包
                 data_res['prem'].pop('Project')
 
+    # 把产品属性里面的产品补充到产品列表
+    for d in data_res['product_attrs']['data']:
+        if isinstance(d['product'], str) and d['product'] not in data_res['product']:
+            data_res['product'].append(d['product'])
 
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])

+ 11 - 5
BiddingKG/dl/interface/getAttributes.py

@@ -4285,7 +4285,7 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     :return:
     '''
     try:
-        if 'win_tenderer' in str(prem) and re.search('联合(体|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]', list_articles[0].content):
+        if 'win_tenderer' in str(prem) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]', list_articles[0].content):
             sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
             for project in prem[0].values():
                 if not isinstance(project, dict):
@@ -4303,13 +4303,19 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
                                     if _entity.entity_type in ['org', 'company'] and _entity.label==2\
                                             and _entity.entity_text==winner:
                                         s = sentences[_entity.sentence_index].sentence_text
+                                        find_joint = 0 # 是否包含联合体
                                         for j in range(i+1, len(list_entity)):
                                             behind_entity = list_entity[j]
                                             b2 = behind_entity.wordOffset_begin
                                             e2 = behind_entity.wordOffset_end
                                             if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
-                                                    and b2-e<10 and re.search('联合(体|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[b2-e:b2]) or \
-                                                re.search('(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
+                                                    and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
+                                                re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
+                                                join_l.append(behind_entity.entity_text)
+                                                b = b2
+                                                e = e2
+                                                find_joint = 1
+                                            elif find_joint and s[e:b2] in [';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。']):
                                                 join_l.append(behind_entity.entity_text)
                                                 b = b2
                                                 e = e2
@@ -4377,9 +4383,9 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     ent_bh = list_entitys[0][j]
                     b_idx_bh = ent_bh.wordOffset_begin
                     e_idx_bh = ent_bh.wordOffset_end
-                    if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1:
+                    if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr in [1, 2]:
                         sentence_text = sentences[ent_bh.sentence_index].sentence_text
-                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、','&',','] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in [';','、','&', ',', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
+                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、','&',',','/','//'] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in [';','、','&', ',','/','//', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
                             if ent_bh.entity_text not in multi_winner_l:
                                 multi_winner_l.append(ent_bh.entity_text)
                             e_idx_fr = e_idx_bh

+ 13 - 1
BiddingKG/dl/interface/outline_extractor.py

@@ -57,6 +57,7 @@ requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::,]|$)"
 aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
+addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
 out_lines = []
 
 def extract_parameters(parse_document, content):
@@ -70,6 +71,7 @@ def extract_parameters(parse_document, content):
     requirement_text = ''
     aptitude_text = ''
     addr_bidopen_text = ''
+    addr_bidsend_text = ''
 
     _find_count = 0
     _data_i = -1
@@ -146,6 +148,12 @@ def extract_parameters(parse_document, content):
                         addr_bidopen_text += c["text"]
                     _data_i += len(childs)
                     _data_i -= 1
+                elif re.search(addr_bidsend_pattern, _text[:20]):
+                    childs = get_childs([_data], max_depth=1)
+                    for c in childs:
+                        addr_bidsend_text += c["text"]
+                    _data_i += len(childs)
+                    _data_i -= 1
     if re.search('时间:', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
         for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
             b, e = ser.span()
@@ -156,7 +164,11 @@ def extract_parameters(parse_document, content):
         ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
         if ser:
             addr_bidopen_text = ser.group(0)
-    return requirement_text, aptitude_text, addr_bidopen_text
+    if re.search('时间:', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
+        for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
+            b, e = ser.span()
+        addr_bidsend_text = addr_bidsend_text[b:e]
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:

+ 26 - 20
BiddingKG/dl/interface/predictor.py

@@ -1400,7 +1400,7 @@ class RoleRulePredictor():
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
                                      "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_right = "(?P<tenderee_right>^(机关)?([((](以下简称)?[,\"“]*((招标|采购)(人|单位|机构)|(服务)?购买方)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
         self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
@@ -1464,7 +1464,7 @@ class RoleRulePredictor():
         
         self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价|总费用约?为")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
-        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
+        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
         # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
@@ -6511,7 +6511,7 @@ class TablePremExtractor(object):
             package_code = package_code_raw
             if re.search('合计|总计', package_code+project_code):
                 continue
-            if package_code != '' and package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
+            if package_code + project_code != '' and package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
                 same_package = True
                 project_name = ''
             previous_package = package_code + project_code
@@ -6612,7 +6612,7 @@ class TablePremExtractor(object):
                         "role_text": tenderee,
                         "serviceTime": ""
                 })
-            if tenderer and not same_package:
+            if tenderer:
                 if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
@@ -6628,20 +6628,26 @@ class TablePremExtractor(object):
                 if (re.search('费率|下浮率|[%%‰折]',
                               bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
                     bid_amount = 0
-                prem_dic[package]['roleList'].append({
-                        "address": "",
-                        "linklist": [],
-                        "role_money": {
-                            "discount_ratio": "",
-                            "downward_floating_ratio": "",
-                            "floating_ratio": "",
-                            "money": bid_amount,
-                            "money_unit": money_unit
-                        },
-                        "role_name": "win_tenderer",
-                        "role_text": tenderer,
-                        "serviceTime": ""
-                })
+                if not same_package:
+                    prem_dic[package]['roleList'].append({
+                            "address": "",
+                            "linklist": [],
+                            "role_money": {
+                                "discount_ratio": "",
+                                "downward_floating_ratio": "",
+                                "floating_ratio": "",
+                                "money": bid_amount,
+                                "money_unit": money_unit
+                            },
+                            "role_name": "win_tenderer",
+                            "role_text": tenderer,
+                            "serviceTime": ""
+                    })
+                elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
+                    if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
+                        prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']+','+tenderer
+                    else:
+                        prem_dic[package]['roleList'][-1]['multi_winner'] += ','+tenderer
                 tenderer_list.append(tenderer)
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
@@ -7367,7 +7373,7 @@ class ApprovalPredictor():
             if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
                 code_name_set.add(multi_project['project_code']+multi_project['project_name'])
                 district = getPredictor('district').get_area(
-                    multi_project['project_name'] + multi_project['project_addr'], '')
+                    multi_project['approver'] + multi_project['project_name'] + multi_project['project_addr'], '')
                 if district['district']['province'] != '全国':
                     multi_project['area'] = district['district']['area']
                     multi_project['province'] = district['district']['province']
@@ -7379,7 +7385,7 @@ class ApprovalPredictor():
             return rs_l
         elif found_key == 1:
             district = getPredictor('district').get_area(
-                rs_dic['construct_company'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
+                rs_dic['approver'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
             if district['district']['province'] != '全国':
                 rs_dic['area'] = district['district']['area']
                 rs_dic['province'] = district['district']['province']