Преглед изворни кода

优化资质要求、金额等;新增政策法规等

lsm пре 8 месеци
родитељ
комит
3fdcb4e179

+ 2 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -710,6 +710,8 @@ def tableToText(soup, docid=None):
                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
                 if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式'] and predict_list[i][j]!=1:
                     inner_table[i][j] = [origin_inner_table[i][j][0], 1]
+                elif origin_inner_table[i][j][0] in ['经评审的最低评标价法'] and predict_list[i][j]==1:
+                    inner_table[i][j] = [origin_inner_table[i][j][0], 0]
 
         if show:
             print(json.dumps(inner_table, ensure_ascii=False))

+ 3 - 2
BiddingKG/dl/interface/extract.py

@@ -267,10 +267,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name = extract_parameters(parse_document)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name = extract_parameters(parse_document)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
     if addr_bidopen_text == '':
         addr_bidopen_text = extract_addr(list_articles[0].content)
 
@@ -527,6 +527,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # 是否为存款项目
     data_res['is_deposit_project'] = deposit_project
     data_res['pinmu_name'] = pinmu_name
+    data_res['policies'] = list_policy
 
     # for _article in list_articles:
     #         log(_article.content)

+ 1 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -4404,7 +4404,7 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
                                             b = b2
                                             e = e2
                                             find_joint = 1
-                                        elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
+                                        elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。', ')'] or s[e2:e2+3]=='联合体'):
                                             join_l.append(behind_entity.entity_text)
                                             b = b2
                                             e = e2

+ 3 - 1
BiddingKG/dl/interface/get_label_dic.py

@@ -229,7 +229,9 @@ def get_all_label(title, content):
 
     def consortium_permit():
         # 允许联合体:是;不允许:否;无关键词:0
-        if re.search('(接受|允许|欢迎|同意))?联合体(参与)?投标|联合体投标是合法的|联合体投标的,应|联合体各方应具备承担|投标人可以组成联合体', content):
+        if re.search('(接受|允许|欢迎|同意)联合体(投标)?:否|联合体投标:(不(接受|允许|欢迎|同意)|否)', content):
+            return '否'
+        elif re.search('(接受|允许|欢迎|同意))?联合体(参与)?投标|联合体投标是合法的|联合体投标的,应|联合体各方应具备承担|投标人可以组成联合体|(接受|允许|欢迎|同意)联合体(投标)?:是', content):
             if re.search('不(接受|允许|欢迎|同意))?联合体(参与)?投标|禁止联合体(参与)?投标|投标人不得为联合体|仅接受独立法人投标|投标人必须为独立法人|不得组成联合体|只有独立法人单位可以参与', content):
                 return '否'
             return '是'

+ 28 - 4
BiddingKG/dl/interface/outline_extractor.py

@@ -57,11 +57,13 @@ def extract_sentence_list(sentence_list):
 requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
                           "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
-aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
+aptitude_pattern = "((资格|资质)[的及]?要求|竞买资格及要求|供应商报价须知)([::,]|$)|(竞买|竞买人|竞投人)?资格(条件)?:"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
-pinmu_name_pattern = "采购品目名称(\w{2,50})[,]"
+pinmu_name_pattern = "采购品目名称([::,]|$)"
 out_lines = []
+policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
+not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
 
 def extract_parameters(parse_document):
     '''
@@ -76,6 +78,7 @@ def extract_parameters(parse_document):
     addr_bidsend_text = '' # 投标地址
     requirement_scope = [] # 采购内容始末位置
     pinmu_name = '' # 品目名称
+    list_policy = [] # 政策法规
 
     _find_count = 0
     _data_i = -1
@@ -161,7 +164,23 @@ def extract_parameters(parse_document):
                     _data_i += len(childs)
                     _data_i -= 1
                 elif re.search(pinmu_name_pattern, _text):
-                    pinmu_name += re.search(pinmu_name_pattern, _text).group(1)
+                    childs = get_childs([_data], max_depth=1)
+                    for c in childs:
+                        pinmu_name += c["text"]
+                    _data_i += len(childs)
+                    _data_i -= 1
+    # Second pass over the parsed items: collect every text span that matches
+    # policy_pattern from "sentence" items into list_policy, once each.
+    # NOTE(review): not_policy_pattern is declared next to policy_pattern but
+    # is not applied in this loop -- confirm whether matches should be
+    # filtered through it.
+    _data_i = -1
+    while _data_i<len(list_data)-1:
+        _data_i += 1
+        _data = list_data[_data_i]
+        _type = _data["type"]
+        _text = _data["text"].strip()
+        if _type=="sentence":
+            for it in re.finditer(policy_pattern, _text):
+                # Compare the matched text, not the Match object: a Match never
+                # equals a str, so testing `it` itself let duplicates through.
+                policy = it.group(0)
+                if policy not in list_policy:
+                    list_policy.append(policy)
+
     if re.search('时间:', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
         for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
             b, e = ser.span()
@@ -172,7 +191,12 @@ def extract_parameters(parse_document):
         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
             b, e = ser.span()
         addr_bidsend_text = addr_bidsend_text[b:e]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name
+    ser = re.search(pinmu_name_pattern, pinmu_name)
+    if ser:
+        pinmu_name = pinmu_name[ser.end():]
+        if re.search('[^\w]$', pinmu_name):
+            pinmu_name = pinmu_name[:-1]
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy
 
 def extract_addr(content):
     '''

+ 4 - 4
BiddingKG/dl/interface/predictor.py

@@ -6845,7 +6845,7 @@ class TablePremExtractor(object):
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
 
-                if (re.search('费率|下浮率|[%%‰折]',
+                if (re.search('费率|下浮率|[%%‰折]|优惠率',
                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
                     budget = 0
                 if budget > 0:
@@ -6886,7 +6886,7 @@ class TablePremExtractor(object):
                     continue
 
                 bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
-                if (re.search('费率|下浮率|[%%‰折]',
+                if (re.search('费率|下浮率|[%%‰折]|优惠率',
                               bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
                     bid_amount = 0
                 if serviceTime:
@@ -7288,7 +7288,7 @@ class CandidateExtractor(object):
                             break
                         money, money_unit = money_process(text, header)
 
-                        if (re.search('费率|下浮率|[%%‰折]', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
+                        if (re.search('费率|下浮率|[%%‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
                             money = 0
                         if money > 0:
                             if type not in role_dic:
@@ -7325,7 +7325,7 @@ class CandidateExtractor(object):
                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
 
                     header = headers['bid_amount'][1] if "bid_amount" in headers else ''
-                    if (re.search('费率|下浮率|[%%‰折]',
+                    if (re.search('费率|下浮率|[%%‰折]|优惠率',
                                   header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
                         bid_amount = 0
                     prem_dic[package]['roleList'].append({