Ver código fonte

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

 Conflicts:
	BiddingKG/dl/interface/Preprocessing.py
znj 2 anos atrás
pai
commit
bd64c5b2ea

+ 8 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -2032,7 +2032,7 @@ def del_achievement(text):
 def del_tabel_achievement(soup):
     if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
         return None
-    p1 = '中标(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
+    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
     '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
     for tag in soup.find_all('table'):
         pre_text = tag.findPreviousSibling().text.strip() if tag.findPreviousSibling() != None else ""
@@ -2201,6 +2201,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
         article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
+        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(\w{4,15})/(\w{4,15})[,。]', article_processed)
+        if ser:
+            article_processed = article_processed.replace(ser.group(0), '采购人名称: %s,采购代理机构名称:%s,' % (ser.group(1), ser.group(2)))
+
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -2421,6 +2425,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
     list_entitys = []
+    not_extract_roles = ['黄埔军校'] # 需要过滤掉的企业单位
     for list_sentence in list_sentences:
         sentences = []
         list_entitys_temp = []
@@ -2546,6 +2551,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
                 # 组织机构实体名称补充
                 if entity_type in ["org", "company"]:
+                    if entity_text in not_extract_roles: # 过滤掉名称在 需要过滤企业单位列表里的
+                        continue
                     if not re.search("有限责任公司|有限公司",entity_text):
                         fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
                         if fix_name:

+ 8 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -879,7 +879,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
                 if re.search('(业绩|信誉要求):', content[:iter.start()]): # 前面有业绩或信誉的标段去掉
                     continue
                 # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5]))
-                if re.match('\d', iter.group(0)) and re.search('\d.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况
+                if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况
                     # print('过滤掉错误包:', iter.group())
                     continue
                 if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[iter.start():iter.end()+3]) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
@@ -3515,6 +3515,13 @@ def update_prem(old_prem, new_prem):
             for k in del_k:
                 old_prem.pop(k)
 
+        if 'Project' in old_prem:
+            for d in old_prem['Project']['roleList']:
+                if d['role_name'] in ['tenderee', 'agency']:
+                    tenderree_ = d['role_text']
+                    if tenderree_ in str(new_prem) and re.search('公司', tenderree_):
+                        old_prem['Project']['roleList'].remove(d) # 如果旧预测的招标人/代理人在表格预测里面去掉,防止错误召回,以表格提取的为准
+
         for k, v in new_prem.items():
             if k == 'Project':
                 if 'Project' in old_prem:

BIN
BiddingKG/dl/interface/header_set.pkl


+ 29 - 14
BiddingKG/dl/interface/predictor.py

@@ -705,7 +705,7 @@ class PREMPredict():
                 elif re.search('第[4-9四五六]中标候选人', front):  #修复第4以上的预测错为中标人
                     label = 5
                     values[label] = 0.5
-                elif re.search('(序号|排名|排序|名次):[4-9],', front): # 293225236 附件中 排名预测错误
+                elif re.search('(序号|排名|排序|名次):([4-9]|\d{2,}),', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
             elif re.search('是否中标:是,供应商', front) and label == 5:
@@ -4475,7 +4475,7 @@ class DistrictPredictor():
                 return ''
 
         def get_project_addr(text):
-            p1 = '(项目|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
             if re.search(p1, text):
                 return re.search(p1, text).group(0)
             else:
@@ -4669,14 +4669,14 @@ class TablePremExtractor(object):
     def __init__(self):
         '''各要素表头规则'''
         self.head_rule_dic = {
-            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
+            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
             "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
-            "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
-            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
+            "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
+            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
-            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
+            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -4686,7 +4686,7 @@ class TablePremExtractor(object):
 
 
     def find_header(self, td_list):
-        td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
+        td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$|/万?元', '', it) for it in td_list]
         header_dic = dict()
         flag = False
         contain_header = False
@@ -4721,9 +4721,11 @@ class TablePremExtractor(object):
                         if re.search('^金额((万?元))?$', text):
                             header_dic['budget'] = (i, text)
                             break
-            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
-                    'budget' in header_dic or 'bid_amount' in header_dic):
+            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
+                     'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
                 return flag, contain_header, header_dic
+            elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
+                return flag,contain_header, header_dic
         elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
         return flag, contain_header, dict()
@@ -4800,9 +4802,11 @@ class TablePremExtractor(object):
 
             if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
                 continue
-            if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort) == None:
+            if win_sort != "" and re.search('是否(中标|成交)', headers['win_sort'][1]) and re.search('否|未(中标|成交)', win_sort):
+                continue
+            if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
-            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
+            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐中标候选人', headers['tenderer'][1])==None:
                 tenderer = ""
 
             # tenderee = tenderee if self.is_role(tenderee) else ""
@@ -4817,7 +4821,7 @@ class TablePremExtractor(object):
                 continue
             link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
 
-            package = package_code if package_code else str(i+1)
+            package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
             package = uniform_package_name(package)
 
             if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
@@ -4897,6 +4901,17 @@ class TablePremExtractor(object):
 
         rs_dic = {}
         for table in tables:
+
+            text = table.text.strip()
+            previous = table.findPreviousSibling()
+            text2 = previous .text.strip() if previous else ""
+            # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
+            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
+                tb_ex = table.extract()
+                if previous:
+                    sib = previous.extract()
+                continue
+
             trs = self.tb.table2list(table)
             # table.extract()
             i = 0
@@ -4960,7 +4975,7 @@ class CandidateExtractor(object):
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否中标|是否入围|是否入库|入围结论',
             "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
-            "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
             "third_tenderer": "第三名|第三(中标|成交)?候选人",