Browse Source

新增入围候选人提取,表格前3角色提取 ,微调表格多包中标规则

lsm 2 years ago
parent
commit
2772967107

+ 3 - 0
BiddingKG/dl/common/Utils.py

@@ -842,6 +842,9 @@ def uniform_package_name(package_name):
         _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
         _digit = uniform_num(_digit)
         name += _digit
+    elif re.search('^[a-zA-Z0-9-]+$', package_name):
+        _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
+        name += _char.upper()
     if name == "":
         return package_name_raw
     else:

+ 5 - 4
BiddingKG/dl/interface/extract.py

@@ -202,10 +202,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''表格要素提取'''
     table_prem = predictor.getPredictor("tableprem").predict(text)
     if table_prem:
-        if 'Project' in prem[0]['prem']:
-            table_prem.update({'Project': prem[0]['prem']['Project']})
-        prem[0]['prem'] = table_prem
+        getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
 
+    '''候选人提取'''
+    candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys)
+    getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
 
     '''获取联合体信息'''
     getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
@@ -253,7 +254,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district)
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment

+ 46 - 0
BiddingKG/dl/interface/getAttributes.py

@@ -3370,6 +3370,52 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     except Exception as e:
         print('获取联合体抛出异常', e)
 
+def update_prem(old_prem, new_prem):
+    '''
+    Merge newly extracted package/role data into an existing prem dict, in place.
+
+    Packages missing from old_prem are added as-is. For packages present in
+    both, roles are paired one-to-one by 'role_name': a matched old role takes
+    the new role's text, money and money unit; new roles left unmatched are
+    appended to the old role list.
+
+    :param old_prem: existing mapping of package name -> package dict; every
+                     package dict is accessed through its 'roleList' key.
+                     Modified in place.
+    :param new_prem: freshly extracted mapping with the same layout (e.g. the
+                     table extraction result). Its role lists are not modified.
+    :return: None; the merged result is left in old_prem.
+    '''
+    if len(new_prem) < 1:
+        return
+
+    # When the new extraction holds more than two packages and the old result
+    # has more entries than that, the new result is authoritative: drop the old
+    # packages it does not contain (the 'Project' entry is always kept).
+    if len(new_prem) > 2 and len(old_prem) > len(new_prem):
+        for k in [k for k in old_prem if k not in new_prem and k != 'Project']:
+            old_prem.pop(k)
+
+    for k, v in new_prem.items():
+        if k not in old_prem:
+            old_prem[k] = v
+            continue
+
+        old_roles = old_prem[k]['roleList']
+        # Work on a snapshot: removing from the list being iterated skips the
+        # following element, and the caller's list should stay untouched.
+        pending = list(v['roleList'])
+        for d in old_roles:
+            for idx, d2 in enumerate(pending):
+                if d['role_name'] == d2['role_name']:
+                    d['role_text'] = d2['role_text']
+                    d['role_money']['money'] = d2['role_money']['money']
+                    d['role_money']['money_unit'] = d2['role_money']['money_unit']
+                    # Each new role updates at most one old role.
+                    del pending[idx]
+                    break
+        # Whatever found no counterpart is genuinely new for this package.
+        old_roles.extend(pending)
 
 if __name__=="__main__":
     '''

BIN
BiddingKG/dl/interface/header_set.pkl


+ 298 - 21
BiddingKG/dl/interface/predictor.py

@@ -55,6 +55,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "moneygrade": {"predictor": None, "Lock": RLock()},
                   "district": {"predictor": None, "Lock": RLock()},
                   'tableprem': {"predictor": None, "Lock": RLock()},
+                  'candidate': {"predictor": None, "Lock": RLock()},
                   }
 
 
@@ -100,6 +101,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = DistrictPredictor()
                 if _type == 'tableprem':
                     dict_predictor[_type]["predictor"] = TablePremExtractor()
+                if _type == 'candidate':
+                    dict_predictor[_type]["predictor"] = CandidateExtractor()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -3499,7 +3502,7 @@ class DocChannel():
                   self.title_life_dic['废标公告'], title) == None:
               result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
               msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
-          elif result['docchannel']['doctype'] != '采招数据' and origin_dic.get(
+          elif result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get(
                   original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨', title)==None:
               result['docchannel']['doctype'] = '采招数据'
               msc += '最终规则修改:预测为非采招数据,原始为采招数据且无关键词,返回采招数据'
@@ -4320,23 +4323,8 @@ class DistrictPredictor():
                 rs = rs2
         return rs
 
-class TablePremExtractor(object):
-    def __init__(self):
-        '''各要素表头规则'''
-        self.head_rule_dic = {
-            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
-            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
-            "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
-            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
-            "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
-            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
-        }
-
-        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
-            self.headerset = pickle.load(f)
-
+class TableTag2List():
+    '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
     def table2list(self, table):
         self._output = []
         row_ind = 0
@@ -4415,6 +4403,27 @@ class TablePremExtractor(object):
         if self._output[i][j] == "":
             self._output[i][j] = val
 
+
+class TablePremExtractor(object):
+    def __init__(self):
+        '''Prepare the header rules, the known-header set and the table flattener.'''
+        # One regex per extractable element, tested against table header cells.
+        header_rules = dict()
+        header_rules['project_code'] = "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号"
+        header_rules['package_code'] = "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$"
+        header_rules["project_name"] = "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)"
+        header_rules["win_sort"] = "是否中标|排名|排序|名次|未(中标|成交)原因"
+        header_rules["tenderer"] = "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$"
+        header_rules["tenderee"] = "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)"
+        header_rules["budget"] = "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价"
+        header_rules["bid_amount"] = "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价"
+        self.head_rule_dic = header_rules
+
+        # The pickled header collection sits next to this module.
+        headerset_path = os.path.dirname(__file__)+'/header_set.pkl'
+        with open(headerset_path, 'rb') as f:
+            self.headerset = pickle.load(f)
+
+        # Helper that turns a soup <table> into a list of rows of cell texts.
+        self.tb = TableTag2List()
+
+
     def find_header(self, td_list):
         header_dic = dict()
         flag = False
@@ -4440,7 +4449,7 @@ class TablePremExtractor(object):
         return flag, dict()
 
     def is_role(self, text):
-        if len(text) > 25 or len(text)<5:
+        if len(text) > 25 or len(text)<4:
             return False
         elif len(re.findall('有限责?任?公司', text)) > 1:
             return False
@@ -4469,6 +4478,10 @@ class TablePremExtractor(object):
             bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
             win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
 
+            if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
+                package_code_raw = project_name
+                project_name = ""
+
             package_code = package_code_raw
             if re.search('合计|总计', package_code+project_code):
                 continue
@@ -4481,7 +4494,7 @@ class TablePremExtractor(object):
                 continue
             if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort) == None:
                 continue
-            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and 'bid_amount' in headers and re.search('(中标|成交)价', headers['bid_amount'][1])==None:
+            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
                 tenderer = ""
 
             tenderee = tenderee if self.is_role(tenderee) else ""
@@ -4580,7 +4593,7 @@ class TablePremExtractor(object):
         tables.reverse()
         rs_dic = {}
         for table in tables:
-            trs = self.table2list(table)
+            trs = self.tb.table2list(table)
             table.extract()
             i = 0
             headers = ""
@@ -4617,6 +4630,270 @@ class TablePremExtractor(object):
             prem = self.get_prem(richText)
         return prem
 
+class CandidateExtractor(object):
+    '''
+    Extract bid candidates from an announcement.
+
+    Tables are searched first for the top-three ranked bidders and for the
+    names of shortlisted candidates; when the tables yield nothing, candidate
+    names are looked up in the sentence text right after a candidate label.
+    '''
+
+    def __init__(self):
+        '''Set up header rules, the free-text label pattern, the table flattener and the known-header set.'''
+        # One regex per column kind, tested against table header cells.
+        self.head_rule_dic = {
+            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
+            "win_sort": "排名|排序|名次",
+            'win_or_not': '是否中标|是否入围|是否入库|入围结论',
+            "candidate": r"((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
+            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
+            "win_tenderer": "第一名|第一(中标|成交)?候选人",
+            "second_tenderer": "第二名|第二(中标|成交)?候选人",
+            "third_tenderer": "第三名|第三(中标|成交)?候选人",
+        }
+        # Label expected immediately before a candidate name outside of tables.
+        self.p = r'((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?:$'
+        self.tb = TableTag2List()
+        # NOTE(review): pickle.load executes arbitrary code from the file; this
+        # is only acceptable while header_set.pkl is a trusted, packaged asset.
+        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
+            self.headerset = pickle.load(f)
+
+    @staticmethod
+    def _new_package():
+        '''Return an empty package entry in the prem layout.'''
+        return {
+            'code': '',
+            'name': '',
+            'roleList': [],
+            'tendereeMoney': 0,
+            'tendereeMoneyUnit': ""
+        }
+
+    @staticmethod
+    def _new_role(role_name, role_text, money, money_unit):
+        '''Return a role entry in the prem layout for one ranked bidder.'''
+        return {
+            "address": "",
+            "linklist": [],
+            "role_money": {
+                "discount_ratio": "",
+                "downward_floating_ratio": "",
+                "floating_ratio": "",
+                "money": money,
+                "money_unit": money_unit
+            },
+            "role_name": role_name,
+            "role_text": role_text,
+            "serviceTime": ""
+        }
+
+    def find_header(self, td_list):
+        '''
+        Decide whether a table row is a header row and map its columns.
+
+        :param td_list: cell texts of one table row
+        :return: (flag, header_dic); flag is True when at least 60% of the
+                 distinct cells are known headers. header_dic maps rule key ->
+                 (column index, header text) and is empty unless the row has a
+                 candidate column or both a first- and second-place column.
+        '''
+        header_dic = dict()
+        cells = set(td_list)
+        if len(cells) < 2 or len(cells & self.headerset) / len(cells) < 0.6:
+            return False, dict()
+
+        for i, text in enumerate(td_list):
+            if len(text) > 15:  # cells longer than 15 characters are not matched as headers
+                continue
+            if re.search('未(中标|成交)原因', text):  # tables listing failure reasons are not extracted
+                return True, dict()
+            num = 0
+            for k, v in self.head_rule_dic.items():
+                if re.search(v, text):
+                    header_dic[k] = (i, text)
+                    if k != 'candidate':  # 'candidate' may coincide with a top-three header
+                        num += 1
+            if num > 1:
+                print('表头错误,一个td匹配到两个表头:', header_dic)
+                return True, dict()
+        if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
+            return True, header_dic
+        return True, dict()
+
+    def is_role(self, text):
+        '''Return True when text looks like a single organisation or company name.'''
+        if len(text) > 25 or len(text) < 4:
+            return False
+        elif len(re.findall('有限责?任?公司', text)) > 1:
+            return False
+        elif re.search(r'[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
+            return True
+        else:
+            # Fall back to the NER model: accept exactly one company/org entity.
+            ners = selffool.ner(text)
+            if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
+                return True
+        return False
+
+    def money_process(self, money_text, header):
+        '''
+        Normalise an amount cell to a number and a unit.
+
+        :param money_text: text of the amount cell
+        :param header: header text of the amount column; '万元' in it is applied
+                       when the cell itself carries no '万'
+        :return: (money, money_unit); (0, "") when no amount is found
+        '''
+        money = 0
+        money_unit = ""
+        re_price = re.search(r"[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
+        if re_price:
+            money_text = re_price.group(0)
+            if '万元' in header and '万' not in money_text:
+                money_text += '万元'
+            money = float(str(getUnifyMoney(money_text)))
+            if money > 10000000000000:  # amounts above ten trillion are discarded
+                money = 0
+            money_unit = '万元' if '万' in money_text else '元'
+        return (money, money_unit)
+
+    def extract_from_df(self, df, headers):
+        '''
+        Extract ranked bidders and candidate names from the body rows of a table.
+
+        :param df: DataFrame of body rows, columns labelled by position
+        :param headers: mapping rule key -> (column index, header text), as
+                        returned by find_header
+        :return: (prem_dic, candidate_set); prem_dic maps package name -> package
+                 entry with its roleList, candidate_set holds candidate names
+        '''
+        prem_dic = {}
+        link_set = set()
+        candidate_set = set()
+        role_dic = dict()  # first/second/third bidders laid out side by side in columns
+        package = "Project"  # used after the loop; stays at the default when df has no rows
+        ranked_keys = ['win_tenderer', 'second_tenderer', 'third_tenderer']
+
+        def cell(row, key):
+            '''Return the cell of `row` in the column mapped to `key`, or "" when the column is absent.'''
+            return df.loc[row, headers[key][0]] if key in headers else ""
+
+        for i in df.index:
+            package_code = cell(i, 'package_code')
+            candidate_ = cell(i, 'candidate')
+            win_or_not = cell(i, 'win_or_not')
+            bid_amount_ = cell(i, 'bid_amount')
+            win_sort = cell(i, 'win_sort')
+            win_tenderer = cell(i, 'win_tenderer')
+            second_tenderer = cell(i, 'second_tenderer')
+            third_tenderer = cell(i, 'third_tenderer')
+
+            candidate = candidate_ if self.is_role(candidate_) else ""
+
+            # Skip rows already seen; the same raw tuple is used for the lookup
+            # and for the insert so the de-duplication actually takes effect.
+            row_key = (candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_)
+            if row_key in link_set:
+                continue
+            link_set.add(row_key)
+
+            package = uniform_package_name(package_code) if package_code != "" else "Project"
+
+            # Only validated names are collected, so a rejected cell cannot add
+            # an empty string; rows marked as not selected are left out.
+            if candidate and not (win_or_not and re.search('否|未入围', win_or_not)):
+                candidate_set.add(candidate)
+
+            if win_tenderer and second_tenderer and third_tenderer:
+                ranked_cells = [win_tenderer, second_tenderer, third_tenderer]
+                if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
+                    # Row holding the three bidder names.
+                    for role_name, text in zip(ranked_keys, ranked_cells):
+                        if self.is_role(text):
+                            role_dic.setdefault(role_name, dict())['role_text'] = text
+                            if role_name in ['second_tenderer', 'third_tenderer']:
+                                candidate_set.add(text)
+
+                elif re.search('投标报价|报价$', df.loc[i, 0]) or re.search('投标报价|报价$', df.loc[i, 1]):
+                    # Row holding the three bid prices; its label cell acts as the header.
+                    header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
+                    for role_name, text in zip(ranked_keys, ranked_cells):
+                        money, money_unit = self.money_process(text, header)
+                        if money > 0:
+                            role_info = role_dic.setdefault(role_name, dict())
+                            role_info['money'] = money
+                            role_info['money_unit'] = money_unit
+                else:
+                    break
+            elif candidate and win_sort:
+                role_type = ""
+                if re.search('第[一1]|^[一1]$', win_sort):
+                    role_type = "win_tenderer"
+                elif re.search('第[二2]|^[二2]$', win_sort):
+                    role_type = "second_tenderer"
+                elif re.search('第[三3]|^[三3]$', win_sort):
+                    role_type = "third_tenderer"
+                if role_type != "":
+                    if package not in prem_dic:
+                        prem_dic[package] = self._new_package()
+
+                    if "bid_amount" in headers:
+                        # Pass the column header text, which may carry the unit.
+                        bid_amount, money_unit = self.money_process(bid_amount_, headers['bid_amount'][1])
+                    else:
+                        bid_amount, money_unit = 0, ""
+                    prem_dic[package]['roleList'].append(
+                        self._new_role(role_type, candidate, bid_amount, money_unit))
+
+        if role_dic and prem_dic == dict():
+            # Only the side-by-side layout produced data: build the package from it.
+            prem_dic[package] = self._new_package()
+            for role_type, v in role_dic.items():
+                role_text = v.get('role_text', '')
+                if role_text == "":
+                    continue
+                prem_dic[package]['roleList'].append(
+                    self._new_role(role_type, role_text, v.get('money', 0), v.get('money_unit', '')))
+            if len(prem_dic[package]['roleList']) == 0:  # no usable role: drop the package
+                prem_dic.pop(package)
+
+        return prem_dic, candidate_set
+
+    def get_prem(self, soup):
+        '''
+        Run the table extraction over every <table> of a parsed document.
+
+        Tables are processed in reverse document order and detached from soup
+        once read.
+
+        :param soup: BeautifulSoup node to search
+        :return: (rs_dic, candidate_set) accumulated over all tables
+        '''
+        tables = soup.find_all('table')
+        tables.reverse()
+        rs_dic = {}
+        candidate_set = set()
+        for table in tables:
+            trs = self.tb.table2list(table)
+            table.extract()
+            i = 0
+            headers = ""
+            while i < len(trs) - 1:
+                flag_, headers_ = self.find_header(trs[i])
+                if flag_ and headers_ != dict():
+                    table_items = []
+                    headers = headers_
+                    # Collect body rows until the next header row or a row whose
+                    # column count differs from the header's.
+                    for j in range(i + 1, len(trs)):
+                        if len(trs[j]) == len(trs[i]):
+                            flag_, headers_ = self.find_header(trs[j])
+                            if flag_:
+                                break
+                            else:
+                                table_items.append(trs[j])
+                        else:
+                            print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
+                            break
+                    if len(table_items) > 1:
+                        df = pd.DataFrame(table_items)
+                        prem_, candidate_set_ = self.extract_from_df(df, headers)
+                        rs_dic.update(prem_)
+                        candidate_set.update(candidate_set_)
+                    # Resume at the row that ended the scan (the increment below lands on it).
+                    i = j - 1
+                i += 1
+        return rs_dic, candidate_set
+
+    def get_candidates_from_text(self, list_sentences, list_entitys):
+        '''
+        Collect org/company entities whose preceding text ends with a candidate label.
+
+        :param list_sentences: per-document sentence lists; only element 0 is used
+        :param list_entitys: per-document entity lists; only element 0 is used
+        :return: set of entity texts
+        '''
+        candidates = set()
+        # NOTE(review): sentences are indexed by entity.sentence_index after
+        # sorting, which presumes indexes are contiguous from 0 - confirm.
+        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
+        for ent in list_entitys[0]:
+            if ent.entity_type in ['org', 'company']:
+                sen_index = ent.sentence_index
+
+                text = sentences[sen_index].sentence_text
+                b = ent.wordOffset_begin
+                e = ent.wordOffset_end
+                if isinstance(b, int) and isinstance(e, int):
+                    # Up to ten characters directly in front of the entity.
+                    foreword = text[max(0, b - 10):b]
+                    if re.search(self.p, foreword):
+                        candidates.add(ent.entity_text)
+        return candidates
+
+    def predict(self, html, list_sentences, list_entitys):
+        '''
+        Extract top-three roles and candidate names from an announcement.
+
+        :param html: announcement HTML
+        :param list_sentences: sentence lists passed to get_candidates_from_text
+        :param list_entitys: entity lists passed to get_candidates_from_text
+        :return: (prem, {'candidate': names}); names is a comma-joined string of
+                 candidate names in sorted order
+        '''
+        soup = BeautifulSoup(html, 'lxml')
+        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
+        if richText:
+            richText = richText.extract()  # keep the attachment part out of the main pass
+        prem, candidate_set = self.get_prem(soup)
+        if prem == {} and richText:
+            prem, candidate_set = self.get_prem(richText)
+        if prem == {} and candidate_set == set():
+            candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
+        # Sorted so the same document always yields the same string.
+        return prem, {'candidate': ','.join(sorted(candidate_set))}
+
 
 def getSavedModel():
     #predictor = FormPredictor()