Просмотр исходного кода

优化角色金额的提取;特殊数据源预处理调整;

lsm 2 года назад
Родитель
Коммит
0c98eb5124

+ 26 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1741,7 +1741,7 @@ def get_preprocessed(articles, useselffool=False):
 def special_treatment(sourceContent, web_source_no):
     try:
         if web_source_no == 'DX000202-1':
-             ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
+             ser = re.search('中标供应商及中标金额:【(([\w()]{5,20}-[\d,.]+,)+)】', sourceContent)
              if ser:
                  new = ""
                  l = ser.group(1).split(',')
@@ -1864,6 +1864,28 @@ def special_treatment(sourceContent, web_source_no):
         elif web_source_no=='00811-8':
             if re.search('是否中标:是', sourceContent) and re.search('排名:\d,', sourceContent):
                 sourceContent = re.sub('排名:\d,', '候选', sourceContent)
+        elif web_source_no == "00049-3":
+            if re.search('主要标的单价\s+合同金额', sourceContent.get_text()):
+                header = []
+                attrs = []
+                flag = 0
+                tag = None
+                for p in sourceContent.find_all('p'):
+                    text = p.get_text()
+                    if re.search('主要标的数量\s+主要标的单价\s+合同金额', text):
+                        header = text.split()
+                        flag = 1
+                        tag = p
+                        continue
+                    if flag:
+                        attrs = text.split()
+                        p.extract()
+                        break
+                if header and len(header) == len(attrs) and tag:
+                    s = ""
+                    for head, attr in zip(header, attrs):
+                        s += head + ':' + attr + ','
+                    tag.string = s
         return sourceContent
     except Exception as e:
         log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
@@ -2142,7 +2164,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         del_tabel_achievement(article_processed)
 
         '''特别数据源对 BeautifulSoup(html) 做特别修改'''
-        if web_source_no in ["00753-14","DX008357-11","18021-2"]:
+        if web_source_no in ["00753-14","DX008357-11","18021-2", "00049-3"]:
             article_processed = special_treatment(article_processed, web_source_no)
         for _soup in article_processed.descendants:
             # 识别无标签文本,添加<span>标签
@@ -2638,6 +2660,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 notSure = True
                             if k.split("_")[0]=="money":
                                 entity_text = v
+                                if entity_text.endswith(',00'): # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
+                                    entity_text = entity_text[:-3]
                             if k.split("_")[0]=="unit":
                                 if v=='万元' or unit=="":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
                                     unit = v

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -254,7 +254,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2022-12-13'}
+    version_date = {'version_date': '2022-12-23'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 98 - 29
BiddingKG/dl/interface/predictor.py

@@ -590,7 +590,11 @@ class PREMPredict():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
-                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
+                            # text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
+                            text_sen = sentence.sentence_text
+                            b = entity.wordOffset_begin
+                            e = entity.wordOffset_end
+                            text_list.append((text_sen[max(0, b-13):b], text_sen[b:e], text_sen[e:e+10]))
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                             item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                             data_x.append(item_x)
@@ -630,7 +634,7 @@ class PREMPredict():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
-                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
+                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 13):entity.wordOffset_begin])
                             #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                             #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                             item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
@@ -680,33 +684,45 @@ class PREMPredict():
             entity = points_entitys[i]
             label = np.argmax(predict_y[i])
             values = predict_y[i]
-            text = text_list[i]
+            # text = text_list[i]
+            text_tup = text_list[i]
+            front, middle, behind = text_tup
+            whole = "".join(text_tup)
             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
                 label = 5
-            elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', text):
+            elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
                 label = 5
             elif label == 2:
-                if re.search('中标单位和.{,25}签订合同', text):
+                if re.search('中标单位和.{,25}签订合同', whole):
                     label = 0
                     values[label] = 0.501
-                elif re.search('尊敬的供应商:.{,25}我公司', text):
+                elif re.search('尊敬的供应商:.{,25}我公司', whole):
                     label = 0
                     values[label] = 0.801
-                elif re.search('尊敬的供应商:', text):
+                elif re.search('尊敬的供应商:$', front):
                     label = 0
                     values[label] = 0.501
-                elif re.search('[^\w]中标候选人', text[:15]) and re.search('[1一]', text[:15]) == None:  #修复第4以上的预测错为中标人
+                elif re.search('第[4-9四五六]中标候选人', front):  #修复第4以上的预测错为中标人
                     label = 5
                     values[label] = 0.5
-            elif re.search('是否中标:是,供应商', text) and label == 5:
+            elif re.search('是否中标:是,供应商', front) and label == 5:
                 label = 2
                 values[label] = 0.9
-            elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
+            elif label == 1 and re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
                 label = 0
                 values[label] = 0.501
-            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
+            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', front):
                 label = 2
                 values[label] = 0.501
+            elif label in [3,4] and re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
+                label = 2
+                values[label] = 0.7
+            elif label == 3 and re.search('决定选择第二名', front) and re.search('^作为(中标|成交)(人|供应商|单位|公司)', behind):
+                label = 2
+                values[label] = 0.8
+            elif re.search('(中标|成交)通知书[,:]$', front) and re.search('^:', behind) and label != 2:
+                label = 2
+                values[label] = 0.8
             entity.set_Role(label, values)
 
     def predict_money(self,list_sentences,list_entitys):
@@ -742,16 +758,20 @@ class PREMPredict():
             label = np.argmax(predict_y[i])
             values = predict_y[i]
             text = text_list[i]
+            # print('金额: ', entity.entity_text, label, values, text)
             if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
                 label = 2
-            elif label == 1 and re.search('[::,。](总金额|总价|单价)', text):
+            elif label == 1 and re.search('[::,。](总金额|总价|单价):?$', text) and re.search('(中标|投标|成交|中价)', text)==None:
                 values[label] = 0.49
             elif label ==0 and entity.notes in ["投资", "工程造价"]:
                 values[label] = 0.49
-            elif label == 0 and re.search('最低限价', text):
+            elif label == 0 and re.search('最低限价:?$', text):
                 values[label] = 0.49
             elif re.search('金额在$', text):
                 values[label] = 0.49
+            elif re.search('报价:预估不?含税总价[为:]$', text) and (label != 1 or values[label]<0.5):
+                label = 1
+                values[label] = 0.8
             entity.set_Money(label, values)
 
     def correct_money_by_rule(self, title, list_entitys, list_articles):
@@ -1176,7 +1196,7 @@ class RoleRulePredictor():
         self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
-        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
+        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
                                         "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名?称?[))]))"
         self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
@@ -1210,7 +1230,7 @@ class RoleRulePredictor():
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
         self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
-        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
+        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价|报酬(含税):")
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
@@ -4631,7 +4651,7 @@ class TablePremExtractor(object):
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
+            "bid_amount": "投标[报总]价|(中标|成交))?([金总]?额|[报均总]价|价[格款]?)|承包价",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -4643,7 +4663,7 @@ class TablePremExtractor(object):
     def find_header(self, td_list):
         header_dic = dict()
         flag = False
-        if len(set(td_list))>2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
+        if len(set(td_list))>=2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
             flag = True
             for i in range(len(td_list)) :
                 text = td_list[i]
@@ -4674,8 +4694,8 @@ class TablePremExtractor(object):
                         if re.search('^金额(万?元)$', text):
                             header_dic['budget'] = (i, text)
                             break
-            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
-                    'budget' in header_dic or 'tenderer' in header_dic):
+            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
+                    'budget' in header_dic or 'bid_amount' in header_dic):
                 return flag, header_dic
         return flag, dict()
 
@@ -4692,6 +4712,20 @@ class TablePremExtractor(object):
                 return True
         return False
 
+    def get_role(self, text):  # Return the first org/company entity recognised in `text`, or '' when the text is rejected.
+        if len(text) > 25 or len(text)<4:  # only texts of 4 to 25 characters are examined; anything else yields ''
+            return ''
+        ners = getNers([text], useselffool=True)  # run NER on the single text; presumably one result list per input text (verify against getNers)
+        roles = []
+        if ners:
+            for ner in ners[0]:
+                if ner[2] in ['org', 'company']:  # keep only items whose ner[2] field is 'org' or 'company'
+                    roles.append(ner[3])  # presumably ner[3] is the entity's surface text; TODO confirm against getNers
+        if roles and len(''.join(roles)) > len(text)*0.8:  # combined length of the kept ner[3] values must exceed 80% of len(text)
+            return roles[0]  # only the first kept entity is returned, even when several were collected
+        else:
+            return ''
+
     def extract_from_df(self, df, headers):
         prem_dic = {}
         previous_package = ""  # 上一行包号
@@ -4735,8 +4769,11 @@ class TablePremExtractor(object):
             if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
                 tenderer = ""
 
-            tenderee = tenderee if self.is_role(tenderee) else ""
-            tenderer = tenderer if self.is_role(tenderer) else ""
+            # tenderee = tenderee if self.is_role(tenderee) else ""
+            # tenderer = tenderer if self.is_role(tenderer) else ""
+
+            tenderee = self.get_role(tenderee)
+            tenderer = self.get_role(tenderer)
 
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                 break
@@ -4833,9 +4870,10 @@ class TablePremExtractor(object):
         rs_dic = {}
         for table in tables:
             trs = self.tb.table2list(table)
-            table.extract()
+            # table.extract()
             i = 0
             headers = ""
+            table_prem = {}
             while i < len(trs) - 1:
                 flag_, headers_ = self.find_header(trs[i])
                 if flag_ and headers_ != dict():
@@ -4854,9 +4892,21 @@ class TablePremExtractor(object):
                     if len(table_items) > 0:
                         df = pd.DataFrame(table_items)
                         prem_ = self.extract_from_df(df, headers)
-                        rs_dic.update(prem_)
+                        # rs_dic.update(prem_)
+                        table_prem.update(prem_)
                     i = j - 1
                 i += 1
+            if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
+                sib = table.find_previous_sibling()
+                sib_text = sib.get_text()
+                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
+                if sib.name in ['p', 'div'] and len(sib_text)<30 and ser_sib:
+                    package_sib = ser_sib.group(0)
+                    package_sib = uniform_package_name(package_sib)
+                    table_prem[package_sib] = table_prem.pop('1')
+            if table_prem:
+                rs_dic.update(table_prem)
+            table.extract()
         return rs_dic
 
     def predict(self, html):
@@ -4881,7 +4931,7 @@ class CandidateExtractor(object):
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否中标|是否入围|是否入库|入围结论',
             "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
-            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
+            "bid_amount": "投标[报总]价|(中标|成交))?([金总]额|[报均总]价|价[格款])|承包价",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
             "third_tenderer": "第三名|第三(中标|成交)?候选人",
@@ -4931,6 +4981,20 @@ class CandidateExtractor(object):
                 return True
         return False
 
+    def get_role(self, text):  # Return the first org/company entity recognised in `text`, or '' when the text is rejected.
+        if len(text) > 25 or len(text)<4:  # only texts of 4 to 25 characters are examined; anything else yields ''
+            return ''
+        ners = getNers([text], useselffool=True)  # run NER on the single text; presumably one result list per input text (verify against getNers)
+        roles = []
+        if ners:
+            for ner in ners[0]:
+                if ner[2] in ['org', 'company']:  # keep only items whose ner[2] field is 'org' or 'company'
+                    roles.append(ner[3])  # presumably ner[3] is the entity's surface text; TODO confirm against getNers
+        if roles and len(''.join(roles)) > len(text)*0.8:  # combined length of the kept ner[3] values must exceed 80% of len(text)
+            return roles[0]  # only the first kept entity is returned, even when several were collected
+        else:
+            return ''
+
     def money_process(self, money_text, header):
         '''
         输入金额文本及金额列表头,返回统一数字化金额及金额单位
@@ -4982,17 +5046,18 @@ class CandidateExtractor(object):
 
             package_code = package_code_raw
 
-            candidate = candidate_ if self.is_role(candidate_) else ""
+            # candidate = candidate_ if self.is_role(candidate_) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
+            candidate = self.get_role(candidate_)
 
             # if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
             #     break
-            if(candidate,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
+            if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
                 continue
             link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_))
             package = package_code
             package = uniform_package_name(package) if package !="" else "Project"
-            if candidate_:
+            if candidate:
                 if win_or_not and re.search('否|未入围', win_or_not):
                     pass
                 else:
@@ -5002,7 +5067,9 @@ class CandidateExtractor(object):
                 if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
-                        if self.is_role(text):
+                        text = self.get_role(text)
+                        if text:
+                        # if self.is_role(text):
                             if type not in role_dic:
                                 role_dic[type] = dict()
                             role_dic[type]['role_text'] = text
@@ -5134,7 +5201,9 @@ class CandidateExtractor(object):
                 text = sentences[sen_index].sentence_text
                 b = ent.wordOffset_begin
                 e = ent.wordOffset_end
-                if isinstance(b, int) and isinstance(e, int):
+                if ent.label in [2,3,4]: # 直接加实体预测的候选人, 否则规则检查是否为候选人
+                    candidates.add(ent.entity_text)
+                elif isinstance(b, int) and isinstance(e, int):
                     foreword = text[max(0, b - 10):b]
                     if re.search(self.p, foreword):
                         candidates.add(ent.entity_text)