Bladeren bron

优化角色、金额、链接、多包、预处理等

lsm 7 maanden geleden
bovenliggende
commit
d6c2b1e528

+ 1 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -179,7 +179,7 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                     if have_bus:
                         lb, prob = get_role(dic)
                         bus_dic[_entity.entity_text] = (lb, prob)
-                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
+                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
                             bus_tenderee.append(_entity)
                     elif re.search('^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text): # 2024/05/22 补充某些支行没收集到工商数据
                         have_bus = True

+ 35 - 6
BiddingKG/dl/interface/Preprocessing.py

@@ -1404,10 +1404,25 @@ def tableToText(soup, docid=None):
                 in_attachment = True
     #逆序处理嵌套表格
     # print('len(tbodies)1', len(tbodies))
-    for tbody_index in range(1,len(tbodies)+1):
+    # for tbody_index in range(1,len(tbodies)+1):
+    tbody_index = 1
+    while tbody_index < len(tbodies)+1:
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况;例:526321576
+            for row in tbody.find_all('tr'):
+                if tbodies[tbody_index - 1][0].tbody:
+                    tbodies[tbody_index - 1][0].tbody.append(row)
+                else:
+                    tbodies[tbody_index - 1][0].append(row)
+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
+            list_innerTable.append(inner_table)
+            tbody_index += 2
+            continue
         inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
+        tbody_index += 1
 
     # tbodies = soup.find_all('tbody')
     # 遍历表格中的每个tbody
@@ -1420,11 +1435,25 @@ def tableToText(soup, docid=None):
             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
                 in_attachment = True
     #逆序处理嵌套表格
-    # print('len(tbodies)2', len(tbodies))
-    for tbody_index in range(1,len(tbodies)+1):
+    tbody_index = 1
+    # for tbody_index in range(1,len(tbodies)+1):
+    while tbody_index < len(tbodies) + 1:
         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况;例:526321576
+            for row in tbody.find_all('tr'):
+                if tbodies[tbody_index - 1][0].tbody:
+                    tbodies[tbody_index - 1][0].tbody.append(row)
+                else:
+                    tbodies[tbody_index - 1][0].append(row)
+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
+            list_innerTable.append(inner_table)
+            tbody_index += 2
+            continue
         inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
+        tbody_index += 1
 
     return soup
     # return list_innerTable
@@ -2087,8 +2116,8 @@ def segment(soup,final=True):
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
-            if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
-                child.insert_before("。")
+            # if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割 20240930 注销,修复529501491关键词 三、中标供应商:(一)单位名称被分句
+            #     child.insert_before("。")
         # if child.name == 'div' and 'class' in child.attrs:
         #     # 添加附件"attachment"标识
         #     if "richTextFetch" in child['class']:
@@ -2933,7 +2962,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
-        article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
+        article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20}))?:?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题 例:530377517
         article_processed = re.sub('(不?含(可抵扣增值|\w{,8})税)', '', article_processed)    # 120637247 投标报价(元),(含可抵扣增值税):277,560.00。
         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -450,7 +450,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-09-29'}
+    version_date = {'version_date': '2024-10-17'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 25 - 11
BiddingKG/dl/interface/getAttributes.py

@@ -464,6 +464,12 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
     win_tenderer_set = set() # 记录所有预测为中标的实体集合
   # print(PackageList)
     #拿到各个实体的packageName,packageCode
+    main_contain_winner = False # 2024/10/11 判断正文是否包含中标人
+    for entity in list_entity:
+        if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False:
+            main_contain_winner = True
+            break
+
     for entity in list_entity:
         if entity.entity_type in ['org','company']:
             #限制附件里角色values[label]最大概率prob
@@ -477,6 +483,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
             values = entity.values
             role_prob = float(values[int(entity.label)])
             if role_prob>=on_value and str(entity.label)!="5":
+                if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人,不再提取附件中标人 避免 例:504046747 附件角色OCR错字变两个标段
+                    continue
                 if str(entity.label) in ["0","1"]:
                     packageName = "Project"
                 else:
@@ -824,11 +832,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
                         scope_begin = [PackageList_scope[j]["sentence_index"],
                                        PackageList_scope[j]["offsetWords_begin"]]
                     else:
-                        if j == 0:
-                            scope_begin = [0, 0]
-                        else:
-                            scope_begin = [PackageList_scope[j - 1]["sentence_index"],
-                                           PackageList_scope[j - 1]["offsetWords_begin"]]
+                        scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头
+                        # if j == 0:
+                        #     scope_begin = [0, 0]
+                        # else:
+                        #     scope_begin = [PackageList_scope[j - 1]["sentence_index"],
+                        #                    PackageList_scope[j - 1]["offsetWords_begin"]]
+
                     if j == len(PackageList_scope) - 1:
                         scope_end = [list_sentence[-1].sentence_index,
                                      changeIndexFromWordToWords(list_sentence[-1].tokens,
@@ -4246,9 +4256,9 @@ def limit_maximum_amount(dic, list_entity):
                     if l["role_money"]['money_unit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
                         # print('单位元小金额且格式类似万元的乘以万倍')
                         l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
-                    else:
-                        # print('中标金额小于限额:%d元 去除' % minximum_amount)
-                        l["role_money"]['money'] = 0
+                    # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
+                    #     # print('中标金额小于限额:%d元 去除' % minximum_amount)
+                    #     l["role_money"]['money'] = 0
 
             if float(value['tendereeMoney']) > maximum_amount:
                 flag = 1
@@ -4269,9 +4279,9 @@ def limit_maximum_amount(dic, list_entity):
                 if value['tendereeMoneyUnit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
                     # print('单位元小金额且格式类似万元的乘以万倍')
                     value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
-                else:
-                    # print('招标金额小于限额:%d元 去除' % minximum_amount)
-                    value['tendereeMoney'] = 0
+                # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
+                #     # print('招标金额小于限额:%d元 去除' % minximum_amount)
+                #     value['tendereeMoney'] = 0
 
 
 def limit_maximum_amount_backup(prem, industry):
@@ -4556,6 +4566,10 @@ def update_prem(old_prem, new_prem, in_attachment=False):
             k = list(old_prem.keys()-set(['Project']))[0]
             k_new = list(new_prem.keys())[0]
             new_prem[k] = new_prem.pop(k_new)
+        elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set(): # 如果表格提取包与非表格提取都是一个包且不同,把表格提取包名替换为非表格包名
+            k = list(old_prem.keys()-set(['Project']))[0]
+            k_new = list(new_prem.keys())[0]
+            new_prem[k] = new_prem.pop(k_new)
 
         if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号,非表格没提取到,合并到Project
             k = list(new_prem.keys())[0]

+ 2 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -105,6 +105,8 @@ class Model_role_classify_word():
         text = re.sub('交易单位', '发布单位', text)
         text = re.sub('[,:]各种数据:', ':', text) # 20240620优化 478331984 山东省交通运输厅站源提取不到 各种数据:中标单位,各种数据:济南金曰公路工程有限公司,
         text = re.sub('电子签章', '', text) # 20240924 修复 529923459 电子签名:投标人名称(电子签章:西君兰信息科技有限公司,2024年9月7日 预测为中标
+        text = re.sub('采购方式', 'xxxx', text) # 修复 499096797 招标人预测错误
+        text = re.sub('中标人\d名称', '中标人名称', text) # 修复 499096797 中标人预测错误
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):

+ 16 - 10
BiddingKG/dl/interface/predictor.py

@@ -497,6 +497,8 @@ class CodeNamePredict():
                             continue
                         elif '公司:你单位在' in _name: # 避免类似 339900030 这种作为项目名称,导致中标角色作为招标角色
                             continue
+                        elif _name.endswith('公司') and len(_name)<20: # 修复 456957250 雄县辉茂纸塑包装制品销售有限公司 作为项目名称
+                            continue
 
                         #add name to entitys
                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
@@ -856,7 +858,7 @@ class PREMPredict():
                 elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
                     label = 5
                     values[2] = 0.5
-                elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
+                elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front): # or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
                 elif re.search('税费', front) and re.search('^承担', behind):
@@ -1477,7 +1479,7 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为")  # |建安费用 不作为招标金额
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|招标规模")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
         self.pattern_money_other = re.compile("代理费|服务费")
@@ -2391,6 +2393,8 @@ class RoleGrade():
         for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表,改为非角色
             if entity.entity_text in all_tenderee_agency:
                 entity.label = 5
+            elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标
+                entity.label = 5
 
         if org_winner != []:
             flag = 0
@@ -2432,7 +2436,7 @@ class MoneyGrade():
                     if ser:
                         groupdict = pattern.split('>')[0].replace('(?P<', '')
                         _role, _direct, _prob = groupdict.split('_')
-                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100:
+                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
                             _prob = 6
                         _label = role2id.get(_role)
                         if _label != entity.label:
@@ -2455,6 +2459,8 @@ class MoneyGrade():
                     # _prob = min_prob - 0.1 if in_att else min_prob
                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
                     # print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
+            if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
+                entity.label = 2
 
 
 # 时间类别
@@ -6022,7 +6028,7 @@ class DistrictPredictor():
             return province_l, city_l, district_l
 
         def get_pro_city_dis_score(text, text_weight=1):
-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区', ' ', text)
+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河', ' ', text)
             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
             text = re.sub('茂名滨海新区', '茂名市', text)
@@ -6121,13 +6127,13 @@ class DistrictPredictor():
                         pro_idx = idx_dic[idx]['省']
                         if pro_idx in pro_ids:
                             pro_ids[pro_idx] += (score + 0) * w * weight
-                        else:
-                            pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
+                        # else: # 20241015 注销 区县简称且不在提取的省市下面,不加分,避免提取错误 例:536550843
+                        #     pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
                         city_idx = idx_dic[idx]['市']
                         if city_idx in city_ids:
                             city_ids[city_idx] += (score + 0) * w * weight
-                        else:
-                            city_ids[city_idx] = (score + 0) * w * weight * 0.1
+                        # else: # 20241015 注销 区县简称且不在提取的省市下面,不加分,避免提取错误 例:536550843
+                        #     city_ids[city_idx] = (score + 0) * w * weight * 0.1
 
             for k, v in pro_ids.items():
                 pro_ids[k] = v * text_weight
@@ -6995,7 +7001,7 @@ class CandidateExtractor(object):
         header_dic = dict()
         flag = False
         contain_header = False
-        if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
+        if len(set(fix_td_list) & self.headerset)>=2 and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
             flag = True
             for i in range(len(td_list)) :
                 text = td_list[i]
@@ -8080,7 +8086,7 @@ if __name__=="__main__":
     rs = tb_extract.predict(html, [
         "江苏中联铸本混凝土有限公司",
         "鼓楼区协荣机械设备经销部"
-    ], web_source_name = '', all_winner=True)
+    ], web_source_name = '', all_winner=False)
     print('标段数:',len(rs[0]))
     print(rs)