9 maanden geleden · d6c2b1e528
--- a/BiddingKG/dl/entityLink/entityLink.py
+++ b/BiddingKG/dl/entityLink/entityLink.py
@@ -179,7 +179,7 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
 
				                     if have_bus:
			
 
				                         lb, prob = get_role(dic)
			
 
				                         bus_dic[_entity.entity_text] = (lb, prob)
			
 
				-                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|银行|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
			
 
				+                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
			
 
				                             bus_tenderee.append(_entity)
			
 
				                     elif re.search('^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text): # 2024/05/22 补充某些支行没收集到工商数据
			
 
				                         have_bus = True
			
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1404,10 +1404,25 @@ def tableToText(soup, docid=None):
 
				                 in_attachment = True
			
 
				     #逆序处理嵌套表格
			
 
				     # print('len(tbodies)1', len(tbodies))
			
 
				-    for tbody_index in range(1,len(tbodies)+1):
			
 
				+    # for tbody_index in range(1,len(tbodies)+1):
			
 
				+    tbody_index = 1
			
 
				+    while tbody_index < len(tbodies)+1:
			
 
				         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
			
 
				+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
			
 
				+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
			
 
				+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况；例：526321576
			
 
				+            for row in tbody.find_all('tr'):
			
 
				+                if tbodies[tbody_index - 1][0].tbody:
			
 
				+                    tbodies[tbody_index - 1][0].tbody.append(row)
			
 
				+                else:
			
 
				+                    tbodies[tbody_index - 1][0].append(row)
			
 
				+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
			
 
				+            list_innerTable.append(inner_table)
			
 
				+            tbody_index += 2
			
 
				+            continue
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
 
				         list_innerTable.append(inner_table)
			
 
				+        tbody_index += 1
			
 
				 
			
 
				     # tbodies = soup.find_all('tbody')
			
 
				     # 遍历表格中的每个tbody
			
@@ -1420,11 +1435,25 @@ def tableToText(soup, docid=None):
 
				             if 'class' in _part.attrs and "richTextFetch" in _part['class']:
			
 
				                 in_attachment = True
			
 
				     #逆序处理嵌套表格
			
 
				-    # print('len(tbodies)2', len(tbodies))
			
 
				-    for tbody_index in range(1,len(tbodies)+1):
			
 
				+    tbody_index = 1
			
 
				+    # for tbody_index in range(1,len(tbodies)+1):
			
 
				+    while tbody_index < len(tbodies) + 1:
			
 
				         tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
			
 
				+        if len(tbody.find_all('tr')) == 1 and tbody_index > 0 and len(
			
 
				+                tbodies[tbody_index - 1][0].find_all('tr')) == 1 and len(tbody.find_all(['th', 'td'])) == len(
			
 
				+                tbodies[tbody_index - 1][0].find_all(['th', 'td'])): # 处理相邻表格都只有一行的情况；例：526321576
			
 
				+            for row in tbody.find_all('tr'):
			
 
				+                if tbodies[tbody_index - 1][0].tbody:
			
 
				+                    tbodies[tbody_index - 1][0].tbody.append(row)
			
 
				+                else:
			
 
				+                    tbodies[tbody_index - 1][0].append(row)
			
 
				+            inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
			
 
				+            list_innerTable.append(inner_table)
			
 
				+            tbody_index += 2
			
 
				+            continue
			
 
				         inner_table = trunTable(tbody,_in_attachment)
			
 
				         list_innerTable.append(inner_table)
			
 
				+        tbody_index += 1
			
 
				 
			
 
				     return soup
			
 
				     # return list_innerTable
			
@@ -2087,8 +2116,8 @@ def segment(soup,final=True):
 
				             child.insert_after("。")
			
 
				         if child.name in commaList:
			
 
				             child.insert_after("，")
			
 
				-            if child.name != "td" and re.match('[（(][一二三四五六七八九十]+[)）]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
			
 
				-                child.insert_before("。")
			
 
				+            # if child.name != "td" and re.match('[（(][一二三四五六七八九十]+[)）]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割 20240930 注销，修复529501491关键词 三、中标供应商：（一）单位名称被分句
			
 
				+            #     child.insert_before("。")
			
 
				         # if child.name == 'div' and 'class' in child.attrs:
			
 
				         #     # 添加附件"attachment"标识
			
 
				         #     if "richTextFetch" in child['class']:
			
@@ -2933,7 +2962,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
			
 
				         article_processed = re.sub('(招标|采购)人(概况|信息)：?[，。]', '采购人信息：', article_processed)  # 2022/8/10统一表达
			
 
				         article_processed = article_processed.replace('\（%）', '')    # 中标（成交）金额（元）\（%）：498888.00， 处理 江西省政府采购网  金额特殊问题
			
 
				-        article_processed = re.sub('金额：?（(可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元（）\d]{8,20})）：?', '金额：', article_processed)    # 中标（成交）金额：（可填写下浮率、折扣率或费率）：29.3万元  金额特殊问题
			
 
				+        article_processed = re.sub('金额：?（(可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元（）\d]{8,20}）)）?：?', '金额：', article_processed)    # 中标（成交）金额：（可填写下浮率、折扣率或费率）：29.3万元  金额特殊问题 例：530377517
			
 
				         article_processed = re.sub('（不?含(可抵扣增值|\w{,8})税）', '', article_processed)    # 120637247 投标报价（元），（含可抵扣增值税）：277,560.00。
			
 
				         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式：名称)', '供应商名称', article_processed)  # 18889217, 84422177
			
 
				         article_processed = re.sub('，最高有效报价者：', '，中标人名称：', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -450,7 +450,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-09-29'}
			
 
				+    version_date = {'version_date': '2024-10-17'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -464,6 +464,12 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				     win_tenderer_set = set() # 记录所有预测为中标的实体集合
			
 
				   # print(PackageList)
			
 
				     #拿到各个实体的packageName,packageCode
			
 
				+    main_contain_winner = False # 2024/10/11 判断正文是否包含中标人
			
 
				+    for entity in list_entity:
			
 
				+        if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False:
			
 
				+            main_contain_winner = True
			
 
				+            break
			
 
				+
			
 
				     for entity in list_entity:
			
 
				         if entity.entity_type in ['org','company']:
			
 
				             #限制附件里角色values[label]最大概率prob
			
@@ -477,6 +483,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
 
				             values = entity.values
			
 
				             role_prob = float(values[int(entity.label)])
			
 
				             if role_prob>=on_value and str(entity.label)!="5":
			
 
				+                if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人，不再提取附件中标人 避免 例：504046747 附件角色OCR错字变两个标段
			
 
				+                    continue
			
 
				                 if str(entity.label) in ["0","1"]:
			
 
				                     packageName = "Project"
			
 
				                 else:
			
@@ -824,11 +832,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
 
				                         scope_begin = [PackageList_scope[j]["sentence_index"],
			
 
				                                        PackageList_scope[j]["offsetWords_begin"]]
			
 
				                     else:
			
 
				-                        if j == 0:
			
 
				-                            scope_begin = [0, 0]
			
 
				-                        else:
			
 
				-                            scope_begin = [PackageList_scope[j - 1]["sentence_index"],
			
 
				-                                           PackageList_scope[j - 1]["offsetWords_begin"]]
			
 
				+                        scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头
			
 
				+                        # if j == 0:
			
 
				+                        #     scope_begin = [0, 0]
			
 
				+                        # else:
			
 
				+                        #     scope_begin = [PackageList_scope[j - 1]["sentence_index"],
			
 
				+                        #                    PackageList_scope[j - 1]["offsetWords_begin"]]
			
 
				+
			
 
				                     if j == len(PackageList_scope) - 1:
			
 
				                         scope_end = [list_sentence[-1].sentence_index,
			
 
				                                      changeIndexFromWordToWords(list_sentence[-1].tokens,
			
@@ -4246,9 +4256,9 @@ def limit_maximum_amount(dic, list_entity):
 
				                     if l["role_money"]['money_unit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
			
 
				                         # print('单位元小金额且格式类似万元的乘以万倍')
			
 
				                         l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
			
 
				-                    else:
			
 
				-                        # print('中标金额小于限额：%d元 去除' % minximum_amount)
			
 
				-                        l["role_money"]['money'] = 0
			
 
				+                    # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例：520248605
			
 
				+                    #     # print('中标金额小于限额：%d元 去除' % minximum_amount)
			
 
				+                    #     l["role_money"]['money'] = 0
			
 
				 
			
 
				             if float(value['tendereeMoney']) > maximum_amount:
			
 
				                 flag = 1
			
@@ -4269,9 +4279,9 @@ def limit_maximum_amount(dic, list_entity):
 
				                 if value['tendereeMoneyUnit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
			
 
				                     # print('单位元小金额且格式类似万元的乘以万倍')
			
 
				                     value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
			
 
				-                else:
			
 
				-                    # print('招标金额小于限额：%d元 去除' % minximum_amount)
			
 
				-                    value['tendereeMoney'] = 0
			
 
				+                # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例：520248605
			
 
				+                #     # print('招标金额小于限额：%d元 去除' % minximum_amount)
			
 
				+                #     value['tendereeMoney'] = 0
			
 
				 
			
 
				 
			
 
				 def limit_maximum_amount_backup(prem, industry):
			
@@ -4556,6 +4566,10 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
				             k = list(old_prem.keys()-set(['Project']))[0]
			
 
				             k_new = list(new_prem.keys())[0]
			
 
				             new_prem[k] = new_prem.pop(k_new)
			
 
				+        elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set(): # 如果表格提取包与非表格提取都是一个包且不同，把表格提取包名替换为非表格包名
			
 
				+            k = list(old_prem.keys()-set(['Project']))[0]
			
 
				+            k_new = list(new_prem.keys())[0]
			
 
				+            new_prem[k] = new_prem.pop(k_new)
			
 
				 
			
 
				         if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号，非表格没提取到，合并到Project
			
 
				             k = list(new_prem.keys())[0]
			
--- a/BiddingKG/dl/interface/modelFactory.py
+++ b/BiddingKG/dl/interface/modelFactory.py
@@ -105,6 +105,8 @@ class Model_role_classify_word():
 
				         text = re.sub('交易单位', '发布单位', text)
			
 
				         text = re.sub('[，：]各种数据：', '：', text) # 20240620优化 478331984 山东省交通运输厅站源提取不到 各种数据：中标单位，各种数据：济南金曰公路工程有限公司，
			
 
				         text = re.sub('电子签章', '', text) # 20240924 修复 529923459 电子签名：投标人名称（电子签章：西君兰信息科技有限公司，2024年9月7日 预测为中标
			
 
				+        text = re.sub('采购方式', 'xxxx', text) # 修复 499096797 招标人预测错误
			
 
				+        text = re.sub('中标人\d名称', '中标人名称', text) # 修复 499096797 中标人预测错误
			
 
				         return text.replace('(', '（').replace(')', '）').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
			
 
				 
			
 
				     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -497,6 +497,8 @@ class CodeNamePredict():
 
				                             continue
			
 
				                         elif '公司：你单位在' in _name: # 避免类似 339900030 这种作为项目名称，导致中标角色作为招标角色
			
 
				                             continue
			
 
				+                        elif _name.endswith('公司') and len(_name)<20: # 修复 456957250 雄县辉茂纸塑包装制品销售有限公司 作为项目名称
			
 
				+                            continue
			
 
				 
			
 
				                         #add name to entitys
			
 
				                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
			
@@ -856,7 +858,7 @@ class PREMPredict():
 
				                 elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位)：$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
			
 
				                     label = 5
			
 
				                     values[2] = 0.5
			
 
				-                elif re.search('(排名|排序|名次)：([4-9]|\d{2,})，', front) or re.search('序号：\d+，(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
			
 
				+                elif re.search('(排名|排序|名次)：([4-9]|\d{2,})，', front): # or re.search('序号：\d+，(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
			
 
				                     values[2] = 0.5
			
 
				                     label = 5
			
 
				                 elif re.search('税费', front) and re.search('^承担', behind):
			
@@ -1477,7 +1479,7 @@ class RoleRulePredictor():
 
				 
			
 
				         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
			
 
				         
			
 
				-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源，?[为：]+\w{2,4}资金|采购成本价|总费用约?为")  # |建安费用 不作为招标金额
			
 
				+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源，?[为：]+\w{2,4}资金|采购成本价|总费用约?为|招标规模")  # |建安费用 不作为招标金额
			
 
				         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[）\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬（含税）：|经评审的价格")  # 单写 总价 不能作为中标金额，很多表格有单价、总价
			
 
				         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
			
 
				         self.pattern_money_other = re.compile("代理费|服务费")
			
@@ -2391,6 +2393,8 @@ class RoleGrade():
 
				         for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表，改为非角色
			
 
				             if entity.entity_text in all_tenderee_agency:
			
 
				                 entity.label = 5
			
 
				+            elif entity.in_attachment: # 附件低概率中标角色不要 避免：516109391 桂林银行崇左宁明支行，宁明县城中镇兴宁大道中70号，预测为中标
			
 
				+                entity.label = 5
			
 
				 
			
 
				         if org_winner != []:
			
 
				             flag = 0
			
@@ -2432,7 +2436,7 @@ class MoneyGrade():
 
				                     if ser:
			
 
				                         groupdict = pattern.split('>')[0].replace('(?P<', '')
			
 
				                         _role, _direct, _prob = groupdict.split('_')
			
 
				-                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100:
			
 
				+                        if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
			
 
				                             _prob = 6
			
 
				                         _label = role2id.get(_role)
			
 
				                         if _label != entity.label:
			
@@ -2455,6 +2459,8 @@ class MoneyGrade():
 
				                     # _prob = min_prob - 0.1 if in_att else min_prob
			
 
				                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
			
 
				                     # print('找不到规则修改金额概率：', entity.entity_text, entity.label, entity.values)
			
 
				+            if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
			
 
				+                entity.label = 2
			
 
				 
			
 
				 
			
 
				 # 时间类别
			
@@ -6022,7 +6028,7 @@ class DistrictPredictor():
 
				             return province_l, city_l, district_l
			
 
				 
			
 
				         def get_pro_city_dis_score(text, text_weight=1):
			
 
				-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区', ' ', text)
			
 
				+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河', ' ', text)
			
 
				             text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
			
 
				             text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域：怒江州 识别为广西 - 崇左 - 江州
			
 
				             text = re.sub('茂名滨海新区', '茂名市', text)
			
@@ -6121,13 +6127,13 @@ class DistrictPredictor():
 
				                         pro_idx = idx_dic[idx]['省']
			
 
				                         if pro_idx in pro_ids:
			
 
				                             pro_ids[pro_idx] += (score + 0) * w * weight
			
 
				-                        else:
			
 
				-                            pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
			
 
				+                        # else: # 20241015 注销 区县简称且不在提取的省市下面，不加分，避免提取错误 例：536550843
			
 
				+                        #     pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
			
 
				                         city_idx = idx_dic[idx]['市']
			
 
				                         if city_idx in city_ids:
			
 
				                             city_ids[city_idx] += (score + 0) * w * weight
			
 
				-                        else:
			
 
				-                            city_ids[city_idx] = (score + 0) * w * weight * 0.1
			
 
				+                        # else: # 20241015 注销 区县简称且不在提取的省市下面，不加分，避免提取错误 例：536550843
			
 
				+                        #     city_ids[city_idx] = (score + 0) * w * weight * 0.1
			
 
				 
			
 
				             for k, v in pro_ids.items():
			
 
				                 pro_ids[k] = v * text_weight
			
@@ -6995,7 +7001,7 @@ class CandidateExtractor(object):
 
				         header_dic = dict()
			
 
				         flag = False
			
 
				         contain_header = False
			
 
				-        if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
			
 
				+        if len(set(fix_td_list) & self.headerset)>=2 and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
			
 
				             flag = True
			
 
				             for i in range(len(td_list)) :
			
 
				                 text = td_list[i]
			
@@ -8080,7 +8086,7 @@ if __name__=="__main__":
 
				     rs = tb_extract.predict(html, [
			
 
				         "江苏中联铸本混凝土有限公司",
			
 
				         "鼓楼区协荣机械设备经销部"
			
 
				-    ], web_source_name = '', all_winner=True)
			
 
				+    ], web_source_name = '', all_winner=False)
			
 
				     print('标段数：',len(rs[0]))
			
 
				     print(rs)