Browse Source

修复:同一金额同时以万元和元两种单位出现且数值超过5000时,将万元金额修正为对应的元金额;去除金额前的固定特殊字符

lsm 2 years ago
parent
commit
cc93c9889a

+ 1 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -2164,6 +2164,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('招标(建设)单位', '招标单位')  #2022/8/10 修正预测不到表达
         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
+        article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -254,7 +254,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2022-11-24'}
+    version_date = {'version_date': '2022-12-08'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 23 - 7
BiddingKG/dl/interface/getAttributes.py

@@ -1125,13 +1125,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     p_entity = 0
 
     # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
-    money_list = [it for it in list_entity if it.entity_type=="money"]
-    for i in range(len(money_list)-1):
-        for j in range(1, len(money_list)):
-            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
-                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
-                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
-                # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
+    # money_list = [it for it in list_entity if it.entity_type=="money"]
+    # for i in range(len(money_list)-1):
+    #     for j in range(1, len(money_list)):
+    #         if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
+    #                 Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
+    #             money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
+    #             # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
+
+    '''同样金额同时有元及万元单位的,把万元的金额改为元'''
+    wanyuan = []
+    yuan = []
+    for it in list_entity:
+        if it.entity_type == "money" and float(it.entity_text)>5000:
+            if it.money_unit == '万元':
+                wanyuan.append(it)
+            else:
+                yuan.append(it)
+    if wanyuan != [] and yuan != []:
+        for m1 in wanyuan:
+            for m2 in yuan:
+                if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000:
+                    m1.entity_text = m2.entity_text
+
     
     #遍历所有实体
     # while(p_entity<len(list_entity)):

+ 3 - 1
BiddingKG/dl/interface/predictor.py

@@ -1493,7 +1493,7 @@ class RoleRuleFinalAdd():
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
                 if ent.label == 0 and ent.values[ent.label]>=0.5:
-                    if '公共资源交易中心' in ent.entity_text:
+                    if '公共资源交易中心' in ent.entity_text:  # 公共资源交易中心不算招标或代理,只算平台
                         ent.label = 5
                         continue
                     tenderee_list.append(ent.entity_text)
@@ -4474,6 +4474,7 @@ class DistrictPredictor():
         msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
         rs['district']['province'], rs['district']['city'], rs['district']['district'])
         # self.f.write('%s %s \n' % (list_articles[0].id, msc))
+        # print('地区匹配:', msc)
         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
             msc = ""
             all_addr, tenderees = get_all_addr(list_entitys)
@@ -4492,6 +4493,7 @@ class DistrictPredictor():
             msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
                 rs['district']['province'],rs['district']['city'],rs['district']['district'])
         # self.f.write('%s %s \n'%(list_articles[0].id, msc))
+        # print('地区匹配:', msc)
         return rs
 
 class TableTag2List():