Преглед на файлове

调整县区权重;增加负值金额符号

lsm преди 2 години
родител
ревизия
fad01331e9
променени са 2 файла, в които са добавени 30 реда и са изтрити 18 реда
  1. 7 3
      BiddingKG/dl/interface/Preprocessing.py
  2. 23 15
      BiddingKG/dl/interface/predictor.py

+ 7 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -2537,9 +2537,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
+                                  "behind_m":"(()()(?P<money_behind_m>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
@@ -2693,6 +2693,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             break
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
+                    symbol = '-' if entity_text.startswith('-') else ''  # 负值金额前面保留负号
+
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
                     # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
                     if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
@@ -2754,6 +2756,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             _exists = True
                     if not _exists:
                         if float(entity_text)>1:
+                            if symbol == '-': # 负值金额保留负号
+                                entity_text = '-'+entity_text
                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额单位

+ 23 - 15
BiddingKG/dl/interface/predictor.py

@@ -4266,6 +4266,7 @@ class DistrictPredictor():
             self.full_name = full_name
             self.short2id = short2id
             self.full2id = full2id
+        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
 
     def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
         '''
@@ -4330,7 +4331,7 @@ class DistrictPredictor():
                                     if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
                                         type_w = 2
                                     else:
-                                        type_w = 1
+                                        type_w = 0.5
                                 id_set.add(_id)
                                 score2 += w * type_w
                             score_l.append([_id, score * w + score2] + area)
@@ -4409,17 +4410,14 @@ class DistrictPredictor():
 
         def get_all_addr(list_entitys):
             tenderee_l = []
-            other_roles = []
             addr_l = []
             for ent in list_entitys[0]:
-                if ent.entity_type == 'location':
+                if ent.entity_type == 'location' and len(ent.entity_text)>2:
                     addr_l.append(ent.entity_text)
                 elif ent.entity_type in ['org', 'company']:
-                    if ent.label == 0:
+                    if ent.label in [0, 1]:  # 加招标或代理
                         tenderee_l.append(ent.entity_text)
-                    else:
-                        other_roles.append(ent.entity_text)
-            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
+            return ' '.join(addr_l), ' '.join(tenderee_l)
 
         def get_title_addr(text):
             p1 = '(\w{2,8}[省市州区县][^\w]*)+'
@@ -4436,21 +4434,26 @@ class DistrictPredictor():
             content = list_articles[0].content
 
         tenderee, tenderee_address = get_ree_addr(prem)
+        msc = ""
         pro_addr = get_project_addr(content)
         if pro_addr != "":
+            msc += '使用规则提取的项目地址;'
             tenderee_address = pro_addr
         else:
             role_addr = get_role_address(content)
             if role_addr != "":
+                msc += '使用规则提取的联系人地址;'
                 tenderee_address = role_addr
 
         if tenderee_address == "":
             title_addr = get_title_addr(title)
             if title_addr != "":
+                msc += '使用规则提取的标题地址;'
                 tenderee_address = title_addr
             else:
                 bid_addr = get_bid_addr(content)
                 if bid_addr != "":
+                    msc += '使用规则提取的开标地址;'
                     tenderee_address = bid_addr
 
         project_name = str(project_name)
@@ -4466,24 +4469,29 @@ class DistrictPredictor():
         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
         # print('text1:', text1)
+        msc += '## 第一次预测输入:%s ##;'%text1
         rs = get_area(text1, web_source_name)
-
+        msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
+        rs['district']['province'], rs['district']['city'], rs['district']['district'])
+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
-            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
-            if tenderees != "":
-                text2 = tenderees + " " + all_addr
-                # print('所有地址:', all_addr)
-            else:
-                text2 = other_roles + " " + all_addr
-                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
+            msc = ""
+            all_addr, tenderees = get_all_addr(list_entitys)
+            text2 = tenderees + " " + all_addr + ' ' + title
+            msc += '使用实体列表所有招标人+所有地址;'
+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
             # print('text2:', text2)
+            msc += '## 第二次预测输入:%s ##'%text2
             rs2 = get_area(text2, web_source_name, not_in_content=False)
             rs2['district']['is_in_text'] = True
             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
                 rs = rs2
             elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
                 rs = rs2
+            msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
+                rs['district']['province'],rs['district']['city'],rs['district']['district'])
+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
         return rs
 
 class TableTag2List():