преди 2 години · fad01331e9
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2537,9 +2537,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				             #                       "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千万亿元]*)())",
			
 
				             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,，]*(?:\.\d+)?(?:，?)[百千万亿]*)[\(（]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\)）]?)"}
			
 
				             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				-                                  "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				-                                  "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)())",
			
 
				-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				+                                  "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>-?[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
			
 
				+                                  "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-?[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)())",
			
 
				+                                  "behind_m":"(()()(?P<money_behind_m>-?[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				             # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。
			
 
				 
			
 
				             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
			
@@ -2693,6 +2693,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                             break
			
 
				                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
			
 
				 
			
 
				+                    symbol = '-' if entity_text.startswith('-') else ''  # 负值金额前面保留负号
			
 
				+
			
 
				                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
			
 
				                     # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
			
 
				                     if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
			
@@ -2754,6 +2756,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
				                             _exists = True
			
 
				                     if not _exists:
			
 
				                         if float(entity_text)>1:
			
 
				+                            if symbol == '-': # 负值金额保留负号
			
 
				+                                entity_text = '-'+entity_text
			
 
				                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
			
 
				                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
			
 
				                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -4266,6 +4266,7 @@ class DistrictPredictor():
 
				             self.full_name = full_name
			
 
				             self.short2id = short2id
			
 
				             self.full2id = full2id
			
 
				+        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
			
 
				 
			
 
				     def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				         '''
			
@@ -4330,7 +4331,7 @@ class DistrictPredictor():
 
				                                     if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
			
 
				                                         type_w = 2
			
 
				                                     else:
			
 
				-                                        type_w = 1
			
 
				+                                        type_w = 0.5
			
 
				                                 id_set.add(_id)
			
 
				                                 score2 += w * type_w
			
 
				                             score_l.append([_id, score * w + score2] + area)
			
@@ -4409,17 +4410,14 @@ class DistrictPredictor():
 
				 
			
 
				         def get_all_addr(list_entitys):
			
 
				             tenderee_l = []
			
 
				-            other_roles = []
			
 
				             addr_l = []
			
 
				             for ent in list_entitys[0]:
			
 
				-                if ent.entity_type == 'location':
			
 
				+                if ent.entity_type == 'location' and len(ent.entity_text)>2:  # keep only location texts longer than two characters
			
 
				                     addr_l.append(ent.entity_text)
			
 
				                 elif ent.entity_type in ['org', 'company']:
			
 
				-                    if ent.label == 0:
			
 
				+                    if ent.label in [0, 1]:  # also accept tenderee or agency roles (label meanings taken from the original note -- TODO confirm)
			
 
				                         tenderee_l.append(ent.entity_text)
			
 
				-                    else:
			
 
				-                        other_roles.append(ent.entity_text)
			
 
				-            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
			
 
				+            return ' '.join(addr_l), ' '.join(tenderee_l)  # (space-joined location texts, space-joined org/company texts); other_roles is no longer returned
			
 
				 
			
 
				         def get_title_addr(text):
			
 
				             p1 = '(\w{2,8}[省市州区县][^\w]*)+'
			
@@ -4436,21 +4434,26 @@ class DistrictPredictor():
 
				             content = list_articles[0].content
			
 
				 
			
 
				         tenderee, tenderee_address = get_ree_addr(prem)
			
 
				+        msc = ""
			
 
				         pro_addr = get_project_addr(content)
			
 
				         if pro_addr != "":
			
 
				+            msc += '使用规则提取的项目地址；'
			
 
				             tenderee_address = pro_addr
			
 
				         else:
			
 
				             role_addr = get_role_address(content)
			
 
				             if role_addr != "":
			
 
				+                msc += '使用规则提取的联系人地址；'
			
 
				                 tenderee_address = role_addr
			
 
				 
			
 
				         if tenderee_address == "":
			
 
				             title_addr = get_title_addr(title)
			
 
				             if title_addr != "":
			
 
				+                msc += '使用规则提取的标题地址；'
			
 
				                 tenderee_address = title_addr
			
 
				             else:
			
 
				                 bid_addr = get_bid_addr(content)
			
 
				                 if bid_addr != "":
			
 
				+                    msc += '使用规则提取的开标地址；'
			
 
				                     tenderee_address = bid_addr
			
 
				 
			
 
				         project_name = str(project_name)
			
@@ -4466,24 +4469,29 @@ class DistrictPredictor():
 
				         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
			
 
				         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
			
 
				         # print('text1:', text1)
			
 
				+        msc += '## 第一次预测输入：%s ##；'%text1
			
 
				         rs = get_area(text1, web_source_name)
			
 
				-
			
 
				+        msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
			
 
				+        rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
			
 
				         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
			
 
				-            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
			
 
				-            if tenderees != "":
			
 
				-                text2 = tenderees + " " + all_addr
			
 
				-                # print('所有地址：', all_addr)
			
 
				-            else:
			
 
				-                text2 = other_roles + " " + all_addr
			
 
				-                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
			
 
				+            msc = ""
			
 
				+            all_addr, tenderees = get_all_addr(list_entitys)
			
 
				+            text2 = tenderees + " " + all_addr + ' ' + title
			
 
				+            msc += '使用实体列表所有招标人+所有地址；'
			
 
				+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
			
 
				             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
			
 
				             # print('text2:', text2)
			
 
				+            msc += '## 第二次预测输入：%s ##'%text2
			
 
				             rs2 = get_area(text2, web_source_name, not_in_content=False)
			
 
				             rs2['district']['is_in_text'] = True
			
 
				             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
			
 
				                 rs = rs2
			
 
				             elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
			
 
				                 rs = rs2
			
 
				+            msc += '预测结果：省份：%s， 城市：%s，区县：%s'%(
			
 
				+                rs['district']['province'],rs['district']['city'],rs['district']['district'])
			
 
				+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
			
 
				         return rs
			
 
				 
			
 
				 class TableTag2List():