2 anni fa · fad01331e9
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2537,9 +2537,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															             #                       "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千万亿元]*)())",
														
 
															             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,，]*(?:\.\d+)?(?:，?)[百千万亿]*)[\(（]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\)）]?)"}
														
 
															             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
														
 
															-                                  "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
														
 
															-                                  "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)())",
														
 
															-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
														
 
															+                                  "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>-?[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
														
 
															+                                  "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-?[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)())",
														
 
															+                                  "behind_m":"(()()(?P<money_behind_m>-?[0-9][\d,]*(?:\.\d+)?(?:，?)[百千]*)(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
														
 
															             # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。
														
 
															             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
														
@@ -2693,6 +2693,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															                             break
														
 
															                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
														
 
															+                    symbol = '-' if entity_text.startswith('-') else ''  # 负值金额前面保留负号
														
 
															+
														
 
															                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
														
 
															                     # print('转换前金额：', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
														
 
															                     if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
														
@@ -2754,6 +2756,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															                             _exists = True
														
 
															                     if not _exists:
														
 
															                         if float(entity_text)>1:
														
 
															+                            if symbol == '-': # 负值金额保留负号
														
 
															+                                entity_text = '-'+entity_text
														
 
															                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
														
 
															                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
														
 
															                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
														
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -4266,6 +4266,7 @@ class DistrictPredictor():
 
															             self.full_name = full_name
														
 
															             self.short2id = short2id
														
 
															             self.full2id = full2id
														
 
															+        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
														
 
															     def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
														
 
															         '''
														
@@ -4330,7 +4331,7 @@ class DistrictPredictor():
 
															                                     if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
														
 
															                                         type_w = 2
														
 
															                                     else:
														
 
															-                                        type_w = 1
														
 
															+                                        type_w = 0.5
														
 
															                                 id_set.add(_id)
														
 
															                                 score2 += w * type_w
														
 
															                             score_l.append([_id, score * w + score2] + area)
														
@@ -4409,17 +4410,14 @@ class DistrictPredictor():
 
															         def get_all_addr(list_entitys):
														
 
															             tenderee_l = []
														
 
															-            other_roles = []
														
 
															             addr_l = []
														
 
															             for ent in list_entitys[0]:
														
 
															-                if ent.entity_type == 'location':
														
 
															+                if ent.entity_type == 'location' and len(ent.entity_text)>2:
														
 
															                     addr_l.append(ent.entity_text)
														
 
															                 elif ent.entity_type in ['org', 'company']:
														
 
															-                    if ent.label == 0:
														
 
															+                    if ent.label in [0, 1]:  # 加招标或代理
														
 
															                         tenderee_l.append(ent.entity_text)
														
 
															-                    else:
														
 
															-                        other_roles.append(ent.entity_text)
														
 
															-            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
														
 
															+            return ' '.join(addr_l), ' '.join(tenderee_l)
														
 
															         def get_title_addr(text):
														
 
															             p1 = '(\w{2,8}[省市州区县][^\w]*)+'
														
@@ -4436,21 +4434,26 @@ class DistrictPredictor():
 
															             content = list_articles[0].content
														
 
															         tenderee, tenderee_address = get_ree_addr(prem)
														
 
															+        msc = ""
														
 
															         pro_addr = get_project_addr(content)
														
 
															         if pro_addr != "":
														
 
															+            msc += '使用规则提取的项目地址；'
														
 
															             tenderee_address = pro_addr
														
 
															         else:
														
 
															             role_addr = get_role_address(content)
														
 
															             if role_addr != "":
														
 
															+                msc += '使用规则提取的联系人地址；'
														
 
															                 tenderee_address = role_addr
														
 
															         if tenderee_address == "":
														
 
															             title_addr = get_title_addr(title)
														
 
															             if title_addr != "":
														
 
															+                msc += '使用规则提取的标题地址；'
														
 
															                 tenderee_address = title_addr
														
 
															             else:
														
 
															                 bid_addr = get_bid_addr(content)
														
 
															                 if bid_addr != "":
														
 
															+                    msc += '使用规则提取的开标地址；'
														
 
															                     tenderee_address = bid_addr
														
 
															         project_name = str(project_name)
														
@@ -4466,24 +4469,29 @@ class DistrictPredictor():
 
															         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
														
 
															         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
														
 
															         # print('text1:', text1)
														
 
															+        msc += '## 第一次预测输入：%s ##；'%text1
														
 
															         rs = get_area(text1, web_source_name)
														
 
															-
														
 
															+        msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
														
 
															+        rs['district']['province'], rs['district']['city'], rs['district']['district'])
														
 
															+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
														
 
															         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
														
 
															-            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
														
 
															-            if tenderees != "":
														
 
															-                text2 = tenderees + " " + all_addr
														
 
															-                # print('所有地址：', all_addr)
														
 
															-            else:
														
 
															-                text2 = other_roles + " " + all_addr
														
 
															-                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
														
 
															+            msc = ""
														
 
															+            all_addr, tenderees = get_all_addr(list_entitys)
														
 
															+            text2 = tenderees + " " + all_addr + ' ' + title
														
 
															+            msc += '使用实体列表所有招标人+所有地址；'
														
 
															+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
														
 
															             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
														
 
															             # print('text2:', text2)
														
 
															+            msc += '## 第二次预测输入：%s ##'%text2
														
 
															             rs2 = get_area(text2, web_source_name, not_in_content=False)
														
 
															             rs2['district']['is_in_text'] = True
														
 
															             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
														
 
															                 rs = rs2
														
 
															             elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
														
 
															                 rs = rs2
														
 
															+            msc += '预测结果：省份：%s， 城市：%s，区县：%s'%(
														
 
															+                rs['district']['province'],rs['district']['city'],rs['district']['district'])
														
 
															+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
														
 
															         return rs
														
 
															 class TableTag2List():