|
@@ -1816,15 +1816,18 @@ class RoleRuleFinalAdd():
|
|
'''
|
|
'''
|
|
# text_end = list_articles[0].content.split('##attachment##')[0][-40:]
|
|
# text_end = list_articles[0].content.split('##attachment##')[0][-40:]
|
|
main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
|
|
main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
|
|
- end_tokens = []
|
|
|
|
- for sentence in main_sentences[-5:]:
|
|
|
|
- end_tokens.extend(sentence.tokens)
|
|
|
|
- # text_end = "".join(end_tokens[-30:])
|
|
|
|
- text_end = "".join(end_tokens)
|
|
|
|
- text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
|
|
|
|
- text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
|
|
- # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
|
- sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
|
|
|
+ # end_tokens = []
|
|
|
|
+ for sentence in main_sentences[-5:][::-1]: # 402073799 最后五句由后往前,匹配文末角色,日期
|
|
|
|
+ # end_tokens.extend(sentence.tokens)
|
|
|
|
+ # text_end = "".join(end_tokens[-30:])
|
|
|
|
+ # text_end = "".join(end_tokens)
|
|
|
|
+ text_end = "".join(sentence.tokens)
|
|
|
|
+ text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
|
|
|
|
+ text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
|
|
+ # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
|
+ sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
|
+ if sear_ent:
|
|
|
|
+ break
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
@@ -5053,29 +5056,32 @@ class IndustryPredictor():
|
|
|
|
|
|
class DistrictPredictor():
|
|
class DistrictPredictor():
|
|
def __init__(self):
    '''
    Load the pre-built district lookup data that ships next to this module.

    ``district_tuple.pkl`` is unpickled into a 6-tuple which is stored as:

    * ``self.p_pro`` / ``self.p_city`` / ``self.p_dis`` -- patterns passed to
      ``re.finditer`` by ``predict`` to find province / city / district names;
    * ``self.idx_dic``   -- region id -> record; ``predict`` reads the keys
      '省', '市', '返回名称', '大区' and '权重' from each record;
    * ``self.full_dic`` / ``self.short_dic`` -- full / short region name ->
      region id(s), keyed first by 'province', 'city' or 'district'.

    :raises OSError: if the pickle file cannot be opened.
    :raises ValueError: if the pickled object does not unpack into 6 items.
    '''
    # NOTE(review): the previous implementation built dist_dic / short_name /
    # full_name / short2id / full2id from district_dic.pkl. Those attributes
    # are no longer set here; presumably predict_backup still expects them --
    # verify before calling it.

    # os.path.join keeps the path relative to this module even when
    # dirname(__file__) is empty; plain concatenation would then produce
    # '/district_tuple.pkl', i.e. a file in the filesystem root.
    pkl_path = os.path.join(os.path.dirname(__file__), 'district_tuple.pkl')

    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because the file is a trusted artifact distributed with the
    # package; never point it at externally supplied data.
    with open(pkl_path, 'rb') as f:
        district_tuple = pickle.load(f)

    (self.p_pro, self.p_city, self.p_dis,
     self.idx_dic, self.full_dic, self.short_dic) = district_tuple
|
|
|
|
+ def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
|
|
'''
|
|
'''
|
|
先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
|
|
先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
|
|
:param project_name:
|
|
:param project_name:
|
|
@@ -5189,9 +5195,9 @@ class DistrictPredictor():
|
|
3:地址直接在招标人后面 招标人:xxx,地址:xxx
|
|
3:地址直接在招标人后面 招标人:xxx,地址:xxx
|
|
4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
|
|
4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
|
|
'''
|
|
'''
|
|
- p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
|
|
|
|
- p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
|
|
|
|
- p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
|
|
|
|
|
|
+ p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
+ p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
+ p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
if re.search(p3, text):
|
|
if re.search(p3, text):
|
|
return re.search(p3, text).group('addr')
|
|
return re.search(p3, text).group('addr')
|
|
elif re.search(p4, text):
|
|
elif re.search(p4, text):
|
|
@@ -5202,16 +5208,16 @@ class DistrictPredictor():
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def get_project_addr(text):
|
|
def get_project_addr(text):
|
|
- p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
|
|
|
|
|
|
+ p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
if re.search(p1, text):
|
|
if re.search(p1, text):
|
|
- return re.search(p1, text).group(0)
|
|
|
|
|
|
+ return re.search(p1, text).group('addr')
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def get_bid_addr(text):
|
|
def get_bid_addr(text):
|
|
- p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
|
|
|
|
|
|
+ p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
if re.search(p2, text):
|
|
if re.search(p2, text):
|
|
- return re.search(p2, text).group(0)
|
|
|
|
|
|
+ return re.search(p2, text).group('addr')
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
@@ -5227,9 +5233,9 @@ class DistrictPredictor():
|
|
return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
|
|
|
|
def get_title_addr(text):
|
|
def get_title_addr(text):
|
|
- p1 = '(\w{2,8}[省市州区县][^\w]*)+'
|
|
|
|
|
|
+ p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
if re.search(p1, text):
|
|
if re.search(p1, text):
|
|
- return re.search(p1, text).group(0)
|
|
|
|
|
|
+ return re.search(p1, text).group('addr')
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
@@ -5312,6 +5318,370 @@ class DistrictPredictor():
|
|
# self.f.write('%s %s \n'%(list_articles[0].id, msc))
|
|
# self.f.write('%s %s \n'%(list_articles[0].id, msc))
|
|
# print('地区匹配:', msc)
|
|
# print('地区匹配:', msc)
|
|
return rs
|
|
return rs
|
|
|
|
def predict(self, project_name, prem, title, list_articles, web_source_name="", list_entitys=""):
    '''
    Predict the region (big area / province / city / district) of a document.

    Strategy:

    1. If a project address can be extracted from the content by rule, predict
       from that address alone and return as soon as a province is found.
    2. Otherwise predict from "tenderee + address + project name/title", with
       the web source name contributing at a reduced weight (0.2).
    3. If the province or the city is still missing, predict again from all
       tenderee/agency entities, all location entities and the title, and use
       that result when it adds information.

    :param project_name: project name (converted with ``str``).
    :param prem: extraction result; ``prem[0]['prem']`` maps to dicts holding a
        'roleList' whose items carry 'role_name', 'role_text' and 'address'.
    :param title: document title (string).
    :param list_articles: ``list_articles[0].content`` is the document text,
        optionally followed by '##attachment##' and the attachment text.
    :param web_source_name: name of the source web site (converted with ``str``).
    :param list_entitys: ``list_entitys[0]`` is an iterable of entities with
        ``entity_type``, ``entity_text`` and ``label``; may be empty.
    :return: ``{'district': {'area', 'province', 'city', 'district', 'is_in_text'}}``;
        unknown values are '全国' (area, province) and '未知' (city, district).
    '''
    # Address tail shared by every rule below: either a run of
    # "<name><administrative suffix>" pieces or a short token ended by , or 。
    addr_group = r'(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'

    def search_addr(pattern, text):
        '''Return the 'addr' group of the first match of pattern, or ''.'''
        found = re.search(pattern, text)
        return found.group('addr') if found else ''

    def get_ree_addr(prem):
        '''Return (tenderee, tenderee_address) of the first tenderee role in prem.'''
        tenderee = ""
        tenderee_address = ""
        try:
            for v in prem[0]['prem'].values():
                for link in v['roleList']:
                    if link['role_name'] == 'tenderee' and tenderee == "":
                        tenderee = link['role_text']
                        tenderee_address = link['address']
        except Exception:
            # Deliberately best-effort: a malformed prem must not abort the
            # prediction, the other address sources are still tried.
            print('解析prem 获取招标人、及地址出错')
        return tenderee, tenderee_address

    def get_role_address(text):
        '''
        Extract the tenderee address by rule. Patterns, in priority order:
        p3: address directly after the tenderee  ("tenderee: xxx, address: xxx")
        p4: tenderee and agency listed together, followed by the address
        p5: explicit "tenderee address: xxx"
        '''
        p3 = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:' + addr_group
        p4 = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:' + addr_group
        p5 = r'(采购|招标)(人|单位)(联系)?地址:' + addr_group
        for pattern in (p3, p4, p5):
            addr = search_addr(pattern, text)
            if addr:
                return addr
        return ''

    def get_project_addr(text):
        '''Extract the project / delivery / service address by rule, or ''.'''
        p1 = r'(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):' + addr_group
        return search_addr(p1, text)

    def get_bid_addr(text):
        '''Extract the bid opening / submission address by rule, or ''.'''
        p2 = r'(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):' + addr_group
        return search_addr(p2, text)

    def get_all_addr(list_entitys):
        '''
        Collect entity texts for the fallback prediction.

        :return: (all location texts longer than 2 chars, all org/company
            texts labelled 0 or 1 i.e. tenderee or agency), each space-joined.
        '''
        # The parameter defaults to "" -- indexing [0] on an empty value used
        # to raise IndexError exactly when the fallback was needed.
        if not list_entitys:
            return '', ''
        tenderee_l = []
        addr_l = []
        for ent in list_entitys[0]:
            if ent.entity_type == 'location' and len(ent.entity_text) > 2:
                addr_l.append(ent.entity_text)
            elif ent.entity_type in ['org', 'company']:
                if ent.label in [0, 1]:  # tenderee or agency
                    tenderee_l.append(ent.entity_text)
        return ' '.join(addr_l), ' '.join(tenderee_l)

    def get_title_addr(text):
        '''Extract an address-like span from the title, or ''.'''
        return search_addr(addr_group, text)

    def find_areas(pettern, text):
        '''
        Find region names in text.

        :param pettern: alternation of region names, e.g. 广东省|广西省|...
        :param text: text to scan
        :return: list of (name, start, end); a hit is listed twice when it is
            directly followed by a branch-office / line / station / project
            word, which doubles its contribution to the score.
        '''
        addr = []
        for it in re.finditer(pettern, text):
            name = it.group(0)
            rest = text[it.end():]
            # A short name (no administrative suffix) followed by village /
            # town / street / road / hotel is part of a place name, not a region.
            if re.search('[省市区县旗盟]$', name) is None and re.search(
                    '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', rest):
                continue
            addr.append((name, it.start(), it.end()))
            if re.search(r'^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', rest):
                addr.append((name, it.start(), it.end()))
        return addr

    def chage_area2score(group_list, max_len):
        '''
        Turn matched regions into position-based scores.

        :param group_list: [(name, begin, end)]
        :param max_len: length of the scanned text
        :return: [(name, score)] with score = (end - begin + end) / max_len / 2,
            so longer and later matches score higher.
        '''
        return [(name, (e - b + e) / max_len / 2) for name, b, e in group_list]

    def credit_parent(scores, parent_idx, gain):
        '''Add gain to an already-seen parent region, or half of it to a new one.'''
        if parent_idx in scores:
            scores[parent_idx] += gain
        else:
            scores[parent_idx] = gain * 0.5

    def get_pro_city_dis_score(text, text_weight=1):
        '''
        Score every province / city / district id mentioned in text.

        Full names earn a larger bonus than short names; a name shared by
        several regions is down-weighted to 0.1; cities and districts also
        credit their parent province (and parent city).

        :return: (pro_ids, city_ids, dis_ids), each mapping id -> score,
            multiplied by text_weight.
        '''
        # Words that merely contain a region name.
        text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
        province_l = chage_area2score(find_areas(p_pro, text), max_len=len(text))
        city_l = chage_area2score(find_areas(p_city, text), max_len=len(text))
        district_l = chage_area2score(find_areas(p_dis, text), max_len=len(text))

        pro_ids = dict()
        city_ids = dict()
        dis_ids = dict()

        for name, score in province_l:
            # Data invariant: every name matched by p_pro is a known province.
            assert (name in full_dic['province'] or name in short_dic['province'])
            if name in full_dic['province']:
                idx = full_dic['province'][name]
                pro_ids[idx] = pro_ids.get(idx, 0) + (score + 2)
            else:
                idx = short_dic['province'][name]
                pro_ids[idx] = pro_ids.get(idx, 0) + (score + 1)

        for name, score in city_l:
            if name in full_dic['city']:
                candidates = full_dic['city'][name]
                w = 0.1 if len(candidates) > 1 else 1
                for idx in candidates:
                    gain = (score + 2) * w
                    city_ids[idx] = city_ids.get(idx, 0) + gain
                    credit_parent(pro_ids, idx_dic[idx]['省'], gain)
            elif name in short_dic['city']:
                candidates = short_dic['city'][name]
                w = 0.1 if len(candidates) > 1 else 1
                for idx in candidates:
                    gain = (score + 1) * w * idx_dic[idx]['权重']
                    city_ids[idx] = city_ids.get(idx, 0) + gain
                    credit_parent(pro_ids, idx_dic[idx]['省'], gain)

        for name, score in district_l:
            if name in full_dic['district']:
                candidates = full_dic['district'][name]
                w = 0.1 if len(candidates) > 1 else 1
                for idx in candidates:
                    gain = (score + 1) * w
                    dis_ids[idx] = dis_ids.get(idx, 0) + gain
                    credit_parent(pro_ids, idx_dic[idx]['省'], gain)
                    credit_parent(city_ids, idx_dic[idx]['市'], gain)
            elif name in short_dic['district']:
                candidates = short_dic['district'][name]
                w = 0.1 if len(candidates) > 1 else 1
                for idx in candidates:
                    # The district's own score ignores the record weight;
                    # only the credit passed to its parents is weighted.
                    own_gain = (score + 0) * w
                    dis_ids[idx] = dis_ids.get(idx, 0) + own_gain
                    parent_gain = own_gain * idx_dic[idx]['权重']
                    credit_parent(pro_ids, idx_dic[idx]['省'], parent_gain)
                    credit_parent(city_ids, idx_dic[idx]['市'], parent_gain)

        for scores in (pro_ids, city_ids, dis_ids):
            for k in scores:
                scores[k] = scores[k] * text_weight
        return pro_ids, city_ids, dis_ids

    def get_final_addr(pro_ids, city_ids, dis_ids):
        '''
        Pick the final region names from the scored ids.

        The best province wins if its score is at least 0.01; the city is the
        best-scoring city inside that province; the district is the
        best-scoring district inside that city.

        :param pro_ids: province id -> score
        :param city_ids: city id -> score
        :param dis_ids: district id -> score
        :return: (big_area, province, city, district), '' where undetermined
        '''
        big_area = ""
        pred_pro = ""
        pred_city = ""
        pred_dis = ""

        final_pro = ""
        final_city = ""
        if len(pro_ids) >= 1:
            pro_l = sorted(pro_ids.items(), key=lambda x: x[1], reverse=True)
            final_pro, score = pro_l[0]
            if score >= 0.01:  # drop provinces with a negligible score
                pred_pro = idx_dic[final_pro]['返回名称']
                big_area = idx_dic[final_pro]['大区']

        if pred_pro != "" and len(city_ids) >= 1:
            city_l = sorted(city_ids.items(), key=lambda x: x[1], reverse=True)
            for it in city_l:
                if idx_dic[it[0]]['省'] == final_pro:
                    final_city = it[0]
                    pred_city = idx_dic[final_city]['返回名称']
                    break
        if final_city != "" and len(dis_ids) >= 1:
            dis_l = sorted(dis_ids.items(), key=lambda x: x[1], reverse=True)
            for it in dis_l:
                if idx_dic[it[0]]['市'] == final_city:
                    pred_dis = idx_dic[it[0]]['返回名称']
                    # Stop at the best-scoring district, as the city loop does;
                    # without this the lowest-scoring match overwrote it.
                    break

        # For the four municipalities the district is reported at city level.
        if pred_city in ['北京', '天津', '上海', '重庆']:
            pred_city = pred_dis
            pred_dis = ""
        return big_area, pred_pro, pred_city, pred_dis

    def get_area(text, web_name, in_content=False):
        '''
        Predict the region from text, with web_name as a low-weight hint.

        :param in_content: when true, 'is_in_text' is set to True in the result.
        :return: {'district': area_dic}
        '''
        area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}

        pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
        pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2)
        for base, extra in ((pro_ids, pro_ids1), (city_ids, city_ids1), (dis_ids, dis_ids1)):
            for k, v in extra.items():
                base[k] = base.get(k, 0) + v

        big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids)
        if big_area != "":
            area_dic['area'] = big_area
        if pred_pro != "":
            area_dic['province'] = pred_pro
        if pred_city != "":
            area_dic['city'] = pred_city
        if pred_dis != "":
            area_dic['district'] = pred_dis
        if in_content:
            area_dic['is_in_text'] = True

        return {'district': area_dic}

    p_pro, p_city, p_dis = self.p_pro, self.p_city, self.p_dis
    idx_dic, full_dic, short_dic = self.idx_dic, self.full_dic, self.short_dic

    raw_content = list_articles[0].content
    if '##attachment##' in raw_content:
        # maxsplit=1: a document may carry several attachment markers, and an
        # unbounded split made the two-name unpacking raise ValueError.
        content, attachment = raw_content.split('##attachment##', 1)
        if len(content) < 200:  # body too short to be useful, add attachments
            content += attachment
    else:
        content = raw_content

    tenderee, tenderee_address = get_ree_addr(prem)

    # Rule-extracted addresses override the one from prem: the project
    # address first, then the tenderee contact address.
    pro_addr = get_project_addr(content)
    if pro_addr != "":
        tenderee_address = pro_addr
    else:
        role_addr = get_role_address(content)
        if role_addr != "":
            tenderee_address = role_addr

    # Still nothing: try the title, then the bid opening address.
    if tenderee_address == "":
        title_addr = get_title_addr(title)
        if title_addr != "":
            tenderee_address = title_addr
        else:
            bid_addr = get_bid_addr(content)
            if bid_addr != "":
                tenderee_address = bid_addr

    project_name = str(project_name)
    tenderee = str(tenderee)
    web_source_name = str(web_source_name)  # some callers pass a non-string

    if project_name not in title:
        project_name = project_name + title
    project_name = project_name.replace(tenderee, '')

    text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
    # Avoid extracting 合肥 / 路南 / 新会 etc. out of unrelated words.
    text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)

    if pro_addr:
        rs = get_area(pro_addr, '')
        if rs['district']['province'] != '全国':
            return rs

    rs = get_area(text1, web_source_name)
    if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
        # Fallback: all tenderee/agency entities + all locations + title.
        all_addr, tenderees = get_all_addr(list_entitys)
        text2 = tenderees + " " + all_addr + ' ' + title
        text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
        rs2 = get_area(text2, web_source_name, in_content=True)
        if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
            rs = rs2
        elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
            rs = rs2
    return rs
|
|
|
|
|
|
class TableTag2List():
|
|
class TableTag2List():
|
|
'''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
|
|
'''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
|