|
@@ -531,9 +531,12 @@ class CodeNamePredict():
|
|
if len(dict_name_freq_score) == 0:
|
|
if len(dict_name_freq_score) == 0:
|
|
# name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
|
|
# name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
|
|
name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
|
|
name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
|
|
|
|
+ name_re2 = '(合同|采购)包\d((?P<name>[^,。:;]{2,60}))[:,。]' # 20241202 补充合同包 包名表达 558410976
|
|
for sentence in list_sentence:
|
|
for sentence in list_sentence:
|
|
# pad_sentence = sentence.sentence_text
|
|
# pad_sentence = sentence.sentence_text
|
|
othername = re.search(name_re1, sentence.sentence_text)
|
|
othername = re.search(name_re1, sentence.sentence_text)
|
|
|
|
+ if othername == None:
|
|
|
|
+ othername = re.search(name_re2, sentence.sentence_text)
|
|
if othername != None:
|
|
if othername != None:
|
|
project_name = othername.group('name')
|
|
project_name = othername.group('name')
|
|
if re.search('[\u4e00-\u9fa5]+', project_name) == None: # 没有中文的项目名称去除
|
|
if re.search('[\u4e00-\u9fa5]+', project_name) == None: # 没有中文的项目名称去除
|
|
@@ -869,7 +872,7 @@ class PREMPredict():
|
|
elif re.search('^放弃中标资格|是否中标:否|^(中标|成交)(公示|公告)', behind):
|
|
elif re.search('^放弃中标资格|是否中标:否|^(中标|成交)(公示|公告)', behind):
|
|
values[2] = 0.5
|
|
values[2] = 0.5
|
|
label = 5
|
|
label = 5
|
|
- elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
|
|
|
|
|
|
+ elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人:单位名称: 上海理想信息产业(集团)有限公司 ,投标报价:
|
|
values[2] = 0.5
|
|
values[2] = 0.5
|
|
label = 5
|
|
label = 5
|
|
elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
|
|
elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
|
|
@@ -982,8 +985,8 @@ class PREMPredict():
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
|
|
elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
- elif entity.notes == '单价' and float(entity.entity_text)<5000:
|
|
|
|
- label = 2
|
|
|
|
|
|
+ # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
|
|
|
|
+ # label = 2
|
|
elif label ==0: # 错误招标金额处理
|
|
elif label ==0: # 错误招标金额处理
|
|
if re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
|
|
if re.search('投资(金额|规模):$', front): # 545988699 金额不大的投资金额作为备选招标金额
|
|
values[label] = 0.51
|
|
values[label] = 0.51
|
|
@@ -994,8 +997,8 @@ class PREMPredict():
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
# elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front): # 2024/10/30 注销,避免漏提 预算金额:控制在26000元以内由合作银行出资 ;投资金额不低于人民币500万元
|
|
# elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front): # 2024/10/30 注销,避免漏提 预算金额:控制在26000元以内由合作银行出资 ;投资金额不低于人民币500万元
|
|
# values[label] = 0.49
|
|
# values[label] = 0.49
|
|
- elif entity.notes == '单价' and float(entity.entity_text)<5000:
|
|
|
|
- label = 2
|
|
|
|
|
|
+ # elif entity.notes == '单价' and float(entity.entity_text)<5000: # 20241128 注释,单价单独存放
|
|
|
|
+ # label = 2
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
label = 1
|
|
label = 1
|
|
values[label] = 0.8
|
|
values[label] = 0.8
|
|
@@ -2334,12 +2337,12 @@ class RoleGrade():
|
|
self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
|
|
self.tenderee_left_6 = "(?P<tenderee_left_6>(业主|建设|委托)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|业主|买方)"
|
|
self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
|
|
self.tenderee_left_5 = "(?P<tenderee_left_5>(发布)(人|方|单位|组织|用户|业主|主体|部门|公司|企业)|买方|发布机构)"
|
|
self.agency_left_9 = "(?P<agency_left_9>代理)"
|
|
self.agency_left_9 = "(?P<agency_left_9>代理)"
|
|
- self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]名|排[名序]:1|名次:1)"
|
|
|
|
|
|
+ self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一](名|候选)|排[名序]:1|名次:1)"
|
|
self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))" # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商,
|
|
self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方|最[终后]选[择取]))" # 229435497 最后选择西平,县中原彩印有限公司,作为此项目中标供应商,
|
|
self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
|
|
self.winTenderer_left_6 = "(?P<winTenderer_left_6>(入围|承[接建包修做制担租销]))"
|
|
self.winTenderer_right_9 = "(?P<winTenderer_right_9>^(为(中标|成交|中选)(人|单位|供应商|公司)|以\d+[\d.,]+万?元中标))"
|
|
self.winTenderer_right_9 = "(?P<winTenderer_right_9>^(为(中标|成交|中选)(人|单位|供应商|公司)|以\d+[\d.,]+万?元中标))"
|
|
- self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
|
|
|
|
- self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
|
|
|
|
|
|
+ self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2](名|候选)|排[名序]:2|名次:2))"
|
|
|
|
+ self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3](名|候选)|排[名序]:3|名次:3))"
|
|
self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
|
|
self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
|
|
self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
|
|
self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
|
|
def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
|
|
def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
|
|
@@ -2456,8 +2459,8 @@ class RoleGrade():
|
|
for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表,改为非角色
|
|
for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表,改为非角色
|
|
if entity.entity_text in all_tenderee_agency:
|
|
if entity.entity_text in all_tenderee_agency:
|
|
entity.label = 5
|
|
entity.label = 5
|
|
- elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标
|
|
|
|
- entity.label = 5
|
|
|
|
|
|
+ # elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标 20241126 注释掉,558294326 附件单个候选人漏提取
|
|
|
|
+ # entity.label = 5
|
|
|
|
|
|
if org_winner != []:
|
|
if org_winner != []:
|
|
flag = 0
|
|
flag = 0
|
|
@@ -2499,7 +2502,7 @@ class MoneyGrade():
|
|
if ser:
|
|
if ser:
|
|
groupdict = pattern.split('>')[0].replace('(?P<', '')
|
|
groupdict = pattern.split('>')[0].replace('(?P<', '')
|
|
_role, _direct, _prob = groupdict.split('_')
|
|
_role, _direct, _prob = groupdict.split('_')
|
|
- if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
|
|
|
|
|
|
+ if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or entity.notes == '总投资':# or float(entity.entity_text)<100:
|
|
_prob = 6
|
|
_prob = 6
|
|
_label = role2id.get(_role)
|
|
_label = role2id.get(_role)
|
|
if _label != entity.label:
|
|
if _label != entity.label:
|
|
@@ -2522,8 +2525,8 @@ class MoneyGrade():
|
|
# _prob = min_prob - 0.1 if in_att else min_prob
|
|
# _prob = min_prob - 0.1 if in_att else min_prob
|
|
entity.values[entity.label] = _prob + entity.values[entity.label] / 20
|
|
entity.values[entity.label] = _prob + entity.values[entity.label] / 20
|
|
# print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
|
|
# print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
|
|
- if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
|
|
|
|
- entity.label = 2
|
|
|
|
|
|
+ # if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额 # 20241128 小金额可能为单价,放单价存放
|
|
|
|
+ # entity.label = 2
|
|
|
|
|
|
|
|
|
|
# 时间类别
|
|
# 时间类别
|
|
@@ -5765,16 +5768,233 @@ class DistrictPredictor():
|
|
with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
self.area_variance_dic = pickle.load(f)
|
|
self.area_variance_dic = pickle.load(f)
|
|
|
|
|
|
- def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
|
|
|
|
- '''
|
|
|
|
- 先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
|
|
|
|
- :param project_name:
|
|
|
|
- :param prem:
|
|
|
|
- :param title:
|
|
|
|
- :param list_articles:
|
|
|
|
- :param web_source_name:
|
|
|
|
- :return:
|
|
|
|
- '''
|
|
|
|
|
|
+ def predict_area(self, title, ree, addr, web_source_name):
|
|
|
|
+ p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
|
|
|
|
+
|
|
|
|
+ def find_whole_areas(text, weight=1):
|
|
|
|
+ '''
|
|
|
|
+ 通过正则匹配字符串返回地址
|
|
|
|
+ :param pettern: 地址正则 广东省|广西省|...
|
|
|
|
+ :param text: 待匹配文本
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
|
|
+ province_l, city_l, district_l = [], [], []
|
|
|
|
+
|
|
|
|
+ text = str(text)
|
|
|
|
+ text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县',
|
|
|
|
+ ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
|
+ text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
|
+ text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
|
+ text = re.sub('茂名滨海新区', '茂名市', text)
|
|
|
|
+ text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
|
|
|
|
+ text = re.sub('横州市', '横县', text) # 例:547363890 修复广西南宁横州 不在地区表问题
|
|
|
|
+ ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
|
|
|
|
+ if ser and '黎族' not in ser.group(0):
|
|
|
|
+ text = text.replace(ser.group(0), ser.group(0) + '黎族')
|
|
|
|
+ for k, v in self.area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
|
|
|
|
+ text = text.replace(k, v)
|
|
|
|
+
|
|
|
|
+ if re.search('[\u4e00-\u9fa5]', text) == None:
|
|
|
|
+ return province_l, city_l, district_l
|
|
|
|
+
|
|
|
|
+ pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
|
|
|
|
+ p_pro, p_city, p_dis, p_city, p_dis, p_dis)
|
|
|
|
+
|
|
|
|
+ for it in re.finditer(pettern, text):
|
|
|
|
+ if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
|
|
|
|
+ continue
|
|
|
|
+ for k, v in it.groupdict().items():
|
|
|
|
+ if v != None:
|
|
|
|
+ if it.end() == it.end(k) and re.search('[省市区县州旗盟]$', v) == None and re.search(
|
|
|
|
+ '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆|经济开发区|开发区|新区)',
|
|
|
|
+ # 城市不匹配为区的地址 修复 滨州北海经济开发区 北海新区 等提取为北海
|
|
|
|
+ text[it.end(k):]) != None:
|
|
|
|
+ continue
|
|
|
|
+ if k in ['prov']:
|
|
|
|
+ if v in full_dic['province']:
|
|
|
|
+ score = 2
|
|
|
|
+ else:
|
|
|
|
+ score = 1
|
|
|
|
+ if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
+ , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
|
|
|
|
+ score += 1
|
|
|
|
+ score += it.end(k) / len(text) / 10
|
|
|
|
+ province_l.append((v, score * weight))
|
|
|
|
+ elif k in ['city', 'city1']:
|
|
|
|
+ if v in full_dic['city']:
|
|
|
|
+ score = 2
|
|
|
|
+ else:
|
|
|
|
+ score = 1
|
|
|
|
+ if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
+ , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
|
|
|
|
+ score += 1
|
|
|
|
+ score += it.end(k) / len(text) / 10
|
|
|
|
+ city_l.append((v, score * weight))
|
|
|
|
+ elif k in ['dist', 'dist1', 'dist2']:
|
|
|
|
+ if v in ['东区', '西区', '城区', '郊区', '矿区']:
|
|
|
|
+ continue
|
|
|
|
+ if v in full_dic['district'] and len(v)>2:
|
|
|
|
+ score = 2
|
|
|
|
+ else:
|
|
|
|
+ score = 0.5
|
|
|
|
+ if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
+ , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
|
|
|
|
+ score += 1
|
|
|
|
+ # print('县区加分:', v, text)
|
|
|
|
+ score += it.end(k) / len(text) / 10
|
|
|
|
+ if v == '昌江' and '景德镇' not in it.group(0):
|
|
|
|
+ district_l.append(('昌江黎族', score * weight))
|
|
|
|
+ else:
|
|
|
|
+ district_l.append((v, score * weight))
|
|
|
|
+ return province_l, city_l, district_l
|
|
|
|
+
|
|
|
|
+ def merge_score(province_l, city_l, district_l, filter_short_dist=True):
|
|
|
|
+ '''
|
|
|
|
+ 合并分数,下级地区分数加到上级
|
|
|
|
+ :param province_l: 提取到的省份列表 [(name, score)]
|
|
|
|
+ :param city_l: 提取到的城市列表 [(name, score)]
|
|
|
|
+ :param district_l: 提取到的区县列表 [(name, score)]
|
|
|
|
+ :param filter_short_dist: 是否过滤不在省份下的区县简称权重
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
|
|
+ pro_ids = dict()
|
|
|
|
+ city_ids = dict()
|
|
|
|
+ dis_ids = dict()
|
|
|
|
+ for pro in province_l:
|
|
|
|
+ name, score = pro
|
|
|
|
+ idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
|
|
|
|
+ if idx not in pro_ids:
|
|
|
|
+ pro_ids[idx] = 0
|
|
|
|
+ pro_ids[idx] += score
|
|
|
|
+
|
|
|
|
+ tmp_pro = {}
|
|
|
|
+ for city in city_l:
|
|
|
|
+ name, score = city
|
|
|
|
+ if name in full_dic['city']:
|
|
|
|
+ for idx in full_dic['city'][name]:
|
|
|
|
+ if idx not in city_ids:
|
|
|
|
+ city_ids[idx] = 0
|
|
|
|
+ city_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ elif name in short_dic['city']:
|
|
|
|
+ for idx in short_dic['city'][name]:
|
|
|
|
+ if idx not in city_ids:
|
|
|
|
+ city_ids[idx] = 0
|
|
|
|
+ city_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ if set(tmp_pro) & set(pro_ids) != set():
|
|
|
|
+ for k, v in tmp_pro.items():
|
|
|
|
+ if k in pro_ids:
|
|
|
|
+ pro_ids[k] += v
|
|
|
|
+ else:
|
|
|
|
+ pro_ids.update(tmp_pro)
|
|
|
|
+ tmp_pro = {}
|
|
|
|
+ tmp_city = {}
|
|
|
|
+ for dis in district_l:
|
|
|
|
+ name, score = dis
|
|
|
|
+ if name in full_dic['district']:
|
|
|
|
+ for idx in full_dic['district'][name]:
|
|
|
|
+ if idx not in dis_ids:
|
|
|
|
+ dis_ids[idx] = 0
|
|
|
|
+ dis_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ city_idx = idx_dic[idx]['市']
|
|
|
|
+ if city_idx in tmp_city:
|
|
|
|
+ tmp_city[city_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_city[city_idx] = score
|
|
|
|
+ elif name in short_dic['district']:
|
|
|
|
+ for idx in short_dic['district'][name]:
|
|
|
|
+ if idx not in dis_ids:
|
|
|
|
+ dis_ids[idx] = 0
|
|
|
|
+ dis_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if filter_short_dist and pro_idx not in pro_ids:
|
|
|
|
+ continue
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ city_idx = idx_dic[idx]['市']
|
|
|
|
+ if city_idx in tmp_city:
|
|
|
|
+ tmp_city[city_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_city[city_idx] = score
|
|
|
|
+ if set(tmp_pro) & set(pro_ids) != set():
|
|
|
|
+ for k, v in tmp_pro.items():
|
|
|
|
+ if k in pro_ids:
|
|
|
|
+ pro_ids[k] += v
|
|
|
|
+ else:
|
|
|
|
+ pro_ids.update(tmp_pro)
|
|
|
|
+ if set(tmp_city) & set(city_ids) != set():
|
|
|
|
+ for k, v in tmp_city.items():
|
|
|
|
+ if k in city_ids:
|
|
|
|
+ city_ids[k] += v
|
|
|
|
+ else:
|
|
|
|
+ city_ids.update(tmp_city)
|
|
|
|
+ return pro_ids, city_ids, dis_ids
|
|
|
|
+
|
|
|
|
+ def get_final_addr(pro_ids, city_ids, dis_ids):
|
|
|
|
+ '''
|
|
|
|
+ 先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
|
|
|
|
+ :param province_l: 匹配到的所有省份
|
|
|
|
+ :param city_l: 匹配到的所有城市
|
|
|
|
+ :param district_l: 匹配到的所有区县
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
|
|
+ big_area = ""
|
|
|
|
+ pred_pro = ""
|
|
|
|
+ pred_city = ""
|
|
|
|
+ pred_dis = ""
|
|
|
|
+
|
|
|
|
+ final_pro = ""
|
|
|
|
+ final_city = ""
|
|
|
|
+ prob = 0
|
|
|
|
+ max_score = 0
|
|
|
|
+ if len(pro_ids) >= 1:
|
|
|
|
+ pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ scores = [it[1] for it in pro_l]
|
|
|
|
+ prob = max(scores)/sum(scores)
|
|
|
|
+ max_score = max(scores)
|
|
|
|
+ final_pro, score = pro_l[0]
|
|
|
|
+ if score >= 0.01:
|
|
|
|
+ pred_pro = idx_dic[final_pro]['返回名称']
|
|
|
|
+ big_area = idx_dic[final_pro]['大区']
|
|
|
|
+ if pred_pro != "" and len(city_ids) >= 1:
|
|
|
|
+ city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ for it in city_l:
|
|
|
|
+ if idx_dic[it[0]]['省'] == final_pro:
|
|
|
|
+ final_city = it[0]
|
|
|
|
+ pred_city = idx_dic[final_city]['返回名称']
|
|
|
|
+ break
|
|
|
|
+ if final_city != "" and len(set(dis_ids)) >= 1:
|
|
|
|
+ dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ for it in dis_l:
|
|
|
|
+ if idx_dic[it[0]]['市'] == final_city:
|
|
|
|
+ pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
|
+ elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
|
|
|
|
+ dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ for it in dis_l:
|
|
|
|
+ if idx_dic[it[0]]['省'] == final_pro:
|
|
|
|
+ pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
|
|
|
|
+ pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
|
+ if pred_city in ['北京', '天津', '上海', '重庆']:
|
|
|
|
+ pred_city = pred_dis
|
|
|
|
+ pred_dis = ""
|
|
|
|
+ return big_area, pred_pro, pred_city, pred_dis, prob, max_score
|
|
|
|
+
|
|
def get_ree_addr(prem):
|
|
def get_ree_addr(prem):
|
|
tenderee = ""
|
|
tenderee = ""
|
|
tenderee_address = ""
|
|
tenderee_address = ""
|
|
@@ -5787,92 +6007,6 @@ class DistrictPredictor():
|
|
except Exception as e:
|
|
except Exception as e:
|
|
print('解析prem 获取招标人、及地址出错')
|
|
print('解析prem 获取招标人、及地址出错')
|
|
return tenderee, tenderee_address
|
|
return tenderee, tenderee_address
|
|
- def get_area(text, web_source_name, not_in_content=True):
|
|
|
|
- score_l = []
|
|
|
|
- id_set = set()
|
|
|
|
-
|
|
|
|
- if re.search(self.short_name, text):
|
|
|
|
- for it in re.finditer(self.full_name, text):
|
|
|
|
- name = it.group(0)
|
|
|
|
- score = len(name) / len(text)
|
|
|
|
- for _id in self.full2id[name]:
|
|
|
|
- area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
|
|
|
|
- # score_l.append([_id, score] + area)
|
|
|
|
- # w = self.dist_dic[_id]['权重']
|
|
|
|
- score_l.append([_id, score + 1] + area) # 匹配全称的加1 ,不加权重,因为权重某些赋值不好
|
|
|
|
-
|
|
|
|
- flag = 0
|
|
|
|
- for it in re.finditer(self.short_name, text):
|
|
|
|
- if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:
|
|
|
|
- name = it.group(0)
|
|
|
|
- score = (it.start() + len(name)) / len(text)
|
|
|
|
- for _id in self.short2id[name]:
|
|
|
|
- score2 = 0
|
|
|
|
- w = self.dist_dic[_id]['权重']
|
|
|
|
- _type = self.dist_dic[_id]['类型']
|
|
|
|
- area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
|
|
|
|
- if area[0] in ['2', '16', '20', '30']:
|
|
|
|
- _type += 10
|
|
|
|
- if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
|
|
|
|
- w = 1
|
|
|
|
- score2 += w
|
|
|
|
- if _id not in id_set:
|
|
|
|
- if _type == 20:
|
|
|
|
- type_w = 3
|
|
|
|
- elif _type == 30:
|
|
|
|
- if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
|
|
|
|
- type_w = 0
|
|
|
|
- else:
|
|
|
|
- type_w = 2
|
|
|
|
- else:
|
|
|
|
- if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
|
|
|
|
- type_w = 2
|
|
|
|
- else:
|
|
|
|
- type_w = 0.5
|
|
|
|
- id_set.add(_id)
|
|
|
|
- score2 += w * type_w
|
|
|
|
- score_l.append([_id, score * w + score2] + area)
|
|
|
|
-
|
|
|
|
- if flag == 1:
|
|
|
|
- pass
|
|
|
|
- # print('score', score)
|
|
|
|
- if re.search('公司', web_source_name) == None:
|
|
|
|
- for it in re.finditer(self.short_name, web_source_name):
|
|
|
|
- name = it.group(0)
|
|
|
|
- for _id in self.short2id[name]:
|
|
|
|
- area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
|
|
|
|
- w = self.dist_dic[_id]['权重']
|
|
|
|
- score = w * 0.2
|
|
|
|
- score_l.append([_id, score] + area)
|
|
|
|
- area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
|
|
- if len(score_l) == 0:
|
|
|
|
- return {'district': area_dic}
|
|
|
|
- else:
|
|
|
|
- df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
|
|
|
|
- df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
|
|
|
|
- # print('地区评分:')
|
|
|
|
- # print(df)
|
|
|
|
- df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
|
|
|
|
- pro_id = df_pro.index[0]
|
|
|
|
- if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content: # 不是二次全文匹配的 省级评分小于0.1的不要
|
|
|
|
- # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
|
|
|
|
- return {'district': area_dic}
|
|
|
|
- area_dic['province'] = self.dist_dic[pro_id]['地区']
|
|
|
|
- area_dic['area'] = self.dist_dic[pro_id]['大区']
|
|
|
|
- df = df[df['city'] != ""]
|
|
|
|
- df = df[df['province'] == pro_id]
|
|
|
|
- if len(df) > 0:
|
|
|
|
- df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)
|
|
|
|
- city_id = df_city.index[0]
|
|
|
|
- area_dic['city'] = self.dist_dic[city_id]['地区']
|
|
|
|
- df = df[df['district'] != ""]
|
|
|
|
- df = df[df['city'] == city_id]
|
|
|
|
- if len(df) > 0:
|
|
|
|
- df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)
|
|
|
|
- dist_id = df_dist.index[0]
|
|
|
|
- area_dic['district'] = self.dist_dic[dist_id]['地区']
|
|
|
|
- # print(area_dic)
|
|
|
|
- return {'district': area_dic}
|
|
|
|
|
|
|
|
def get_role_address(text):
|
|
def get_role_address(text):
|
|
'''正则匹配获取招标人地址
|
|
'''正则匹配获取招标人地址
|
|
@@ -5892,14 +6026,17 @@ class DistrictPredictor():
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def get_project_addr(text):
|
|
def get_project_addr(text):
|
|
- p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
|
|
+ p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w()]{,20}[,。])?|\w{2,15}[,。])'
|
|
|
|
+ p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
|
|
if re.search(p1, text):
|
|
if re.search(p1, text):
|
|
return re.search(p1, text).group('addr')
|
|
return re.search(p1, text).group('addr')
|
|
|
|
+ elif re.search(p2, text):
|
|
|
|
+ return re.search(p2, text).group('addr')
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def get_bid_addr(text):
|
|
def get_bid_addr(text):
|
|
- p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
|
|
+ p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
if re.search(p2, text):
|
|
if re.search(p2, text):
|
|
return re.search(p2, text).group('addr')
|
|
return re.search(p2, text).group('addr')
|
|
else:
|
|
else:
|
|
@@ -5909,7 +6046,7 @@ class DistrictPredictor():
|
|
tenderee_l = []
|
|
tenderee_l = []
|
|
addr_l = []
|
|
addr_l = []
|
|
for ent in list_entitys[0]:
|
|
for ent in list_entitys[0]:
|
|
- if ent.entity_type == 'location' and len(ent.entity_text)>2:
|
|
|
|
|
|
+ if ent.entity_type == 'location' and len(ent.entity_text) > 2:
|
|
addr_l.append(ent.entity_text)
|
|
addr_l.append(ent.entity_text)
|
|
elif ent.entity_type in ['org', 'company']:
|
|
elif ent.entity_type in ['org', 'company']:
|
|
if ent.label in [0, 1]: # 加招标或代理
|
|
if ent.label in [0, 1]: # 加招标或代理
|
|
@@ -5923,85 +6060,43 @@ class DistrictPredictor():
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
- if '##attachment##' in list_articles[0].content:
|
|
|
|
- content, attachment = list_articles[0].content.split('##attachment##')
|
|
|
|
- if len(content) < 200:
|
|
|
|
- content += attachment
|
|
|
|
- else:
|
|
|
|
- content = list_articles[0].content
|
|
|
|
-
|
|
|
|
- tenderee, tenderee_address = get_ree_addr(prem)
|
|
|
|
- msc = ""
|
|
|
|
- pro_addr = get_project_addr(content)
|
|
|
|
- if pro_addr != "":
|
|
|
|
- msc += '使用规则提取的项目地址;'
|
|
|
|
- tenderee_address = pro_addr
|
|
|
|
- else:
|
|
|
|
- role_addr = get_role_address(content)
|
|
|
|
- if role_addr != "":
|
|
|
|
- msc += '使用规则提取的联系人地址;'
|
|
|
|
- tenderee_address = role_addr
|
|
|
|
-
|
|
|
|
- if tenderee_address == "":
|
|
|
|
- title_addr = get_title_addr(title)
|
|
|
|
- if title_addr != "":
|
|
|
|
- msc += '使用规则提取的标题地址;'
|
|
|
|
- tenderee_address = title_addr
|
|
|
|
- else:
|
|
|
|
- bid_addr = get_bid_addr(content)
|
|
|
|
- if bid_addr != "":
|
|
|
|
- msc += '使用规则提取的开标地址;'
|
|
|
|
- tenderee_address = bid_addr
|
|
|
|
-
|
|
|
|
- project_name = str(project_name)
|
|
|
|
- tenderee = str(tenderee)
|
|
|
|
-
|
|
|
|
- # print('招标人地址',role_addr, tenderee_address)
|
|
|
|
-
|
|
|
|
- project_name = project_name + title if project_name not in title else project_name
|
|
|
|
- project_name = project_name.replace(tenderee, '')
|
|
|
|
-
|
|
|
|
- text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
|
|
|
|
-
|
|
|
|
- web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
|
|
- text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) #预防提取错 合肥 路南 新会 等地区
|
|
|
|
-
|
|
|
|
- if pro_addr:
|
|
|
|
- msc += '## 使用项目地址输入:%s ##;' % pro_addr
|
|
|
|
- rs = get_area(pro_addr, '')
|
|
|
|
- msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
|
|
- rs['district']['province'], rs['district']['city'], rs['district']['district'])
|
|
|
|
- if rs['district']['province'] != '全国':
|
|
|
|
- # print('地区匹配:', msc)
|
|
|
|
- return rs
|
|
|
|
-
|
|
|
|
- # print('text1:', text1)
|
|
|
|
- msc += '## 第一次预测输入:%s ##;'%text1
|
|
|
|
- rs = get_area(text1, web_source_name)
|
|
|
|
- msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
|
|
- rs['district']['province'], rs['district']['city'], rs['district']['district'])
|
|
|
|
- # self.f.write('%s %s \n' % (list_articles[0].id, msc))
|
|
|
|
- # print('地区匹配:', msc)
|
|
|
|
- if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
|
|
|
|
- msc = ""
|
|
|
|
- all_addr, tenderees = get_all_addr(list_entitys)
|
|
|
|
- text2 = tenderees + " " + all_addr + ' ' + title
|
|
|
|
- msc += '使用实体列表所有招标人+所有地址;'
|
|
|
|
- # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
|
|
|
|
- text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
|
|
|
|
- # print('text2:', text2)
|
|
|
|
- msc += '## 第二次预测输入:%s ##'%text2
|
|
|
|
- rs2 = get_area(text2, web_source_name, not_in_content=False)
|
|
|
|
- rs2['district']['is_in_text'] = True
|
|
|
|
- if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
|
|
|
|
- rs = rs2
|
|
|
|
- elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
|
|
|
|
- rs = rs2
|
|
|
|
- msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
|
|
|
|
- rs['district']['province'],rs['district']['city'],rs['district']['district'])
|
|
|
|
- # self.f.write('%s %s \n'%(list_articles[0].id, msc))
|
|
|
|
- # print('地区匹配:', msc)
|
|
|
|
- return rs
|
|
|
|
|
|
+ area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
|
|
+ province_l, city_l, district_l = find_whole_areas(title)
|
|
|
|
+ pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
|
|
+ # print('关键词1:', province_l, city_l, district_l)
|
|
|
|
+ # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
|
+ if pred_city == "" or prob < 0.7 or max_score<2:
|
|
|
|
+ province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
|
|
|
|
+ province_l.extend(province_l2)
|
|
|
|
+ city_l.extend(city_l2)
|
|
|
|
+ district_l.extend(district_l2)
|
|
|
|
+ pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
|
|
+ # print('关键词2:', province_l, city_l, district_l)
|
|
|
|
+ # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
|
+ if pred_city == "" or prob < 0.7 or max_score<2:
|
|
|
|
+ province_l3, city_l3, district_l3 = find_whole_areas(web_source_name, weight=0.6)
|
|
|
|
+ province_l.extend(province_l3)
|
|
|
|
+ city_l.extend(city_l3)
|
|
|
|
+ district_l.extend(district_l3)
|
|
|
|
+ pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
|
|
+ # print('关键词3:', province_l, city_l, district_l)
|
|
|
|
+ # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
|
+
|
|
|
|
+ in_content = False
|
|
|
|
+ if big_area != "":
|
|
|
|
+ area_dic['area'] = big_area
|
|
|
|
+ if pred_pro != "":
|
|
|
|
+ area_dic['province'] = pred_pro
|
|
|
|
+ if pred_city != "":
|
|
|
|
+ area_dic['city'] = pred_city
|
|
|
|
+ if pred_dis != "":
|
|
|
|
+ area_dic['district'] = pred_dis
|
|
|
|
+ if in_content:
|
|
|
|
+ area_dic['is_in_text'] = True
|
|
|
|
+ return {'district': area_dic}
|
|
|
|
|
|
def get_area(self, text, web_name, in_content=False):
|
|
def get_area(self, text, web_name, in_content=False):
|
|
p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
|
|
p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
|
|
@@ -6651,6 +6746,8 @@ class TablePremExtractor(object):
|
|
continue
|
|
continue
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
return flag, contain_header, dict(), not_sure_winner
|
|
return flag, contain_header, dict(), not_sure_winner
|
|
|
|
+ if text == '单位': # 20241128 补充金额单位
|
|
|
|
+ header_dic['amount_unit'] = (i, text)
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
for i in range(len(td_list)):
|
|
for i in range(len(td_list)):
|
|
@@ -6750,6 +6847,7 @@ class TablePremExtractor(object):
|
|
win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
|
|
win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
|
|
win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
|
|
win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
|
|
serviceTime = df.loc[i, headers['serviceTime'][0]].strip() if "serviceTime" in headers else ""
|
|
serviceTime = df.loc[i, headers['serviceTime'][0]].strip() if "serviceTime" in headers else ""
|
|
|
|
+ amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
|
|
|
|
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
# print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
|
|
# print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
|
|
@@ -6764,7 +6862,7 @@ class TablePremExtractor(object):
|
|
project_name = ""
|
|
project_name = ""
|
|
|
|
|
|
package_code = package_code_raw
|
|
package_code = package_code_raw
|
|
- if re.search('合计|总计', package_code+project_code):
|
|
|
|
|
|
+ if re.search('合计|总计', package_code+project_code+project_name):
|
|
continue
|
|
continue
|
|
if package_code + project_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
if package_code + project_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
same_package = True
|
|
same_package = True
|
|
@@ -6843,7 +6941,14 @@ class TablePremExtractor(object):
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
break
|
|
break
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
|
|
+ if amount_unit!='' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', budget_+budget_header)==None : # 20241128 补充某些表格价格单位分开两列, 例:557953660
|
|
|
|
+ budget_ += amount_unit
|
|
budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
|
|
budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
|
|
|
|
+ if re.search('元[/每]', amount_unit) or re.search('单价', budget_header):
|
|
|
|
+ unit_tendereeMoney = budget
|
|
|
|
+ budget = 0
|
|
|
|
+ else:
|
|
|
|
+ unit_tendereeMoney = 0
|
|
|
|
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率',
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率',
|
|
budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
@@ -6854,6 +6959,13 @@ class TablePremExtractor(object):
|
|
else:
|
|
else:
|
|
prem_dic[package]['tendereeMoney'] = budget
|
|
prem_dic[package]['tendereeMoney'] = budget
|
|
prem_dic[package]['tendereeMoneyUnit'] = money_unit
|
|
prem_dic[package]['tendereeMoneyUnit'] = money_unit
|
|
|
|
+ if unit_tendereeMoney > 0:
|
|
|
|
+ if 'unit_tendereeMoney' not in prem_dic[package]:
|
|
|
|
+ prem_dic[package]['unit_tendereeMoney'] = 0
|
|
|
|
+ if same_package and prem_dic[package]['unit_tendereeMoney'] != unit_tendereeMoney: # 处理 类似 136839070 一包多物品多预算
|
|
|
|
+ prem_dic[package]['unit_tendereeMoney'] += unit_tendereeMoney
|
|
|
|
+ else:
|
|
|
|
+ prem_dic[package]['unit_tendereeMoney'] = unit_tendereeMoney
|
|
if tenderee and not same_package:
|
|
if tenderee and not same_package:
|
|
prem_dic[package]['roleList'].append({
|
|
prem_dic[package]['roleList'].append({
|
|
"address": "",
|
|
"address": "",
|
|
@@ -6874,8 +6986,16 @@ class TablePremExtractor(object):
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
break
|
|
break
|
|
-
|
|
|
|
|
|
+ bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
|
|
+ if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and bid_amount_!='' and re.search('元',
|
|
|
|
+ bid_amount_ + bid_amount_header) == None:
|
|
|
|
+ bid_amount_ += amount_unit
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
|
+ if re.search('元[/每]', amount_unit) or re.search('单价', bid_amount_header):
|
|
|
|
+ unit_price = bid_amount
|
|
|
|
+ bid_amount = 0
|
|
|
|
+ else:
|
|
|
|
+ unit_price = 0
|
|
if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
@@ -6885,7 +7005,6 @@ class TablePremExtractor(object):
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
continue
|
|
continue
|
|
|
|
|
|
- bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率',
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率',
|
|
bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
bid_amount = 0
|
|
bid_amount = 0
|
|
@@ -6897,7 +7016,7 @@ class TablePremExtractor(object):
|
|
serviceTime = extract_serviceTime(serviceTime[0]['body'],"") if serviceTime else ""
|
|
serviceTime = extract_serviceTime(serviceTime[0]['body'],"") if serviceTime else ""
|
|
# print(serviceTime)
|
|
# print(serviceTime)
|
|
if not same_package or len(prem_dic[package]['roleList'])==0:
|
|
if not same_package or len(prem_dic[package]['roleList'])==0:
|
|
- prem_dic[package]['roleList'].append({
|
|
|
|
|
|
+ role_dic = {
|
|
"address": "",
|
|
"address": "",
|
|
"linklist": [],
|
|
"linklist": [],
|
|
"role_money": {
|
|
"role_money": {
|
|
@@ -6910,17 +7029,20 @@ class TablePremExtractor(object):
|
|
"role_name": "win_tenderer",
|
|
"role_name": "win_tenderer",
|
|
"role_text": tenderer,
|
|
"role_text": tenderer,
|
|
"serviceTime": serviceTime
|
|
"serviceTime": serviceTime
|
|
- })
|
|
|
|
|
|
+ }
|
|
|
|
+ if unit_price > 0:
|
|
|
|
+ role_dic['role_money']['unit_price'] = unit_price
|
|
|
|
+ prem_dic[package]['roleList'].append(role_dic)
|
|
elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
|
|
elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
|
|
if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
|
|
if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
- if bid_amount != 0: # 有中标金额的才放进去
|
|
|
|
|
|
+ if bid_amount != 0 or unit_price > 0: # 有中标金额的才放进去
|
|
if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
- prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
|
|
|
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit, "serviceTime": serviceTime})
|
|
tenderer_list.append(tenderer)
|
|
tenderer_list.append(tenderer)
|
|
serviceTime_list.append(serviceTime)
|
|
serviceTime_list.append(serviceTime)
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
@@ -7113,6 +7235,7 @@ class CandidateExtractor(object):
|
|
flag = True
|
|
flag = True
|
|
for i in range(len(td_list)) :
|
|
for i in range(len(td_list)) :
|
|
text = td_list[i]
|
|
text = td_list[i]
|
|
|
|
+ text = re.sub('\s|[((]排名不分先后[))]', '', text)
|
|
if len(text) > 15: # 长度大于15 不进行表头匹配
|
|
if len(text) > 15: # 长度大于15 不进行表头匹配
|
|
continue
|
|
continue
|
|
if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
@@ -7134,6 +7257,8 @@ class CandidateExtractor(object):
|
|
if num>1:
|
|
if num>1:
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
return flag, contain_header, dict()
|
|
return flag, contain_header, dict()
|
|
|
|
+ if text == '单位': # 20241128 补充金额单位
|
|
|
|
+ header_dic['amount_unit'] = (i, text)
|
|
if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
|
|
if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
|
|
return flag, contain_header, header_dic
|
|
return flag, contain_header, header_dic
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
@@ -7210,6 +7335,7 @@ class CandidateExtractor(object):
|
|
win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
|
|
win_tenderer = df.loc[i, headers['win_tenderer'][0]].strip() if "win_tenderer" in headers else ""
|
|
second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
|
|
second_tenderer = df.loc[i, headers['second_tenderer'][0]].strip() if "second_tenderer" in headers else ""
|
|
third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
|
|
third_tenderer = df.loc[i, headers['third_tenderer'][0]].strip() if "third_tenderer" in headers else ""
|
|
|
|
+ amount_unit = df.loc[i, headers['amount_unit'][0]].strip() if "amount_unit" in headers else ""
|
|
|
|
|
|
if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
|
|
if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配 # 排除 ,win_sort 避免367940050漏提取
|
|
# print('包含表头, 停止匹配')
|
|
# print('包含表头, 停止匹配')
|
|
@@ -7286,7 +7412,14 @@ class CandidateExtractor(object):
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
|
|
+ if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元', text+header)==None: # 补充另外在一列的金额单位
|
|
|
|
+ text += amount_unit
|
|
money, money_unit = money_process(text, header)
|
|
money, money_unit = money_process(text, header)
|
|
|
|
+ if re.search('元[/每]', amount_unit) or re.search('单价', header):
|
|
|
|
+ unit_price = money
|
|
|
|
+ money = 0
|
|
|
|
+ else:
|
|
|
|
+ unit_price = 0
|
|
|
|
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
money = 0
|
|
money = 0
|
|
@@ -7295,6 +7428,11 @@ class CandidateExtractor(object):
|
|
role_dic[type] = dict()
|
|
role_dic[type] = dict()
|
|
role_dic[type]['money'] = money
|
|
role_dic[type]['money'] = money
|
|
role_dic[type]['money_unit'] = money_unit
|
|
role_dic[type]['money_unit'] = money_unit
|
|
|
|
+ if unit_price > 0:
|
|
|
|
+ if type not in role_dic:
|
|
|
|
+ role_dic[type] = dict()
|
|
|
|
+ role_dic[type]['unit_price'] = unit_price
|
|
|
|
+ role_dic[type]['money_unit'] = money_unit
|
|
else:
|
|
else:
|
|
line_num += 1
|
|
line_num += 1
|
|
if findtop3 and findmoney:
|
|
if findtop3 and findmoney:
|
|
@@ -7322,13 +7460,21 @@ class CandidateExtractor(object):
|
|
prem_dic[package]['name'] = project_name
|
|
prem_dic[package]['name'] = project_name
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
|
|
+ header = headers['bid_amount'][1] if "bid_amount" in headers else ''
|
|
|
|
+ if amount_unit != '' and re.search('^[万亿]?元|%|折[\w/]{,6}$', amount_unit) and re.search('元',
|
|
|
|
+ bid_amount_ + header) == None: # 补充另外在一列的金额单位
|
|
|
|
+ bid_amount_ += amount_unit
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
|
|
+ if re.search('元[/每]', amount_unit) or re.search('单价', header):
|
|
|
|
+ unit_price = bid_amount
|
|
|
|
+ bid_amount = 0
|
|
|
|
+ else:
|
|
|
|
+ unit_price = 0
|
|
|
|
|
|
- header = headers['bid_amount'][1] if "bid_amount" in headers else ''
|
|
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率',
|
|
if (re.search('费率|下浮率|[%%‰折]|优惠率',
|
|
header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
bid_amount = 0
|
|
bid_amount = 0
|
|
- prem_dic[package]['roleList'].append({
|
|
|
|
|
|
+ tmp_role_dic = {
|
|
"address": "",
|
|
"address": "",
|
|
"linklist": [],
|
|
"linklist": [],
|
|
"role_money": {
|
|
"role_money": {
|
|
@@ -7341,7 +7487,10 @@ class CandidateExtractor(object):
|
|
"role_name": role_type,
|
|
"role_name": role_type,
|
|
"role_text": candidate,
|
|
"role_text": candidate,
|
|
"serviceTime": ""
|
|
"serviceTime": ""
|
|
- })
|
|
|
|
|
|
+ }
|
|
|
|
+ if unit_price > 0:
|
|
|
|
+ tmp_role_dic['role_money']['unit_price'] = unit_price
|
|
|
|
+ prem_dic[package]['roleList'].append(tmp_role_dic)
|
|
if len(prem_dic[package]['roleList']) == 0: # 只有项目编号和名称的 丢弃
|
|
if len(prem_dic[package]['roleList']) == 0: # 只有项目编号和名称的 丢弃
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
if role_dic and prem_dic == dict():
|
|
if role_dic and prem_dic == dict():
|