|
@@ -53,7 +53,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
"industry": {"predictor": None, "Lock": RLock()},
|
|
|
"rolegrade": {"predictor": None, "Lock": RLock()},
|
|
|
"moneygrade": {"predictor": None, "Lock": RLock()},
|
|
|
- "district": {"predictor": None, "Lock": RLock()}
|
|
|
+ "district": {"predictor": None, "Lock": RLock()},
|
|
|
+ 'tableprem': {"predictor": None, "Lock": RLock()},
|
|
|
}
|
|
|
|
|
|
|
|
@@ -97,6 +98,8 @@ def getPredictor(_type):
|
|
|
dict_predictor[_type]["predictor"] = MoneyGrade()
|
|
|
if _type == 'district':
|
|
|
dict_predictor[_type]["predictor"] = DistrictPredictor()
|
|
|
+ if _type == 'tableprem':
|
|
|
+ dict_predictor[_type]["predictor"] = TablePremExtractor()
|
|
|
return dict_predictor[_type]["predictor"]
|
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
@@ -685,7 +688,7 @@ class PREMPredict():
|
|
|
elif re.search('尊敬的供应商:', text):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
- elif re.search('[^\w]中标候选人:', text) and re.search('[1一]', text) == None: #修复第4以上的预测错为中标人
|
|
|
+ elif re.search('[^\w]中标候选人', text[:15]) and re.search('[1一]', text[:15]) == None: #修复第4以上的预测错为中标人
|
|
|
label = 5
|
|
|
values[label] = 0.5
|
|
|
elif re.search('是否中标:是,供应商', text) and label == 5:
|
|
@@ -1156,8 +1159,8 @@ class RoleRulePredictor():
|
|
|
self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
|
|
|
"(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
|
|
|
"单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
- self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
- self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
|
|
|
+ self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
+ self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标
|
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
@@ -1659,7 +1662,7 @@ class TendereeRuleRecall():
|
|
|
self.get_tenderee = True
|
|
|
else:
|
|
|
if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
|
|
|
- ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index]):
|
|
|
+ ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index].sentence_text):
|
|
|
ent.label = 0
|
|
|
ent.values[0] = 0.5 + ent.values[0] / 10
|
|
|
self.get_tenderee = True
|
|
@@ -2580,7 +2583,7 @@ class ProductAttributesPredictor():
|
|
|
if order_begin_year>=2050 or order_end_year>=2050:
|
|
|
order_begin = order_end = ""
|
|
|
# print(product,demand,budget,order_begin)
|
|
|
- if product!= "" and demand != "" and budget!="" and order_begin != "":
|
|
|
+ if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
|
|
|
link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
|
'order_begin': order_begin, 'order_end': order_end}
|
|
|
if link not in demand_link:
|
|
@@ -2607,7 +2610,7 @@ class ProductAttributesPredictor():
|
|
|
if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
|
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
|
|
|
- if found_header:
|
|
|
+ if found_header and len(headers)<1: # 只保留出现的第一个表头
|
|
|
headers.append('_'.join(header_list))
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
|
header_colnum = len(tds)
|
|
@@ -2653,7 +2656,7 @@ class ProductAttributesPredictor():
|
|
|
brand = ""
|
|
|
if id5 != "":
|
|
|
if re.search('\w', tds[id5]):
|
|
|
- specs = tds[id5]
|
|
|
+ specs = tds[id5][:500] # 限制最多500字
|
|
|
else:
|
|
|
specs = ""
|
|
|
if id6 != "":
|
|
@@ -2676,6 +2679,9 @@ class ProductAttributesPredictor():
|
|
|
order_time = tds[id8].strip()
|
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
+ if len(unitPrice) > 15 or len(product)>100: # 单价大于15位数或 产品名称长于100字
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
|
|
|
'brand': brand[:50], 'specs':specs}
|
|
|
if link not in product_link:
|
|
@@ -2683,7 +2689,7 @@ class ProductAttributesPredictor():
|
|
|
mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
if link['unitPrice'] != "" and mat:
|
|
|
try:
|
|
|
- total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
|
|
|
+ total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
|
|
|
except:
|
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
|
if order_begin != "" and order_end != "":
|
|
@@ -2804,88 +2810,88 @@ class ProductAttributesPredictor():
|
|
|
headers.append('_'.join(header_list))
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
|
header_col.append('_'.join(tmp_head_list))
|
|
|
- # print('header_dic: ',header_dic)
|
|
|
- id1 = header_dic.get('名称', "")
|
|
|
- id2 = header_dic.get('数量', "")
|
|
|
- id3 = header_dic.get('单价', "")
|
|
|
- id4 = header_dic.get('品牌', "")
|
|
|
- id5 = header_dic.get('规格', "")
|
|
|
-
|
|
|
- id6 = header_dic.get('需求', "")
|
|
|
- id7 = header_dic.get('预算', "")
|
|
|
- id8 = header_dic.get('时间', "")
|
|
|
- if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
|
|
|
- re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
|
|
|
- product = deal_list[id1]
|
|
|
- if id2 != "":
|
|
|
- if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
|
|
|
- quantity = deal_list[id2]
|
|
|
- else:
|
|
|
- quantity = ""
|
|
|
- if id3 != "":
|
|
|
- if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
|
|
|
- _unitPrice = deal_list[id3]
|
|
|
- re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
|
|
|
- if re_price:
|
|
|
- _unitPrice = re_price[0]
|
|
|
- if '万元' in header_list[2] and '万' not in _unitPrice:
|
|
|
- _unitPrice += '万元'
|
|
|
- unitPrice = str(getUnifyMoney(_unitPrice))
|
|
|
- if id4 != "":
|
|
|
- if re.search('\w', deal_list[id4]):
|
|
|
- brand = deal_list[id4]
|
|
|
- else:
|
|
|
- brand = ""
|
|
|
- if id5 != "":
|
|
|
- if re.search('\w', deal_list[id5]):
|
|
|
- specs = deal_list[id5]
|
|
|
- else:
|
|
|
- specs = ""
|
|
|
- if id6 != "":
|
|
|
- if re.search('\w', deal_list[id6]):
|
|
|
- demand = deal_list[id6]
|
|
|
- else:
|
|
|
- demand = ""
|
|
|
- if id7 != "":
|
|
|
- if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
|
|
|
- _budget = deal_list[id7]
|
|
|
- re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
|
|
|
- if re_price:
|
|
|
- _budget = re_price[0]
|
|
|
- if '万元' in header_list2[2] and '万' not in _budget:
|
|
|
- _budget += '万元'
|
|
|
- budget = str(getUnifyMoney(_budget))
|
|
|
-
|
|
|
- if id8 != "":
|
|
|
- if re.search('\w', deal_list[id8]):
|
|
|
- order_time = deal_list[id8].strip()
|
|
|
- order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
- # print(quantity,unitPrice,brand,specs)
|
|
|
- if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
- link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
|
|
|
- 'brand': brand[:50], 'specs': specs}
|
|
|
- if link not in product_link:
|
|
|
- product_link.append(link)
|
|
|
- # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
- # if link['unitPrice'] != "" and mat:
|
|
|
- # try:
|
|
|
- # total_product_money += float(link['unitPrice']) * float(
|
|
|
- # mat.group(1).replace(',', ''))
|
|
|
- # except:
|
|
|
- # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
- # link['unitPrice'], link['quantity']))
|
|
|
- if order_begin != "" and order_end != "":
|
|
|
- order_begin_year = int(order_begin.split("-")[0])
|
|
|
- order_end_year = int(order_end.split("-")[0])
|
|
|
- # 限制附件错误识别时间
|
|
|
- if order_begin_year >= 2050 or order_end_year >= 2050:
|
|
|
- order_begin = order_end = ""
|
|
|
- # print(budget, order_time)
|
|
|
- if budget != "" and order_time != "":
|
|
|
- link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
|
- 'order_begin': order_begin, 'order_end': order_end}
|
|
|
- if link not in demand_link:
|
|
|
- demand_link.append(link)
|
|
|
+ # print('header_dic: ',header_dic)
|
|
|
+ id1 = header_dic.get('名称', "")
|
|
|
+ id2 = header_dic.get('数量', "")
|
|
|
+ id3 = header_dic.get('单价', "")
|
|
|
+ id4 = header_dic.get('品牌', "")
|
|
|
+ id5 = header_dic.get('规格', "")
|
|
|
+
|
|
|
+ id6 = header_dic.get('需求', "")
|
|
|
+ id7 = header_dic.get('预算', "")
|
|
|
+ id8 = header_dic.get('时间', "")
|
|
|
+ if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
|
|
|
+ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
|
|
|
+ product = deal_list[id1]
|
|
|
+ if id2 != "":
|
|
|
+ if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
|
|
|
+ quantity = deal_list[id2]
|
|
|
+ else:
|
|
|
+ quantity = ""
|
|
|
+ if id3 != "":
|
|
|
+ if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
|
|
|
+ _unitPrice = deal_list[id3]
|
|
|
+ re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
|
|
|
+ if re_price:
|
|
|
+ _unitPrice = re_price[0]
|
|
|
+ if '万元' in header_list[2] and '万' not in _unitPrice:
|
|
|
+ _unitPrice += '万元'
|
|
|
+ unitPrice = str(getUnifyMoney(_unitPrice))
|
|
|
+ if id4 != "":
|
|
|
+ if re.search('\w', deal_list[id4]):
|
|
|
+ brand = deal_list[id4]
|
|
|
+ else:
|
|
|
+ brand = ""
|
|
|
+ if id5 != "":
|
|
|
+ if re.search('\w', deal_list[id5]):
|
|
|
+ specs = deal_list[id5]
|
|
|
+ else:
|
|
|
+ specs = ""
|
|
|
+ if id6 != "":
|
|
|
+ if re.search('\w', deal_list[id6]):
|
|
|
+ demand = deal_list[id6]
|
|
|
+ else:
|
|
|
+ demand = ""
|
|
|
+ if id7 != "":
|
|
|
+ if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
|
|
|
+ _budget = deal_list[id7]
|
|
|
+ re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
|
|
|
+ if re_price:
|
|
|
+ _budget = re_price[0]
|
|
|
+ if '万元' in header_list2[2] and '万' not in _budget:
|
|
|
+ _budget += '万元'
|
|
|
+ budget = str(getUnifyMoney(_budget))
|
|
|
+
|
|
|
+ if id8 != "":
|
|
|
+ if re.search('\w', deal_list[id8]):
|
|
|
+ order_time = deal_list[id8].strip()
|
|
|
+ order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
+ # print(quantity,unitPrice,brand,specs)
|
|
|
+ if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
+ link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
|
|
|
+ 'brand': brand[:50], 'specs': specs}
|
|
|
+ if link not in product_link:
|
|
|
+ product_link.append(link)
|
|
|
+ # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
+ # if link['unitPrice'] != "" and mat:
|
|
|
+ # try:
|
|
|
+ # total_product_money += float(link['unitPrice']) * float(
|
|
|
+ # mat.group(1).replace(',', ''))
|
|
|
+ # except:
|
|
|
+ # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
+ # link['unitPrice'], link['quantity']))
|
|
|
+ if order_begin != "" and order_end != "":
|
|
|
+ order_begin_year = int(order_begin.split("-")[0])
|
|
|
+ order_end_year = int(order_end.split("-")[0])
|
|
|
+ # 限制附件错误识别时间
|
|
|
+ if order_begin_year >= 2050 or order_end_year >= 2050:
|
|
|
+ order_begin = order_end = ""
|
|
|
+ # print(budget, order_time)
|
|
|
+ if budget != "" and order_time != "":
|
|
|
+ link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
|
+ 'order_begin': order_begin, 'order_end': order_end}
|
|
|
+ if link not in demand_link:
|
|
|
+ demand_link.append(link)
|
|
|
|
|
|
if len(product_link) > 0:
|
|
|
attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}
|
|
@@ -2958,19 +2964,20 @@ class DocChannel():
|
|
|
self.life_dic = {
|
|
|
'采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
|
|
|
'招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
|
|
|
- '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|\w{,5}材料)|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
|
|
|
+ '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
|
|
|
'资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
|
|
|
'招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示', # |异议的回复
|
|
|
'公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
|
|
|
- '候选人公示': '候选人公示|评标结果公示',
|
|
|
+ '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示',
|
|
|
'中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
|
'中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
|
|
|
'中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
|
|
|
- '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示',
|
|
|
+ '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标',
|
|
|
# |确定成交供应商[:,\s]
|
|
|
'合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|签订日期)|(供应商乙方|乙方供应商):|合同总?金额',
|
|
|
'废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
|
|
|
- '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)'
|
|
|
+ '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标',
|
|
|
+ '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|视为流标'
|
|
|
}
|
|
|
self.title_life_dic = {
|
|
|
'采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
|
|
@@ -4313,6 +4320,303 @@ class DistrictPredictor():
|
|
|
rs = rs2
|
|
|
return rs
|
|
|
|
|
|
class TablePremExtractor(object):
    """Extract project / package / role / money records from HTML tables.

    A table row is treated as a header when most of its cells are known
    header texts (``self.headerset``) and the cells match the regex rules in
    ``self.head_rule_dic``.  The rows below a header are then read column by
    column and folded into a dict keyed by package name.
    """

    # Pattern used to pull the first amount (Chinese numerals or digits) out
    # of a money cell.
    _MONEY_PATTERN = r"[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?"
    # Amounts above this value are treated as extraction noise and zeroed.
    _MAX_MONEY = 10000000000000
    # Tables wider than this many columns are discarded.
    _MAX_COLUMNS = 50

    def __init__(self):
        """Set up the per-field header rules and load the known-header set.

        Raises:
            OSError: if ``header_set.pkl`` next to this module cannot be opened.
        """
        self.head_rule_dic = {
            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
            "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
            "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
        }

        # os.path.join keeps the path relative when dirname(__file__) is ''
        # (plain string concatenation would yield '/header_set.pkl').
        # NOTE(review): pickle.load is only safe because this file ships with
        # the project; never point it at externally supplied data.
        with open(os.path.join(os.path.dirname(__file__), 'header_set.pkl'), 'rb') as f:
            self.headerset = pickle.load(f)

    @staticmethod
    def _parse_span(value):
        """Return the integer in a rowspan/colspan attribute, or 1 if absent."""
        if value and re.search('[0-9]', value):
            return int(re.sub('[^0-9]', '', value))
        return 1

    def table2list(self, table):
        """Flatten a ``<table>`` tag into a list of rows of cell texts.

        Cells with rowspan/colspan are duplicated into every grid position
        they cover.  Positions never covered by any cell come back as "".

        Args:
            table: a parsed table tag exposing ``find_all`` and whose rows
                expose ``children`` (BeautifulSoup-style).

        Returns:
            list[list[str]]: the grid, or ``[]`` when the table has more than
            ``_MAX_COLUMNS`` columns.
        """
        self._output = []
        row_ind = 0
        col_ind = 0
        for row in table.find_all('tr'):
            # Tracks how many grid rows this <tr> advances; it starts at 1 and
            # is only ever min()-ed, so it stays 1.
            smallest_row_span = 1

            for cell in row.children:
                if cell.name not in ('td', 'th'):
                    continue
                row_span = self._parse_span(cell.get('rowspan'))
                smallest_row_span = min(smallest_row_span, row_span)
                col_span = self._parse_span(cell.get('colspan'))

                # Skip grid positions already taken by a rowspan from above.
                while not self._check_cell_validity(row_ind, col_ind):
                    col_ind += 1

                try:
                    # NOTE(review): the "(" / ")" replacements are no-ops as
                    # written here; presumably they once mapped full-width
                    # brackets to ASCII -- confirm against the repository.
                    text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
                    text = re.sub(r'\s', '', text)[:200]  # the first 200 chars are enough
                    self._insert(row_ind, col_ind, row_span, col_span, text)
                except UnicodeEncodeError:
                    raise Exception('Failed to decode text; you might want to specify kwargs transformer=unicode')

                col_ind += col_span
                if col_ind > self._MAX_COLUMNS:  # drop over-wide tables
                    return []

            row_ind += smallest_row_span
            col_ind = 0

        # Holes are tracked internally as None so they can be told apart from
        # real cells whose text is empty; callers still receive "".
        self._output = [["" if cell is None else cell for cell in row] for row in self._output]
        return self._output

    def _check_validity(self, i, j, height, width):
        """Return True if every position of the rectangle is still free."""
        return all(self._check_cell_validity(ii, jj) for ii in range(i, i+height) for jj in range(j, j+width))

    def _check_cell_validity(self, i, j):
        """Return True if grid position (i, j) is free in ``self._output``.

        A position is free when it lies outside the current grid or holds the
        ``None`` placeholder written by ``_insert_cell``.
        """
        if i >= len(self._output):
            return True
        if j >= len(self._output[i]):
            return True
        return self._output[i][j] is None

    def _insert(self, i, j, height, width, val):
        """Write ``val`` into every position of a height x width rectangle."""
        for ii in range(i, i+height):
            for jj in range(j, j+width):
                self._insert_cell(ii, jj, val)

    def _insert_cell(self, i, j, val):
        """Write ``val`` at (i, j), growing the grid as needed.

        Newly created positions are padded with ``None`` (not "") so that
        ``_check_cell_validity`` recognises them as free; an occupied position
        is never overwritten.
        """
        while i >= len(self._output):
            self._output.append([])
        row = self._output[i]
        while j >= len(row):
            row.append(None)
        if row[j] is None:
            row[j] = val

    def find_header(self, td_list):
        """Decide whether a row is a header row and map fields to columns.

        Args:
            td_list: cell texts of one table row.

        Returns:
            tuple: ``(flag, header_dic)``.  ``flag`` is True when the row has
            more than two distinct cells and over 60% of them are known
            header texts.  ``header_dic`` maps a field name to
            ``(column_index, header_text)``; it is empty unless the row holds
            a code/name column together with a budget or tenderer column.
        """
        header_dic = dict()
        flag = False
        distinct = set(td_list)
        if len(distinct) > 2 and len(distinct & self.headerset)/len(distinct) > 0.6:
            flag = True
            for i, text in enumerate(td_list):
                if len(text) > 15:  # too long to be a header cell
                    continue
                if re.search('未(中标|成交)原因', text):  # this kind of table is not extracted
                    return flag, dict()
                num = 0
                for k, v in self.head_rule_dic.items():
                    if re.search(v, text):
                        header_dic[k] = (i, text)
                        num += 1
                if num > 1:
                    print('表头错误,一个td匹配到两个表头:', header_dic)
                    return flag, dict()
            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
                    'budget' in header_dic or 'tenderer' in header_dic):
                return flag, header_dic
        return flag, dict()

    def is_role(self, text):
        """Return True if ``text`` looks like a single organisation name.

        Texts shorter than 5 or longer than 25 characters, or naming more
        than one company, are rejected.  Names with a known organisation
        suffix are accepted directly; anything else is passed to the
        ``selffool`` NER model.
        """
        if len(text) > 25 or len(text) < 5:
            return False
        if len(re.findall('有限责?任?公司', text)) > 1:
            return False
        if re.search(r'[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
            return True
        ners = selffool.ner(text)
        if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
            return True
        return False

    @staticmethod
    def _make_role(role_name, role_text, money=0, money_unit=""):
        """Build one roleList entry in the output schema."""
        return {
            "address": "",
            "linklist": [],
            "role_money": {
                "discount_ratio": "",
                "downward_floating_ratio": "",
                "floating_ratio": "",
                "money": money,
                "money_unit": money_unit
            },
            "role_name": role_name,
            "role_text": role_text,
            "serviceTime": ""
        }

    def _parse_money(self, raw, headers, key):
        """Parse a money cell; return ``(amount, unit)`` or None if no amount.

        The header text of column ``key`` is consulted only after an amount
        was found, so ``key`` may be missing from ``headers`` when ``raw`` is
        empty.
        """
        found = re.findall(self._MONEY_PATTERN, raw)
        if not found:
            return None
        money_text = found[0]
        # A "万元" header means bare numbers in the column are in 10k units.
        if '万元' in headers[key][1] and '万' not in money_text:
            money_text += '万元'
        amount = float(str(getUnifyMoney(money_text)))
        if amount > self._MAX_MONEY:  # discard implausibly large amounts
            amount = 0
        unit = '万元' if '万' in money_text else '元'
        return amount, unit

    def extract_from_df(self, df, headers):
        """Turn the body rows of one table into package records.

        Args:
            df: DataFrame whose columns are positional (0..n-1), one row per
                table row.
            headers: mapping produced by ``find_header``.

        Returns:
            dict: package name -> {'code', 'name', 'roleList',
            'tendereeMoney', 'tendereeMoneyUnit'}.
        """
        def column(row_label, key):
            # Cell of the column mapped to `key`, or "" if not in the header.
            return df.loc[row_label, headers[key][0]] if key in headers else ""

        prem_dic = {}
        previous_package = ""  # package code of the previous row
        multi_same_package = False  # a package code repeats non-consecutively
        package_fix2raw = dict()  # normalised package name -> raw package text
        link_set = set()
        for i in df.index:
            # Consecutive repeat of a package code: usually a rowspan, i.e.
            # one package purchasing several items.
            same_package = False
            project_code = column(i, 'project_code')
            package_code_raw = column(i, 'package_code')
            project_name = column(i, 'project_name')
            tenderee = column(i, 'tenderee')
            tenderer = column(i, 'tenderer')
            budget_ = column(i, 'budget')
            bid_amount_ = column(i, 'bid_amount')
            win_sort = column(i, 'win_sort')

            package_code = package_code_raw
            if re.search('合计|总计', package_code+project_code):  # summary row
                continue
            if package_code != '' and package_code == previous_package:
                same_package = True
                project_name = ''
            previous_package = package_code

            # Ranking column: keep only rows ranked first.
            # NOTE(review): ranks such as "10" or "11" also contain "1".
            if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) is None:
                continue
            # "Won the bid?" column: drop rows answered "否" (no).  The
            # previous condition was inverted and dropped the winners instead.
            if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort):
                continue
            # A candidate/shortlist column without a winning-price column
            # does not identify the winner.
            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and 'bid_amount' in headers and re.search('(中标|成交)价', headers['bid_amount'][1]) is None:
                tenderer = ""

            tenderee = tenderee if self.is_role(tenderee) else ""
            tenderer = tenderer if self.is_role(tenderer) else ""

            record = (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_)
            if len(set(record)) < 2:  # every field identical: stop reading this table
                break
            if record in link_set:
                continue
            link_set.add(record)

            package = package_code if package_code else str(i+1)
            package = uniform_package_name(package)

            # Once a normalised package name collides, fall back to the raw
            # package text as the key.
            if multi_same_package == False and package not in package_fix2raw:
                package_fix2raw[package] = package_code_raw
            elif same_package == False:
                multi_same_package = True
            if multi_same_package:
                package = package_code_raw
            if package not in prem_dic or not same_package:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }

            prem_dic[package]['code'] = project_code
            prem_dic[package]['name'] = project_name

            parsed_budget = self._parse_money(budget_, headers, 'budget')
            if parsed_budget is not None:
                budget, budget_unit = parsed_budget
                # Rows of the same package with differing budgets are summed.
                if same_package and prem_dic[package]['tendereeMoney'] != budget:
                    prem_dic[package]['tendereeMoney'] += budget
                else:
                    prem_dic[package]['tendereeMoney'] = budget
                prem_dic[package]['tendereeMoneyUnit'] = budget_unit

            if tenderee and not same_package:
                prem_dic[package]['roleList'].append(self._make_role("tenderee", tenderee))
            if tenderer and not same_package:
                bid_amount = 0
                money_unit = ""
                parsed_bid = self._parse_money(bid_amount_, headers, 'bid_amount')
                if parsed_bid is not None:
                    bid_amount, money_unit = parsed_bid
                prem_dic[package]['roleList'].append(
                    self._make_role("win_tenderer", tenderer, bid_amount, money_unit))

            # Records holding only a project code and name are discarded.
            if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:
                prem_dic.pop(package)
        if multi_same_package:
            # Re-key entries stored under a normalised name to their raw text.
            for k, v in package_fix2raw.items():
                if k in prem_dic:
                    prem_dic[v] = prem_dic.pop(k)
        return prem_dic

    def get_prem(self, soup):
        """Extract package records from every table under ``soup``.

        Tables are visited in reverse document order and removed from the
        tree once read.  Results from all tables are merged into one dict;
        later tables overwrite equal package keys.
        """
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        for table in tables:
            trs = self.table2list(table)
            table.extract()
            i = 0
            headers = ""
            while i < len(trs) - 1:
                flag_, headers_ = self.find_header(trs[i])
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    # Collect body rows until the next header-like row or a
                    # row whose column count differs from the header.
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) == len(trs[i]):
                            flag_, headers_ = self.find_header(trs[j])
                            if flag_:
                                break
                            else:
                                table_items.append(trs[j])
                        else:
                            print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                    if len(table_items) > 1:
                        df = pd.DataFrame(table_items)
                        prem_ = self.extract_from_df(df, headers)
                        rs_dic.update(prem_)
                    # Resume at the row that ended the scan (after i += 1).
                    i = j - 1
                i += 1
        return rs_dic

    def predict(self, html):
        """Extract package records from an HTML document.

        The ``div.richTextFetch`` section (attachments) is detached first and
        only consulted when the main body yields nothing.
        """
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        if richText:
            richText = richText.extract()  # keep attachments out of the main pass
        prem = self.get_prem(soup)
        if prem == {} and richText:
            prem = self.get_prem(richText)
        return prem
|
|
|
+
|
|
|
|
|
|
def getSavedModel():
|
|
|
#predictor = FormPredictor()
|