|
@@ -2538,8 +2538,8 @@ class ProductPredictor():
|
|
# 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
|
|
# 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
|
|
class ProductAttributesPredictor():
|
|
class ProductAttributesPredictor():
|
|
def __init__(self,):
|
|
def __init__(self,):
|
|
- self.p0 = '(品目|类别|类型|物类|目录|^品名|^品类)(名称|$)|(标项|项目|计划|标段|[分子]?包|子目|服务|招标|工程|招标内容)(名称|内容|描述)'
|
|
|
|
- self.p1 = '(标的|维修|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名)[\))的]?(名称|内容|描述)'
|
|
|
|
|
|
+ self.p0 = '(品目|类别|类型|物类|目录|类目|分类|^品名(及规格)?|^品类)(名称|$)|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|工程|招标内容)(名称|内容|描述)'
|
|
|
|
+ self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?(名称|内容|描述)'
|
|
self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称'
|
|
self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称'
|
|
# self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
# self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
# self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
|
|
# self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
|
|
@@ -2554,10 +2554,13 @@ class ProductAttributesPredictor():
|
|
if table.find_all(['caption', 'th']) != []:
|
|
if table.find_all(['caption', 'th']) != []:
|
|
return True
|
|
return True
|
|
elif len(table.find_all(['form', 'a', 'img'])) > 5:
|
|
elif len(table.find_all(['form', 'a', 'img'])) > 5:
|
|
|
|
+ # print('过滤表格:包含链接图片等大于5的为假表格')
|
|
return False
|
|
return False
|
|
elif len(table.find_all(['tr'])) < 2:
|
|
elif len(table.find_all(['tr'])) < 2:
|
|
|
|
+ # print('过滤表格:行数小于2的为假表格')
|
|
return False
|
|
return False
|
|
elif len(table.find_all(['table'])) >= 1:
|
|
elif len(table.find_all(['table'])) >= 1:
|
|
|
|
+ # print('过滤表格:包含多个表格的为假表格')
|
|
return False
|
|
return False
|
|
else:
|
|
else:
|
|
return True
|
|
return True
|
|
@@ -2820,6 +2823,7 @@ class ProductAttributesPredictor():
|
|
:param p2: 第二表头正则
|
|
:param p2: 第二表头正则
|
|
:return: 表头所在列序号,是否表头,表头内容
|
|
:return: 表头所在列序号,是否表头,表头内容
|
|
'''
|
|
'''
|
|
|
|
+ items = [re.sub('\s', '', it) for it in items]
|
|
flag = False
|
|
flag = False
|
|
header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
|
|
header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
|
|
product = "" # 产品
|
|
product = "" # 产品
|
|
@@ -2845,7 +2849,7 @@ class ProductAttributesPredictor():
|
|
flag = True
|
|
flag = True
|
|
product = it
|
|
product = it
|
|
header_dic['名称'] = i
|
|
header_dic['名称'] = i
|
|
- break
|
|
|
|
|
|
+ # break
|
|
# if not flag:
|
|
# if not flag:
|
|
if product == "":
|
|
if product == "":
|
|
for i in range(min(4, len(items))):
|
|
for i in range(min(4, len(items))):
|
|
@@ -2856,6 +2860,11 @@ class ProductAttributesPredictor():
|
|
product = it
|
|
product = it
|
|
header_dic['名称'] = i
|
|
header_dic['名称'] = i
|
|
break
|
|
break
|
|
|
|
+ if flag == False and len(items)>3 and re.search('^第[一二三四五六七八九十](包|标段)$', items[0]):
|
|
|
|
+ product = items[0]
|
|
|
|
+ header_dic['名称'] = 0
|
|
|
|
+ flag = True
|
|
|
|
+
|
|
if flag:
|
|
if flag:
|
|
# for j in range(i + 1, len(items)):
|
|
# for j in range(i + 1, len(items)):
|
|
for j in range(len(items)):
|
|
for j in range(len(items)):
|
|
@@ -2891,7 +2900,7 @@ class ProductAttributesPredictor():
|
|
elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
|
|
elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
|
|
header_dic['时间'] = j
|
|
header_dic['时间'] = j
|
|
order_time = items[j]
|
|
order_time = items[j]
|
|
- elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计))?(金额|价格?)', items[j]):
|
|
|
|
|
|
+ elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价', items[j]):
|
|
header_dic['总价'] = j
|
|
header_dic['总价'] = j
|
|
total_price = items[j]
|
|
total_price = items[j]
|
|
|
|
|
|
@@ -2915,6 +2924,8 @@ class ProductAttributesPredictor():
|
|
'''
|
|
'''
|
|
|
|
|
|
html = html.replace('<br>', '\n').replace('<br/>', '\n')
|
|
html = html.replace('<br>', '\n').replace('<br/>', '\n')
|
|
|
|
+ html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
|
|
+ html = re.sub("##attachment##","",html)
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
# flag_yx = True if re.search('采购意向', html) else False
|
|
# flag_yx = True if re.search('采购意向', html) else False
|
|
flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
|
|
flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
|
|
@@ -2926,12 +2937,15 @@ class ProductAttributesPredictor():
|
|
demand_link = []
|
|
demand_link = []
|
|
product_set = set()
|
|
product_set = set()
|
|
total_product_money = 0
|
|
total_product_money = 0
|
|
- unit_price_list = []
|
|
|
|
|
|
+ unit_price_list = [] # 单价列表,用于判断是否重复单价,避免多个表格重复提取造成合计产品价格错误。
|
|
|
|
+ total_price_list = [] # 总价列表,用于判断是否为几行产品合计总价
|
|
|
|
+ # print('表格数:', len(tables))
|
|
for i in range(len(tables)-1, -1, -1):
|
|
for i in range(len(tables)-1, -1, -1):
|
|
table = tables[i]
|
|
table = tables[i]
|
|
if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
|
|
if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
|
|
table.string = table.get_text()
|
|
table.string = table.get_text()
|
|
table.name = 'turntable'
|
|
table.name = 'turntable'
|
|
|
|
+ # print('过滤表格:表格父节点为td,且表格td数量小于等于3')
|
|
continue
|
|
continue
|
|
if not self.isTrueTable(table):
|
|
if not self.isTrueTable(table):
|
|
continue
|
|
continue
|
|
@@ -2943,6 +2957,7 @@ class ProductAttributesPredictor():
|
|
header_quan_unit = "" # 数量表头 包含单位
|
|
header_quan_unit = "" # 数量表头 包含单位
|
|
header_colnum = 0
|
|
header_colnum = 0
|
|
if flag_yx:
|
|
if flag_yx:
|
|
|
|
+ # print('意向公告, 提取意向信息')
|
|
col0_l = []
|
|
col0_l = []
|
|
col1_l = []
|
|
col1_l = []
|
|
for tds in inner_table:
|
|
for tds in inner_table:
|
|
@@ -2995,9 +3010,10 @@ class ProductAttributesPredictor():
|
|
continue
|
|
continue
|
|
while i < (len(inner_table)):
|
|
while i < (len(inner_table)):
|
|
tds = inner_table[i]
|
|
tds = inner_table[i]
|
|
- not_empty = [it for it in tds if it != ""]
|
|
|
|
- if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: # 一半列是空的或者小于两列的 继续
|
|
|
|
|
|
+ not_empty = [it for it in tds if re.sub('\s', '', it) != ""]
|
|
|
|
+ if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总', tds[0])): # 非空列或者不重复内容小于两列的 继续
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('表格产品提取:非空列或者不重复内容小于两列的 继续', i, tds)
|
|
continue
|
|
continue
|
|
product = "" # 产品
|
|
product = "" # 产品
|
|
quantity = "" # 数量
|
|
quantity = "" # 数量
|
|
@@ -3017,6 +3033,7 @@ class ProductAttributesPredictor():
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
if found_header:
|
|
if found_header:
|
|
header_colnum = len(tds) # 保存表头所在行列数
|
|
header_colnum = len(tds) # 保存表头所在行列数
|
|
|
|
+ # print('发现表头:', header_colnum, header_dic)
|
|
if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
|
|
if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
|
|
quantity_header = header_list[1].replace('单位:', '')
|
|
quantity_header = header_list[1].replace('单位:', '')
|
|
if re.search('(([\w/]{,5}))', quantity_header):
|
|
if re.search('(([\w/]{,5}))', quantity_header):
|
|
@@ -3029,10 +3046,12 @@ class ProductAttributesPredictor():
|
|
headers_demand.append('_'.join(header_list2))
|
|
headers_demand.append('_'.join(header_list2))
|
|
header_col.append('_'.join(tds))
|
|
header_col.append('_'.join(tds))
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('表头数量占行列数0.2倍不做内容匹配', set([re.sub('[::]','',td) for td in tds]) & self.header_set)
|
|
continue
|
|
continue
|
|
elif found_header:
|
|
elif found_header:
|
|
if len(tds) != header_colnum: # 表头、属性列数不一致跳过
|
|
if len(tds) != header_colnum: # 表头、属性列数不一致跳过
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('表头、属性列数不一致跳过', len(tds), header_colnum, tds)
|
|
continue
|
|
continue
|
|
id0 = header_dic.get('品目', "")
|
|
id0 = header_dic.get('品目', "")
|
|
id1 = header_dic.get('名称', "")
|
|
id1 = header_dic.get('名称', "")
|
|
@@ -3053,11 +3072,13 @@ class ProductAttributesPredictor():
|
|
for k, v in header_dic.items():
|
|
for k, v in header_dic.items():
|
|
if isinstance(v, int):
|
|
if isinstance(v, int):
|
|
if v >= len(tds) or tds[v] in self.header_set:
|
|
if v >= len(tds) or tds[v] in self.header_set:
|
|
|
|
+ # print('内容属性在表头集合里面', tds[v], v >= len(tds))
|
|
not_attr = 1
|
|
not_attr = 1
|
|
- break
|
|
|
|
- if not_attr: # 只要属性里面有一项为表头,停止匹配
|
|
|
|
|
|
+ # break
|
|
|
|
+ if not_attr>=2: # 只要属性里面有两项为表头,停止匹配
|
|
i += 1
|
|
i += 1
|
|
found_header = False
|
|
found_header = False
|
|
|
|
+ # print('只要属性里面有两项为表头,停止匹配')
|
|
continue
|
|
continue
|
|
|
|
|
|
if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
|
|
if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
|
|
@@ -3067,17 +3088,20 @@ class ProductAttributesPredictor():
|
|
if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
|
|
if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id0]) == None:
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id0]) == None:
|
|
category = tds[id0]
|
|
category = tds[id0]
|
|
- product = "%s_%s"%(category, product) if product!="" else category
|
|
|
|
|
|
+ product = "%s_%s"%(category, product) if product!="" and product!=category else category
|
|
|
|
|
|
if product != "":
|
|
if product != "":
|
|
|
|
+ # print('匹配产品内容: ', product)
|
|
if id2 != "":
|
|
if id2 != "":
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
|
|
- if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]):
|
|
|
|
- i += 1
|
|
|
|
- continue
|
|
|
|
|
|
+ # if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]): # 254816100 这篇数量很大,貌似正常
|
|
|
|
+ # i += 1
|
|
|
|
+ # print('过滤:数量包含金额单位或值很大类似金额', tds[id2])
|
|
|
|
+ # continue
|
|
quantity = tds[id2]
|
|
quantity = tds[id2]
|
|
elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
|
|
elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('过滤:数量包含五个字符以上且不包含^详见|^略等字符')
|
|
continue
|
|
continue
|
|
if id2_2 != "":
|
|
if id2_2 != "":
|
|
if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
|
|
if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
|
|
@@ -3087,8 +3111,9 @@ class ProductAttributesPredictor():
|
|
unitPrice = tds[id3]
|
|
unitPrice = tds[id3]
|
|
elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
|
|
elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
|
|
unitPrice = tds[id3]
|
|
unitPrice = tds[id3]
|
|
- elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id3])) > 5:
|
|
|
|
|
|
+ elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id3])) > 5 and re.search('^详见|^略', tds[id3])==None:
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('过滤:产品单价包含金额外的字符数大于5个')
|
|
continue
|
|
continue
|
|
if id4 != "":
|
|
if id4 != "":
|
|
if re.search('\w', tds[id4]):
|
|
if re.search('\w', tds[id4]):
|
|
@@ -3121,13 +3146,15 @@ class ProductAttributesPredictor():
|
|
total_price = tds[id9]
|
|
total_price = tds[id9]
|
|
elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
|
|
elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
|
|
total_price = tds[id9]
|
|
total_price = tds[id9]
|
|
- elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id9])) > 5:
|
|
|
|
|
|
+ elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id9])) > 5 and re.search('^详见|^略', tds[id9])==None:
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('过滤:产品总价包含金额外的字符数大于5个')
|
|
continue
|
|
continue
|
|
if id10 != "":
|
|
if id10 != "":
|
|
parameter = tds[id10][:500]
|
|
parameter = tds[id10][:500]
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
parameter = ""
|
|
parameter = ""
|
|
|
|
+ # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
products = re.split('[;;、,\n]', tds[id1])
|
|
products = re.split('[;;、,\n]', tds[id1])
|
|
@@ -3149,13 +3176,14 @@ class ProductAttributesPredictor():
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
if unitPrice != "":
|
|
if unitPrice != "":
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
- unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
|
|
if budget != "":
|
|
if budget != "":
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
- budget = str(budget) if budget != 0 else ''
|
|
|
|
|
|
+ budget = str(budget) if budget != 0 and budget<50000000000 else ''
|
|
if total_price != "":
|
|
if total_price != "":
|
|
total_price, _money_unit = money_process(total_price, header_list[6])
|
|
total_price, _money_unit = money_process(total_price, header_list[6])
|
|
- total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
|
|
|
+ total_price_list.append(total_price)
|
|
|
|
+ total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
|
|
link = {'product': product, 'quantity': quantity,
|
|
link = {'product': product, 'quantity': quantity,
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
|
|
'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
|
|
@@ -3182,8 +3210,9 @@ class ProductAttributesPredictor():
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
link['unitPrice'], link['quantity']))
|
|
link['unitPrice'], link['quantity']))
|
|
|
|
|
|
- elif len(unitPrice) > 15 or len(product)>100: # 单价大于15位数或 产品名称长于100字
|
|
|
|
|
|
+ elif len(product)>100: # 产品名称长于100字
|
|
i += 1
|
|
i += 1
|
|
|
|
+ # print('过滤: 产品名称长于100字',)
|
|
continue
|
|
continue
|
|
else:
|
|
else:
|
|
if quantity != "":
|
|
if quantity != "":
|
|
@@ -3191,13 +3220,14 @@ class ProductAttributesPredictor():
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
if unitPrice != "":
|
|
if unitPrice != "":
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
- unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
|
|
if budget != "":
|
|
if budget != "":
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
- budget = str(budget) if budget != 0 else ''
|
|
|
|
|
|
+ budget = str(budget) if budget != 0 and budget<50000000000 else ''
|
|
if total_price != "":
|
|
if total_price != "":
|
|
total_price, _money_unit = money_process(total_price, header_list[6])
|
|
total_price, _money_unit = money_process(total_price, header_list[6])
|
|
- total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
|
|
|
+ total_price_list.append(total_price)
|
|
|
|
+ total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
|
|
link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
|
|
'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
|
|
|
|
|
|
@@ -3210,8 +3240,8 @@ class ProductAttributesPredictor():
|
|
# except:
|
|
# except:
|
|
# log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
# log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
|
|
|
|
- if (product, specs, unitPrice, quantity) not in product_set:
|
|
|
|
- product_set.add((product, specs, unitPrice, quantity))
|
|
|
|
|
|
+ if (product, unitPrice, quantity) not in product_set:
|
|
|
|
+ product_set.add((product, unitPrice, quantity))
|
|
product_link.append(link)
|
|
product_link.append(link)
|
|
if link['unitPrice'] != "" and link['quantity'] != '':
|
|
if link['unitPrice'] != "" and link['quantity'] != '':
|
|
try:
|
|
try:
|
|
@@ -3236,6 +3266,18 @@ class ProductAttributesPredictor():
|
|
i += 1
|
|
i += 1
|
|
else:
|
|
else:
|
|
i += 1
|
|
i += 1
|
|
|
|
+ if len(total_price_list)>0 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
|
|
|
|
+ # print('总价一半以上重复的为多行一个总价,需去掉')
|
|
|
|
+ for link in product_link:
|
|
|
|
+ if 'total_price' in link:
|
|
|
|
+ link['total_price'] = ""
|
|
|
|
+ if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5: # 2023/7/18 如果单价重复率高不算总产品价避免错误
|
|
|
|
+ # print('如果单价重复率高不算总产品价避免错误')
|
|
|
|
+ total_product_money = 0
|
|
|
|
+ for link in product_link:
|
|
|
|
+ if 'unitPrice' in link:
|
|
|
|
+ link['unitPrice'] = ""
|
|
|
|
+
|
|
if len(product_link)>0:
|
|
if len(product_link)>0:
|
|
attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
|
|
attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
|
|
else:
|
|
else:
|
|
@@ -3244,8 +3286,7 @@ class ProductAttributesPredictor():
|
|
demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
|
|
demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
|
|
else:
|
|
else:
|
|
demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
|
|
demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
|
|
- if len(unit_price_list)>0 and len(set(unit_price_list))/len(unit_price_list)<=0.5: # 2023/7/18 如果单价重复率高不算总产品价避免错误
|
|
|
|
- total_product_money = 0
|
|
|
|
|
|
+ # print('表格产品属性提取:', attr_dic)
|
|
return [attr_dic, demand_dic], total_product_money
|
|
return [attr_dic, demand_dic], total_product_money
|
|
|
|
|
|
def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
|
|
def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
|
|
@@ -3283,6 +3324,7 @@ class ProductAttributesPredictor():
|
|
'order_begin': order_begin, 'order_end': order_end}
|
|
'order_begin': order_begin, 'order_end': order_end}
|
|
_data.append(link)
|
|
_data.append(link)
|
|
product_attrs[1]['demand_info']['data'] = _data
|
|
product_attrs[1]['demand_info']['data'] = _data
|
|
|
|
+ # print('predict_without_table: ', product_attrs)
|
|
return product_attrs
|
|
return product_attrs
|
|
|
|
|
|
def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
|
|
def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
|
|
@@ -3312,7 +3354,12 @@ class ProductAttributesPredictor():
|
|
begin_list = [0]
|
|
begin_list = [0]
|
|
for index,head in enumerate(head_list):
|
|
for index,head in enumerate(head_list):
|
|
if head not in loop_list:
|
|
if head not in loop_list:
|
|
- loop_list.append(head)
|
|
|
|
|
|
+ if re.search('第[一二三四五六七八九十](包|标段)', head) and re.search('第[一二三四五六七八九十](包|标段)', '|'.join(loop_list)):
|
|
|
|
+ begin_list.append(index)
|
|
|
|
+ loop_list = []
|
|
|
|
+ loop_list.append(head)
|
|
|
|
+ else:
|
|
|
|
+ loop_list.append(head)
|
|
else:
|
|
else:
|
|
begin_list.append(index)
|
|
begin_list.append(index)
|
|
loop_list = []
|
|
loop_list = []
|
|
@@ -3454,7 +3501,7 @@ class ProductAttributesPredictor():
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
parameter = ""
|
|
parameter = ""
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
- if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
|
|
|
|
|
|
+ if id1 != "" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
|
|
re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
products = re.split('[;;、,\n]', deal_list[id1])
|
|
products = re.split('[;;、,\n]', deal_list[id1])
|
|
quantitys = re.split('[;;、,\n]', deal_list[id2])
|
|
quantitys = re.split('[;;、,\n]', deal_list[id2])
|
|
@@ -3478,14 +3525,14 @@ class ProductAttributesPredictor():
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
if unitPrice != "":
|
|
if unitPrice != "":
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
- unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
|
|
if budget != "":
|
|
if budget != "":
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
- budget = str(budget) if budget != 0 else ''
|
|
|
|
|
|
+ budget = str(budget) if budget != 0 and budget<50000000000 else ''
|
|
if total_price != "":
|
|
if total_price != "":
|
|
total_price, _money_unit = money_process(total_price,
|
|
total_price, _money_unit = money_process(total_price,
|
|
header_list[6])
|
|
header_list[6])
|
|
- total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
|
|
|
+ total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
|
|
link = {'product': product, 'quantity': quantity,
|
|
link = {'product': product, 'quantity': quantity,
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'brand': brand[:50], 'specs': specs, 'total_price': total_price,
|
|
'brand': brand[:50], 'specs': specs, 'total_price': total_price,
|
|
@@ -3512,13 +3559,13 @@ class ProductAttributesPredictor():
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
if unitPrice != "":
|
|
if unitPrice != "":
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
- unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
|
|
if budget != "":
|
|
if budget != "":
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
budget, _money_unit = money_process(budget, header_list2[2])
|
|
- budget = str(budget) if budget != 0 else ''
|
|
|
|
|
|
+ budget = str(budget) if budget != 0 and budget<50000000000 else ''
|
|
if total_price != "":
|
|
if total_price != "":
|
|
total_price, _money_unit = money_process(total_price, header_list[6])
|
|
total_price, _money_unit = money_process(total_price, header_list[6])
|
|
- total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
|
|
|
+ total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
|
|
link = {'product': product, 'quantity': quantity,
|
|
link = {'product': product, 'quantity': quantity,
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
'brand': brand[:50], 'specs': specs, 'total_price': total_price,
|
|
'brand': brand[:50], 'specs': specs, 'total_price': total_price,
|
|
@@ -3562,6 +3609,7 @@ class ProductAttributesPredictor():
|
|
product_attrs[1] = demand_dic
|
|
product_attrs[1] = demand_dic
|
|
if get_product_attrs:
|
|
if get_product_attrs:
|
|
break
|
|
break
|
|
|
|
+ # print('predict_by_text: ', product_attrs)
|
|
return product_attrs
|
|
return product_attrs
|
|
|
|
|
|
|
|
|
|
@@ -3617,6 +3665,7 @@ class DocChannel():
|
|
}
|
|
}
|
|
self.life_dic = {
|
|
self.life_dic = {
|
|
'采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
|
|
'采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
|
|
|
|
+ '采购意向neg': '发布政府采购意向|采购意向公告已于',
|
|
'招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
|
|
'招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
|
|
'招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
|
|
'招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
|
|
'资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
|
|
'资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
|
|
@@ -4009,6 +4058,10 @@ class DocChannel():
|
|
elif life_score[k] == max_score and life_score[k] > 0:
|
|
elif life_score[k] == max_score and life_score[k] > 0:
|
|
life_list.append(k)
|
|
life_list.append(k)
|
|
if '采购意向' in life_kw_title or '采购意向' in life_list:
|
|
if '采购意向' in life_kw_title or '采购意向' in life_list:
|
|
|
|
+ if '中标信息' in life_kw_title or '中标信息' in life_list:
|
|
|
|
+ return '中标信息', msc
|
|
|
|
+ elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
|
|
|
|
+ return '', msc
|
|
return '采购意向', msc
|
|
return '采购意向', msc
|
|
elif '招标预告' in life_kw_title or '招标预告' in life_list:
|
|
elif '招标预告' in life_kw_title or '招标预告' in life_list:
|
|
if '中标信息' in life_kw_title or '中标信息' in life_list:
|
|
if '中标信息' in life_kw_title or '中标信息' in life_list:
|
|
@@ -5395,7 +5448,7 @@ class TablePremExtractor(object):
|
|
prem_dic[package]['name'] = project_name
|
|
prem_dic[package]['name'] = project_name
|
|
|
|
|
|
if budget_ != "":
|
|
if budget_ != "":
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
@@ -5425,7 +5478,7 @@ class TablePremExtractor(object):
|
|
"serviceTime": ""
|
|
"serviceTime": ""
|
|
})
|
|
})
|
|
if tenderer and not same_package:
|
|
if tenderer and not same_package:
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
|
|
|
|
@@ -5696,7 +5749,7 @@ class CandidateExtractor(object):
|
|
header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
|
|
header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
money, money_unit = money_process(text, header)
|
|
money, money_unit = money_process(text, header)
|
|
@@ -5731,7 +5784,7 @@ class CandidateExtractor(object):
|
|
'tendereeMoney': 0,
|
|
'tendereeMoney': 0,
|
|
'tendereeMoneyUnit': ""
|
|
'tendereeMoneyUnit': ""
|
|
}
|
|
}
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
break
|
|
break
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
|
|
|