|
@@ -22,6 +22,8 @@ from BiddingKG.dl.interface.Entitys import Entity
|
|
|
from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
|
|
|
from bs4 import BeautifulSoup
|
|
|
import copy
|
|
|
+import calendar
|
|
|
+import datetime
|
|
|
|
|
|
from threading import RLock
|
|
|
dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
@@ -1532,7 +1534,7 @@ class ProductPredictor():
|
|
|
result.append(item) # 修正bug
|
|
|
return result
|
|
|
|
|
|
-# 产品数量单价品牌规格提取
|
|
|
+# 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
|
|
|
class ProductAttributesPredictor():
|
|
|
def __init__(self,):
|
|
|
self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
@@ -1653,6 +1655,67 @@ class ProductAttributesPredictor():
|
|
|
elif len(tds1) > 0 and len(tds1) == indtd - 1:
|
|
|
tds1[indtd - 2].insert_after(copy.copy(td))
|
|
|
|
|
|
def get_monthlen(self, year, month):
    """Return the number of days in the given month, as a string.

    :param year: year, as an int or anything ``int()`` accepts (e.g. ``'2021'``)
    :param month: month (1-12), as an int or anything ``int()`` accepts
    :return: day count as a string without zero padding (e.g. ``'28'``);
             ``'30'`` when the year/month cannot be converted or the month
             is out of range
    """
    try:
        # monthrange returns (weekday of the 1st, days in month); only the
        # day count is needed here.
        num = calendar.monthrange(int(year), int(month))[1]
    except (TypeError, ValueError):
        # int() raises TypeError/ValueError on bad input and monthrange
        # raises IllegalMonthError (a ValueError) for months outside 1-12.
        # Keep the best-effort fallback, but do not swallow unrelated
        # exceptions such as KeyboardInterrupt.
        num = 30
    return str(num)
|
|
|
def fix_time(self, text):
    """Normalize a date or date-range string to ``(order_begin, order_end)``.

    Three input shapes are recognized, tried in this order:

    1. Year and month only (``2021年11月``, ``2021-11``, ``2021/11``,
       ``2021.11``): the result spans the whole month.
    2. A single full date (``2021年11月10日``, ``2021-11-10``, ``2021/11/10``,
       ``2021.11.10``): begin and end are the same day.
    3. A range joined by ``到``, ``至`` or ``-`` (``2021年1月至3月``,
       ``2021年1月5日至2021年3月8日``): a missing end year defaults to the
       start year, a missing start day to the 1st, and a missing end day
       to the last day of the end month.

    :param text: date text, expected to be already stripped of whitespace
    :return: ``(order_begin, order_end)`` formatted as ``YYYY-MM-DD`` with
             zero-padded month and day, or ``("", "")`` when ``text``
             matches none of the shapes above
    """
    # The '.' separator is escaped throughout: left bare it matches any
    # character, so e.g. '202111' would be read as year 2021, month 1.
    month_only = re.search(r'^(\d{4})(?:年|/|\.|-)(\d{1,2})月?$', text)
    if month_only:
        year, month = month_only.groups()
        last_day = self.get_monthlen(year, month)
        month = month.zfill(2)
        order_begin = "%s-%s-01" % (year, month)
        order_end = "%s-%s-%s" % (year, month, last_day.zfill(2))
        return order_begin, order_end

    single_day = re.search(
        r'^(\d{4})(?:年|/|\.|-)(\d{1,2})(?:月|/|\.|-)(\d{1,2})日?$', text)
    if single_day:
        year, month, day = single_day.groups()
        # Rebuild from the captured parts so every accepted separator
        # (including '.') ends up as '-' and month/day are zero-padded.
        order_begin = "%s-%s-%s" % (year, month.zfill(2), day.zfill(2))
        return order_begin, order_begin

    # The pattern is anchored at both ends, so at most one match exists.
    span = re.search(
        r'^(?P<y1>\d{4})(?:年|/|\.)(?P<m1>\d{1,2})'
        r'(?:(?:月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
        r'(?:到|至|-)'
        r'(?:(?P<y2>\d{4})(?:年|/|\.))?(?P<m2>\d{1,2})'
        r'(?:(?:月|/|\.)(?:(?P<d2>\d{1,2})日)?)?$', text)
    if span is None:
        return "", ""

    y1 = span.group('y1')
    m1 = span.group('m1')
    m2 = span.group('m2')
    y2 = span.group('y2') or y1
    d1 = span.group('d1') or '1'
    # Only look up the month length when the end day is absent.
    d2 = span.group('d2') or self.get_monthlen(y2, m2)

    order_begin = "%s-%s-%s" % (y1, m1.zfill(2), d1.zfill(2))
    order_end = "%s-%s-%s" % (y2, m2.zfill(2), d2.zfill(2))
    return order_begin, order_end
|
|
|
+
|
|
|
def find_header(self, items, p1, p2):
|
|
|
'''
|
|
|
inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容
|
|
@@ -1662,12 +1725,16 @@ class ProductAttributesPredictor():
|
|
|
:return: 表头所在列序号,是否表头,表头内容
|
|
|
'''
|
|
|
flag = False
|
|
|
- header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': ''}
|
|
|
+ header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
|
|
|
product = "" # 产品
|
|
|
quantity = "" # 数量
|
|
|
unitPrice = "" # 单价
|
|
|
brand = "" # 品牌
|
|
|
specs = "" # 规格
|
|
|
+ demand = "" # 采购需求
|
|
|
+ budget = "" # 预算金额
|
|
|
+ order_time = "" # 采购时间
|
|
|
+
|
|
|
for i in range(min(4, len(items))):
|
|
|
it = items[i]
|
|
|
if len(it) < 15 and re.search(p1, it) != None:
|
|
@@ -1700,15 +1767,28 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('规格', items[j]):
|
|
|
header_dic['规格'] = j
|
|
|
specs = items[j]
|
|
|
- if header_dic.get('名称', "") != "" and (header_dic.get('数量', "") != "" or header_dic.get('单价', "") != ""
|
|
|
- or header_dic.get('品牌', "") != "" or header_dic.get('规格',
|
|
|
- "") != ""):
|
|
|
- return header_dic, flag, (product, quantity, unitPrice, brand, specs)
|
|
|
|
|
|
+ elif re.search('需求', items[j]):
|
|
|
+ header_dic['需求'] = j
|
|
|
+ demand = items[j]
|
|
|
+ elif re.search('预算', items[j]):
|
|
|
+ header_dic['预算'] = j
|
|
|
+ budget = items[j]
|
|
|
+ elif re.search('时间', items[j]):
|
|
|
+ header_dic['时间'] = j
|
|
|
+ order_time = items[j]
|
|
|
+
|
|
|
+ if header_dic.get('名称', "") != "" :
|
|
|
+ num = 0
|
|
|
+ for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
|
|
|
+ if it != "":
|
|
|
+ num += 1
|
|
|
+ if num >=2:
|
|
|
+ return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
|
|
|
flag = False
|
|
|
- return header_dic, flag, (product, quantity, unitPrice, brand, specs)
|
|
|
+ return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
|
|
|
|
|
|
- def predict(self, docid='', html=''):
|
|
|
+ def predict(self, docid='', html='', page_time=""):
|
|
|
'''
|
|
|
正则寻找table表格内 产品相关信息
|
|
|
:param html:公告HTML原文
|
|
@@ -1719,9 +1799,16 @@ class ProductAttributesPredictor():
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
tables = soup.find_all(['table'])
|
|
|
headers = []
|
|
|
+ headers_demand = []
|
|
|
header_col = []
|
|
|
product_link = []
|
|
|
- for table in tables:
|
|
|
+ demand_link = []
|
|
|
+ for i in range(len(tables)-1, -1, -1):
|
|
|
+ table = tables[i]
|
|
|
+ if table.parent.name == 'td' and len(table.find_all('td')) == 1:
|
|
|
+ table.string = table.get_text()
|
|
|
+ table.name = 'turntable'
|
|
|
+ continue
|
|
|
if not self.isTrueTable(table):
|
|
|
continue
|
|
|
self.fixSpan(table)
|
|
@@ -1740,10 +1827,17 @@ class ProductAttributesPredictor():
|
|
|
unitPrice = "" # 单价
|
|
|
brand = "" # 品牌
|
|
|
specs = "" # 规格
|
|
|
+ demand = "" # 采购需求
|
|
|
+ budget = "" # 预算金额
|
|
|
+ order_time = "" # 采购时间
|
|
|
+ order_begin = ""
|
|
|
+ order_end = ""
|
|
|
+
|
|
|
if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
- header_dic, found_header, header_list = self.find_header(tds, self.p1, self.p2)
|
|
|
+ header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
|
|
|
if found_header:
|
|
|
headers.append('_'.join(header_list))
|
|
|
+ headers_demand.append('_'.join(header_list2))
|
|
|
header_colnum = len(tds)
|
|
|
header_col.append('_'.join(tds))
|
|
|
i += 1
|
|
@@ -1757,6 +1851,10 @@ class ProductAttributesPredictor():
|
|
|
id3 = header_dic.get('单价', "")
|
|
|
id4 = header_dic.get('品牌', "")
|
|
|
id5 = header_dic.get('规格', "")
|
|
|
+
|
|
|
+ id6 = header_dic.get('需求', "")
|
|
|
+ id7 = header_dic.get('预算', "")
|
|
|
+ id8 = header_dic.get('时间', "")
|
|
|
if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
|
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
|
|
|
product = tds[id1]
|
|
@@ -1768,8 +1866,10 @@ class ProductAttributesPredictor():
|
|
|
if id3 != "":
|
|
|
if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
|
unitPrice = tds[id3]
|
|
|
- if '万元' in header_list[2] and '万元' not in unitPrice:
|
|
|
+ if '万元' in header_list[2] and '万' not in unitPrice:
|
|
|
unitPrice += '万元'
|
|
|
+ unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
|
|
|
+ unitPrice = str(getUnifyMoney(unitPrice))
|
|
|
else:
|
|
|
unitPrice = ""
|
|
|
if id4 != "":
|
|
@@ -1782,16 +1882,74 @@ class ProductAttributesPredictor():
|
|
|
specs = tds[id5]
|
|
|
else:
|
|
|
specs = ""
|
|
|
+ if id6 != "":
|
|
|
+ if re.search('\w', tds[id6]):
|
|
|
+ demand = tds[id6]
|
|
|
+ else:
|
|
|
+ demand = ""
|
|
|
+ if id7 != "":
|
|
|
+ if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
|
|
|
+ budget = tds[id7]
|
|
|
+ if '万元' in header_list2[2] and '万' not in budget:
|
|
|
+ budget += '万元'
|
|
|
+ budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
|
|
|
+ budget = str(getUnifyMoney(budget))
|
|
|
+ else:
|
|
|
+ budget = ""
|
|
|
+ if id8 != "":
|
|
|
+ if re.search('\w', tds[id8]):
|
|
|
+ order_time = tds[id8].strip()
|
|
|
+ if re.search('^\d{1,2}月$', order_time):
|
|
|
+ m = re.search('^(\d{1,2})月$', order_time).group(1)
|
|
|
+ if len(m) < 2:
|
|
|
+ m = '0'+m
|
|
|
+ year = re.search('(\d{4})年(.{,12}采购意向)?', html)
|
|
|
+ if year:
|
|
|
+ y = year.group(1)
|
|
|
+ num = self.get_monthlen(y, m)
|
|
|
+ if len(num)<2:
|
|
|
+ num = '0'+num
|
|
|
+ order_begin = "%s.%s.01" % (y, m)
|
|
|
+ order_end = "%s.%s.%s" % (y, m, num)
|
|
|
+ elif page_time!="":
|
|
|
+ year = re.search('\d{4}', page_time)
|
|
|
+ if year:
|
|
|
+ y = year.group(0)
|
|
|
+ num = self.get_monthlen(y, m)
|
|
|
+ if len(num) < 2:
|
|
|
+ num = '0'+num
|
|
|
+ order_begin = "%s.%s.01" % (y, m)
|
|
|
+ order_end = "%s.%s.%s" % (y, m, num)
|
|
|
+ else:
|
|
|
+ y = str(datetime.datetime.now().year)
|
|
|
+ num = self.get_monthlen(y, m)
|
|
|
+ if len(num) < 2:
|
|
|
+ num = '0'+num
|
|
|
+ order_begin = "%s.%s.01" % (y, m)
|
|
|
+ order_end = "%s.%s.%s" % (y, m, num)
|
|
|
+ else:
|
|
|
+ order_begin, order_end = self.fix_time(order_time)
|
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
- # link = "{0}\t{1}\t{2}\t{3}\t{4}".format(product, quantity, unitPrice, brand, specs)
|
|
|
link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
|
|
|
- 'brand': brand[:50], 'speces': specs[:100]}
|
|
|
+ 'brand': brand[:50], 'specs':specs}
|
|
|
if link not in product_link:
|
|
|
product_link.append(link)
|
|
|
+ if budget != "" and order_time != "" :
|
|
|
+ link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
|
|
|
+ if link not in demand_link:
|
|
|
+ demand_link.append(link)
|
|
|
i += 1
|
|
|
else:
|
|
|
i += 1
|
|
|
- return [{'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}]
|
|
|
+ if len(product_link)>0:
|
|
|
+ attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
|
|
|
+ else:
|
|
|
+ attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
|
|
|
+ if len(demand_link)>0:
|
|
|
+ demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
|
|
|
+ else:
|
|
|
+ demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
|
|
|
+ return [attr_dic, demand_dic]
|
|
|
|
|
|
# docchannel类型提取
|
|
|
class DocChannel():
|