|
@@ -3235,7 +3235,7 @@ class ProductAttributesPredictor():
|
|
|
list_sentence = list_sentences[0]
|
|
|
list_entity = list_entitys[0]
|
|
|
_data = product_attrs[1]['demand_info']['data']
|
|
|
- re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
|
|
|
+ re_bidding_time = re.compile("(采购|采购实施|预计招标)(时间|月份|日期)[::,].{0,2}$")
|
|
|
order_times = []
|
|
|
for entity in list_entity:
|
|
|
if entity.entity_type=='time':
|
|
@@ -3266,7 +3266,8 @@ class ProductAttributesPredictor():
|
|
|
product_attrs[1]['demand_info']['data'] = _data
|
|
|
return product_attrs
|
|
|
|
|
|
- def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
|
|
|
+ def predict_by_text(self,product_attrs,html,list_outlines,product_list,page_time=""):
|
|
|
+ product_entity_list = list(set(product_list))
|
|
|
list_outline = list_outlines[0]
|
|
|
get_product_attrs = False
|
|
|
for _outline in list_outline:
|
|
@@ -3281,12 +3282,22 @@ class ProductAttributesPredictor():
|
|
|
for key_value in key_value_list:
|
|
|
key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value)
|
|
|
temp = re.split("[::]",key_value)
|
|
|
- key = temp[-2]
|
|
|
+ if len(temp)>2:
|
|
|
+ if temp[0] in head_list:
|
|
|
+ key = temp[0]
|
|
|
+ value = "".join(temp[1:])
|
|
|
+ else:
|
|
|
+ key = temp[-2]
|
|
|
+ value = temp[-1]
|
|
|
+ else:
|
|
|
+ key = temp[0]
|
|
|
+ value = temp[1]
|
|
|
+
|
|
|
key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key)
|
|
|
- value = temp[-1]
|
|
|
head_list.append(key)
|
|
|
head_value_list.append(value)
|
|
|
head_set = set(head_list)
|
|
|
+ # print(head_list,head_value_list)
|
|
|
# print('head_set',head_set)
|
|
|
if len(head_set & self.header_set) > len(head_set)*0.2:
|
|
|
loop_list = []
|
|
@@ -3298,6 +3309,7 @@ class ProductAttributesPredictor():
|
|
|
begin_list.append(index)
|
|
|
loop_list = []
|
|
|
loop_list.append(head)
|
|
|
+ # print(begin_list)
|
|
|
headers = []
|
|
|
headers_demand = []
|
|
|
header_col = []
|
|
@@ -3324,8 +3336,9 @@ class ProductAttributesPredictor():
|
|
|
order_end = ""
|
|
|
total_price = "" # 总金额
|
|
|
parameter = "" # 参数
|
|
|
-
|
|
|
+ # print(tmp_head_list)
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
|
|
|
+ # print(found_header,header_list, header_list2)
|
|
|
if found_header:
|
|
|
headers.append('_'.join(header_list))
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
@@ -3353,6 +3366,14 @@ class ProductAttributesPredictor():
|
|
|
category = deal_list[id0]
|
|
|
product = "%s_%s" % (category, product) if product != "" else category
|
|
|
|
|
|
+ if product == "":
|
|
|
+ # print(deal_list[id4],deal_list[id5],tmp_head_list,deal_list)
|
|
|
+ if deal_list[id4] != "" or deal_list[id5] != "":
|
|
|
+ for head,value in zip(tmp_head_list,deal_list):
|
|
|
+ if value in product_entity_list:
|
|
|
+ product = value
|
|
|
+ break
|
|
|
+
|
|
|
if product != "":
|
|
|
if id2 != "":
|
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
|
|
@@ -3422,7 +3443,7 @@ class ProductAttributesPredictor():
|
|
|
if float(budget)>= 100000*10000:
|
|
|
budget = ""
|
|
|
if id8 != "":
|
|
|
- if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
|
|
|
+ if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计招标)(时间|月份|日期)",header_list2[3]):
|
|
|
order_time = deal_list[id8].strip()
|
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
if id9 != "":
|
|
@@ -3529,6 +3550,11 @@ class ProductAttributesPredictor():
|
|
|
if link not in demand_link:
|
|
|
demand_link.append(link)
|
|
|
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
if len(product_link) > 0:
|
|
|
attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}
|
|
|
get_product_attrs = True
|
|
@@ -3549,11 +3575,11 @@ class ProductAttributesPredictor():
|
|
|
|
|
|
|
|
|
|
|
|
- def add_product_attrs(self,channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time):
|
|
|
+ def add_product_attrs(self,channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
|
|
|
if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
|
|
|
product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
|
|
|
if len(product_attrs[0]['product_attrs']['data']) == 0:
|
|
|
- product_attrs = self.predict_by_text(product_attrs,text,list_outlines,page_time)
|
|
|
+ product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
|
|
|
if len(product_attrs[1]['demand_info']['data'])>0:
|
|
|
for d in product_attrs[1]['demand_info']['data']:
|
|
|
for product in set(prem[0]['product']):
|