|
@@ -458,6 +458,8 @@ class CodeNamePredict():
|
|
|
_name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
|
|
|
if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
|
|
|
continue
|
|
|
+ elif '公司:你单位在' in _name: # 避免类似 339900030 这种作为项目名称,导致中标角色作为招标角色
|
|
|
+ continue
|
|
|
|
|
|
#add name to entitys
|
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
@@ -2912,7 +2914,7 @@ class ProductAttributesPredictor():
|
|
|
:return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息
|
|
|
'''
|
|
|
|
|
|
-
|
|
|
+ html = html.replace('<br>', '\n')
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
# flag_yx = True if re.search('采购意向', html) else False
|
|
|
flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
|
|
@@ -2924,6 +2926,7 @@ class ProductAttributesPredictor():
|
|
|
demand_link = []
|
|
|
product_set = set()
|
|
|
total_product_money = 0
|
|
|
+ unit_price_list = []
|
|
|
for i in range(len(tables)-1, -1, -1):
|
|
|
table = tables[i]
|
|
|
if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
|
|
@@ -3069,7 +3072,13 @@ class ProductAttributesPredictor():
|
|
|
if product != "":
|
|
|
if id2 != "":
|
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
|
|
|
+ if re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|万?元', tds[id2]):
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
quantity = tds[id2]
|
|
|
+ elif re.search('\w', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
if id2_2 != "":
|
|
|
if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
|
|
|
quantity_unit = tds[id2_2]
|
|
@@ -3078,6 +3087,9 @@ class ProductAttributesPredictor():
|
|
|
unitPrice = tds[id3]
|
|
|
elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id3].strip()):
|
|
|
unitPrice = tds[id3]
|
|
|
+ elif re.search('\w', tds[id3]) and re.search('^详见|^略', tds[id3])==None:
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
if id4 != "":
|
|
|
if re.search('\w', tds[id4]):
|
|
|
brand = tds[id4]
|
|
@@ -3109,6 +3121,9 @@ class ProductAttributesPredictor():
|
|
|
total_price = tds[id9]
|
|
|
elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id9].strip()):
|
|
|
total_price = tds[id9]
|
|
|
+ elif re.search('\w', tds[id9]) and re.search('^详见|^略', tds[id9])==None:
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
if id10 != "":
|
|
|
parameter = tds[id10][:500]
|
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
@@ -3200,6 +3215,8 @@ class ProductAttributesPredictor():
|
|
|
product_link.append(link)
|
|
|
if link['unitPrice'] != "" and link['quantity'] != '':
|
|
|
try:
|
|
|
+ if link['unitPrice']:
|
|
|
+ unit_price_list.append(link['unitPrice'])
|
|
|
total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
|
|
|
except:
|
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
@@ -3227,6 +3244,8 @@ class ProductAttributesPredictor():
|
|
|
demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
|
|
|
else:
|
|
|
demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
|
|
|
+ if len(unit_price_list)>0 and len(set(unit_price_list))/len(unit_price_list)<=0.5: # 2023/7/18 如果单价重复率高不算总产品价避免错误
|
|
|
+ total_product_money = 0
|
|
|
return [attr_dic, demand_dic], total_product_money
|
|
|
|
|
|
def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
|
|
@@ -5116,6 +5135,8 @@ class TablePremExtractor(object):
|
|
|
flag = True
|
|
|
for i in range(len(td_list)) :
|
|
|
text = td_list[i]
|
|
|
+ if text == '备选中标人':
|
|
|
+ text = '第二候选人'
|
|
|
if len(text) > 15: # 长度大于15 不进行表头匹配
|
|
|
continue
|
|
|
if re.search('未(中标|成交)原因', text): # 不提取此种表格
|