před 1 rokem · d157782234
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -2538,8 +2538,8 @@ class ProductPredictor():
 
				 # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
			
 
				 class ProductAttributesPredictor():
			
 
				     def __init__(self,):
			
 
				-        self.p0 = '(品目|类别|类型|物类|目录|^品名|^品类)(名称|$)|(标项|项目|计划|标段|[分子]?包|子目|服务|招标|工程|招标内容)(名称|内容|描述)'
			
 
				-        self.p1 = '(标的|维修|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名)[\)）的]?(名称|内容|描述)'
			
 
				+        self.p0 = '(品目|类别|类型|物类|目录|类目|分类|^品名(及规格)?|^品类)(名称|$)|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|工程|招标内容)(名称|内容|描述)'
			
 
				+        self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\)）的]?(名称|内容|描述)'
			
 
				         self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称'
			
 
				         # self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\)）]?(名称|内容|描述)'
			
 
				         # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
			
@@ -2554,10 +2554,13 @@ class ProductAttributesPredictor():
 
				         if table.find_all(['caption', 'th']) != []:
			
 
				             return True
			
 
				         elif len(table.find_all(['form', 'a', 'img'])) > 5:
			
 
				+            # print('过滤表格：包含链接图片等大于5的为假表格')
			
 
				             return False
			
 
				         elif len(table.find_all(['tr'])) < 2:
			
 
				+            # print('过滤表格：行数小于2的为假表格')
			
 
				             return False
			
 
				         elif len(table.find_all(['table'])) >= 1:
			
 
				+            # print('过滤表格：包含多个表格的为假表格')
			
 
				             return False
			
 
				         else:
			
 
				             return True
			
@@ -2820,6 +2823,7 @@ class ProductAttributesPredictor():
 
				         :param p2: 第二表头正则
			
 
				         :return: 表头所在列序号，是否表头，表头内容
			
 
				         '''
			
 
				+        items = [re.sub('\s', '', it) for it in items]
			
 
				         flag = False
			
 
				         header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
			
 
				         product = ""  # 产品
			
@@ -2845,7 +2849,7 @@ class ProductAttributesPredictor():
 
				                 flag = True
			
 
				                 product = it
			
 
				                 header_dic['名称'] = i
			
 
				-                break
			
 
				+                # break
			
 
				         # if not flag:
			
 
				         if product == "":
			
 
				             for i in range(min(4, len(items))):
			
@@ -2856,6 +2860,11 @@ class ProductAttributesPredictor():
 
				                     product = it
			
 
				                     header_dic['名称'] = i
			
 
				                     break
			
 
				+        if flag == False and len(items)>3 and re.search('^第[一二三四五六七八九十](包|标段)$', items[0]):
			
 
				+            product = items[0]
			
 
				+            header_dic['名称'] = 0
			
 
				+            flag = True
			
 
				+
			
 
				         if flag:
			
 
				             # for j in range(i + 1, len(items)):
			
 
				             for j in range(len(items)):
			
@@ -2891,7 +2900,7 @@ class ProductAttributesPredictor():
 
				                 elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
			
 
				                     header_dic['时间'] = j
			
 
				                     order_time = items[j]
			
 
				-                elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计)）?(金额|价格?)', items[j]):
			
 
				+                elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计)）?([金总]额|价格?)|最高限价', items[j]):
			
 
				                     header_dic['总价'] = j
			
 
				                     total_price = items[j]
			
 
				 
			
@@ -2915,6 +2924,8 @@ class ProductAttributesPredictor():
 
				         '''
			
 
				 
			
 
				         html = html.replace('<br>', '\n').replace('<br/>', '\n')
			
 
				+        html = re.sub("<html>|</html>|<body>|</body>","",html)
			
 
				+        html = re.sub("##attachment##","",html)
			
 
				         soup = BeautifulSoup(html, 'lxml')
			
 
				         # flag_yx = True if re.search('采购意向', html) else False
			
 
				         flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
			
@@ -2926,12 +2937,15 @@ class ProductAttributesPredictor():
 
				         demand_link = []
			
 
				         product_set = set()
			
 
				         total_product_money = 0
			
 
				-        unit_price_list = []
			
 
				+        unit_price_list = [] # 单价列表，用于判断是否重复单价，避免多个表格重复提取造成合计产品价格错误。
			
 
				+        total_price_list = []  # 总价列表，用于判断是否为几行产品合计总价
			
 
				+        # print('表格数：', len(tables))
			
 
				         for i in range(len(tables)-1, -1, -1):
			
 
				             table = tables[i]
			
 
				             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
			
 
				                 table.string = table.get_text()
			
 
				                 table.name = 'turntable'
			
 
				+                # print('过滤表格：表格父节点为td,且表格td数量小于等于3')
			
 
				                 continue
			
 
				             if not self.isTrueTable(table):
			
 
				                 continue
			
@@ -2943,6 +2957,7 @@ class ProductAttributesPredictor():
 
				             header_quan_unit = ""  # 数量表头 包含单位
			
 
				             header_colnum = 0
			
 
				             if flag_yx:
			
 
				+                # print('意向公告， 提取意向信息')
			
 
				                 col0_l = []
			
 
				                 col1_l = []
			
 
				                 for tds in inner_table:
			
@@ -2995,9 +3010,10 @@ class ProductAttributesPredictor():
 
				                         continue
			
 
				             while i < (len(inner_table)):
			
 
				                 tds = inner_table[i]
			
 
				-                not_empty = [it for it in tds if it != ""]
			
 
				-                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: # 一半列是空的或者小于两列的 继续
			
 
				+                not_empty = [it for it in tds if re.sub('\s', '', it) != ""]
			
 
				+                if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总', tds[0])): # 非空列或者不重复内容小于两列的 继续
			
 
				                     i += 1
			
 
				+                    # print('表格产品提取：非空列或者不重复内容小于两列的 继续', i, tds)
			
 
				                     continue
			
 
				                 product = ""  # 产品
			
 
				                 quantity = ""  # 数量
			
@@ -3017,6 +3033,7 @@ class ProductAttributesPredictor():
 
				                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
			
 
				                     if found_header:
			
 
				                         header_colnum = len(tds) # 保存表头所在行列数
			
 
				+                        # print('发现表头：', header_colnum, header_dic)
			
 
				                     if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
			
 
				                             quantity_header = header_list[1].replace('单位：', '')
			
 
				                             if re.search('（([\w/]{,5})）', quantity_header):
			
@@ -3029,10 +3046,12 @@ class ProductAttributesPredictor():
 
				                         headers_demand.append('_'.join(header_list2))
			
 
				                         header_col.append('_'.join(tds))
			
 
				                     i += 1
			
 
				+                    # print('表头数量占行列数0.2倍不做内容匹配', set([re.sub('[:：]','',td) for td in tds]) & self.header_set)
			
 
				                     continue
			
 
				                 elif found_header:
			
 
				                     if len(tds) != header_colnum:  # 表头、属性列数不一致跳过
			
 
				                         i += 1
			
 
				+                        # print('表头、属性列数不一致跳过', len(tds), header_colnum, tds)
			
 
				                         continue
			
 
				                     id0 = header_dic.get('品目', "")
			
 
				                     id1 = header_dic.get('名称', "")
			
@@ -3053,11 +3072,13 @@ class ProductAttributesPredictor():
 
				                     for k, v in header_dic.items():
			
 
				                         if isinstance(v, int):
			
 
				                             if v >= len(tds) or tds[v] in self.header_set:
			
 
				+                                # print('内容属性在表头集合里面', tds[v], v >= len(tds))
			
 
				                                 not_attr = 1
			
 
				-                                break
			
 
				-                    if not_attr: # 只要属性里面有一项为表头，停止匹配
			
 
				+                                # break
			
 
				+                    if not_attr>=2: # 只要属性里面有两项为表头，停止匹配
			
 
				                         i += 1
			
 
				                         found_header = False
			
 
				+                        # print('只要属性里面有两项为表头，停止匹配')
			
 
				                         continue
			
 
				 
			
 
				                     if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
			
@@ -3067,17 +3088,20 @@ class ProductAttributesPredictor():
 
				                     if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
			
 
				                             re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id0]) == None:
			
 
				                         category = tds[id0]
			
 
				-                        product = "%s_%s"%(category, product) if product!="" else category
			
 
				+                        product = "%s_%s"%(category, product) if product!="" and product!=category else category
			
 
				 
			
 
				                     if product != "":
			
 
				+                        # print('匹配产品内容： ', product)
			
 
				                         if id2 != "":
			
 
				                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
			
 
				-                                if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7}，?)$)|万?元', tds[id2]):
			
 
				-                                    i += 1
			
 
				-                                    continue
			
 
				+                                # if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7}，?)$)|万?元', tds[id2]):  # 254816100 这篇数量很大，貌似正常
			
 
				+                                #     i += 1
			
 
				+                                #     print('过滤：数量包含金额单位或值很大类似金额', tds[id2])
			
 
				+                                #     continue
			
 
				                                 quantity = tds[id2]
			
 
				                             elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
			
 
				                                 i += 1
			
 
				+                                # print('过滤：数量包含五个字符以上且不包含^详见|^略等字符')
			
 
				                                 continue
			
 
				                         if id2_2 != "":
			
 
				                             if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
			
@@ -3087,8 +3111,9 @@ class ProductAttributesPredictor():
 
				                                 unitPrice = tds[id3]
			
 
				                             elif re.search('^[\d,.亿万元人民币欧美日金额：（）();；、，\n]+$|￥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
			
 
				                                 unitPrice = tds[id3]
			
 
				-                            elif len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id3])) > 5:
			
 
				+                            elif len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\d,.]', '', tds[id3])) > 5 and re.search('^详见|^略', tds[id3])==None:
			
 
				                                 i += 1
			
 
				+                                # print('过滤：产品单价包含金额外的字符数大于5个')
			
 
				                                 continue
			
 
				                         if id4 != "":
			
 
				                             if re.search('\w', tds[id4]):
			
@@ -3121,13 +3146,15 @@ class ProductAttributesPredictor():
 
				                                 total_price = tds[id9]
			
 
				                             elif re.search('^[\d,.亿万元人民币欧美日金额：（）();；、，\n]+$|￥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
			
 
				                                 total_price = tds[id9]
			
 
				-                            elif len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id9])) > 5:
			
 
				+                            elif len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\d,.]', '', tds[id9])) > 5 and re.search('^详见|^略', tds[id9])==None:
			
 
				                                 i += 1
			
 
				+                                # print('过滤：产品总价包含金额外的字符数大于5个')
			
 
				                                 continue
			
 
				                         if id10 != "":
			
 
				                             parameter = tds[id10][:500]
			
 
				                             if re.match('^详见|^略$', parameter.strip()):
			
 
				                                 parameter = ""
			
 
				+                        # print('数量：{0}, 单价：{1}, 品牌：{2}， 规格：{3}，总价：{4}'.format(quantity ,unitPrice, brand, specs, total_price))
			
 
				                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
			
 
				                             if id1!="" and id2 != "" and id3 != "" and len(re.split('[;；、，\n]', tds[id2])) > 1 and len(re.split('[;；、，\n]', tds[id1])) == len(re.split('[;；、，\n]', tds[id2])): # 处理一个空格包含多个产品，逗号或空格分割情况 例子 292846806 292650743
			
 
				                                 products = re.split('[;；、，\n]', tds[id1])
			
@@ -3149,13 +3176,14 @@ class ProductAttributesPredictor():
 
				                                             quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
			
 
				                                         if unitPrice != "":
			
 
				                                             unitPrice, _money_unit = money_process(unitPrice, header_list[3])
			
 
				-                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
			
 
				+                                            unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
			
 
				                                         if budget != "":
			
 
				                                             budget, _money_unit = money_process(budget, header_list2[2])
			
 
				-                                            budget = str(budget) if budget != 0 else ''
			
 
				+                                            budget = str(budget) if budget != 0 and budget<50000000000 else ''
			
 
				                                         if total_price != "":
			
 
				                                             total_price, _money_unit = money_process(total_price, header_list[6])
			
 
				-                                            total_price = str(total_price) if unitPrice != 0 else ""
			
 
				+                                            total_price_list.append(total_price)
			
 
				+                                            total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
			
 
				                                         link = {'product': product, 'quantity': quantity,
			
 
				                                                 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
			
 
				                                                 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
			
@@ -3182,8 +3210,9 @@ class ProductAttributesPredictor():
 
				                                                     log('产品属性单价数量相乘出错, 单价： %s, 数量： %s' % (
			
 
				                                                     link['unitPrice'], link['quantity']))
			
 
				 
			
 
				-                            elif len(unitPrice) > 15 or len(product)>100:  # 单价大于15位数或 产品名称长于100字
			
 
				+                            elif len(product)>100:  # 产品名称长于100字
			
 
				                                 i += 1
			
 
				+                                # print('过滤： 产品名称长于100字',)
			
 
				                                 continue
			
 
				                             else:
			
 
				                                 if quantity != "":
			
@@ -3191,13 +3220,14 @@ class ProductAttributesPredictor():
 
				                                     quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
			
 
				                                 if unitPrice != "":
			
 
				                                     unitPrice, _money_unit = money_process(unitPrice, header_list[3])
			
 
				-                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
			
 
				+                                    unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
			
 
				                                 if budget != "":
			
 
				                                     budget, _money_unit = money_process(budget, header_list2[2])
			
 
				-                                    budget = str(budget) if budget != 0 else ''
			
 
				+                                    budget = str(budget) if budget != 0 and budget<50000000000 else ''
			
 
				                                 if total_price != "":
			
 
				                                     total_price, _money_unit = money_process(total_price, header_list[6])
			
 
				-                                    total_price = str(total_price) if unitPrice != 0 else ""
			
 
				+                                    total_price_list.append(total_price)
			
 
				+                                    total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
			
 
				                                 link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
			
 
				                                                           'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
			
 
				 
			
@@ -3210,8 +3240,8 @@ class ProductAttributesPredictor():
 
				                                 #         except:
			
 
				                                 #             log('产品属性单价数量相乘出错, 单价： %s, 数量： %s'%(link['unitPrice'], link['quantity']))
			
 
				 
			
 
				-                                if (product, specs, unitPrice, quantity) not in product_set:
			
 
				-                                    product_set.add((product, specs, unitPrice, quantity))
			
 
				+                                if (product, unitPrice, quantity) not in product_set:
			
 
				+                                    product_set.add((product, unitPrice, quantity))
			
 
				                                     product_link.append(link)
			
 
				                                     if link['unitPrice'] != "" and link['quantity'] != '':
			
 
				                                         try:
			
@@ -3236,6 +3266,18 @@ class ProductAttributesPredictor():
 
				                     i += 1
			
 
				                 else:
			
 
				                     i += 1
			
 
				+        if len(total_price_list)>0 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价，需去掉
			
 
				+            # print('总价一半以上重复的为多行一个总价，需去掉')
			
 
				+            for link in product_link:
			
 
				+                if 'total_price' in link:
			
 
				+                    link['total_price'] = ""
			
 
				+        if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
			
 
				+            # print('如果单价重复率高不算总产品价避免错误')
			
 
				+            total_product_money = 0
			
 
				+            for link in product_link:
			
 
				+                if 'unitPrice' in link:
			
 
				+                    link['unitPrice'] = ""
			
 
				+
			
 
				         if len(product_link)>0:
			
 
				             attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
			
 
				         else:
			
@@ -3244,8 +3286,7 @@ class ProductAttributesPredictor():
 
				             demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
			
 
				         else:
			
 
				             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
			
 
				-        if len(unit_price_list)>0 and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
			
 
				-            total_product_money = 0
			
 
				+        # print('表格产品属性提取：', attr_dic)
			
 
				         return [attr_dic, demand_dic], total_product_money
			
 
				 
			
 
				     def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
			
@@ -3283,6 +3324,7 @@ class ProductAttributesPredictor():
 
				                         'order_begin': order_begin, 'order_end': order_end}
			
 
				                 _data.append(link)
			
 
				             product_attrs[1]['demand_info']['data'] = _data
			
 
				+        # print('predict_without_table: ', product_attrs)
			
 
				         return product_attrs
			
 
				 
			
 
				     def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
			
@@ -3312,7 +3354,12 @@ class ProductAttributesPredictor():
 
				                     begin_list = [0]
			
 
				                     for index,head in enumerate(head_list):
			
 
				                         if head not in loop_list:
			
 
				-                            loop_list.append(head)
			
 
				+                            if re.search('第[一二三四五六七八九十](包|标段)', head) and re.search('第[一二三四五六七八九十](包|标段)', '|'.join(loop_list)):
			
 
				+                                begin_list.append(index)
			
 
				+                                loop_list = []
			
 
				+                                loop_list.append(head)
			
 
				+                            else:
			
 
				+                                loop_list.append(head)
			
 
				                         else:
			
 
				                             begin_list.append(index)
			
 
				                             loop_list = []
			
@@ -3454,7 +3501,7 @@ class ProductAttributesPredictor():
 
				                                     if re.match('^详见|^略$', parameter.strip()):
			
 
				                                         parameter = ""
			
 
				                                 if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
			
 
				-                                    if id2 != "" and id3 != "" and len(re.split('[;；、，\n]', deal_list[id2])) > 1 and len(
			
 
				+                                    if id1 != "" and id2 != "" and id3 != "" and len(re.split('[;；、，\n]', deal_list[id2])) > 1 and len(
			
 
				                                             re.split('[;；、，\n]', deal_list[id1])) == len(re.split('[;；、，\n]', deal_list[id2])):  # 处理一个空格包含多个产品，逗号或空格分割情况 例子 292846806 292650743
			
 
				                                         products = re.split('[;；、，\n]', deal_list[id1])
			
 
				                                         quantitys = re.split('[;；、，\n]', deal_list[id2])
			
@@ -3478,14 +3525,14 @@ class ProductAttributesPredictor():
 
				                                                     quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
			
 
				                                                 if unitPrice != "":
			
 
				                                                     unitPrice, _money_unit = money_process(unitPrice, header_list[3])
			
 
				-                                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
			
 
				+                                                    unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
			
 
				                                                 if budget != "":
			
 
				                                                     budget, _money_unit = money_process(budget, header_list2[2])
			
 
				-                                                    budget = str(budget) if budget != 0 else ''
			
 
				+                                                    budget = str(budget) if budget != 0 and budget<50000000000 else ''
			
 
				                                                 if total_price != "":
			
 
				                                                     total_price, _money_unit = money_process(total_price,
			
 
				                                                                                              header_list[6])
			
 
				-                                                    total_price = str(total_price) if unitPrice != 0 else ""
			
 
				+                                                    total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
			
 
				                                                 link = {'product': product, 'quantity': quantity,
			
 
				                                                         'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
			
 
				                                                         'brand': brand[:50], 'specs': specs, 'total_price': total_price,
			
@@ -3512,13 +3559,13 @@ class ProductAttributesPredictor():
 
				                                             quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
			
 
				                                         if unitPrice != "":
			
 
				                                             unitPrice, _money_unit = money_process(unitPrice, header_list[3])
			
 
				-                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
			
 
				+                                            unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
			
 
				                                         if budget != "":
			
 
				                                             budget, _money_unit = money_process(budget, header_list2[2])
			
 
				-                                            budget = str(budget) if budget != 0 else ''
			
 
				+                                            budget = str(budget) if budget != 0 and budget<50000000000 else ''
			
 
				                                         if total_price != "":
			
 
				                                             total_price, _money_unit = money_process(total_price, header_list[6])
			
 
				-                                            total_price = str(total_price) if unitPrice != 0 else ""
			
 
				+                                            total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
			
 
				                                         link = {'product': product, 'quantity': quantity,
			
 
				                                                 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
			
 
				                                                 'brand': brand[:50], 'specs': specs, 'total_price': total_price,
			
@@ -3562,6 +3609,7 @@ class ProductAttributesPredictor():
 
				                         product_attrs[1] = demand_dic
			
 
				                     if get_product_attrs:
			
 
				                         break
			
 
				+        # print('predict_by_text: ', product_attrs)
			
 
				         return product_attrs
			
 
				 
			
 
				 
			
@@ -3617,6 +3665,7 @@ class DocChannel():
 
				       }
			
 
				       self.life_dic = {
			
 
				           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
			
 
				+          '采购意向neg': '发布政府采购意向|采购意向公告已于',
			
 
				           '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
			
 
				           '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[：\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
			
 
				           '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
			
@@ -4009,6 +4058,10 @@ class DocChannel():
 
				               elif life_score[k] == max_score and life_score[k] > 0:
			
 
				                   life_list.append(k)
			
 
				           if '采购意向' in life_kw_title or '采购意向' in life_list:
			
 
				+              if '中标信息' in life_kw_title or '中标信息' in life_list:
			
 
				+                  return '中标信息', msc
			
 
				+              elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
			
 
				+                  return '', msc
			
 
				               return '采购意向', msc
			
 
				           elif '招标预告' in life_kw_title or '招标预告' in life_list:
			
 
				               if '中标信息' in life_kw_title or '中标信息' in life_list:
			
@@ -5395,7 +5448,7 @@ class TablePremExtractor(object):
 
				             prem_dic[package]['name'] = project_name
			
 
				 
			
 
				             if budget_ != "":
			
 
				-                if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				+                if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                     break
			
 
				                 budget_header = headers['budget'][1] if 'budget' in headers else ''
			
 
				                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%％‰折]|浮率', budget_)==None else (0, '')
			
@@ -5425,7 +5478,7 @@ class TablePremExtractor(object):
 
				                         "serviceTime": ""
			
 
				                 })
			
 
				             if tenderer and not same_package:
			
 
				-                if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
			
 
				+                if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\d,.]', '',
			
 
				                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                     break
			
 
				 
			
@@ -5696,7 +5749,7 @@ class CandidateExtractor(object):
 
				                     header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
			
 
				                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
			
 
				                                            [win_tenderer, second_tenderer, third_tenderer]):
			
 
				-                        if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
			
 
				+                        if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\d,.]', '',
			
 
				                                       text)) > 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                             break
			
 
				                         money, money_unit = money_process(text, header)
			
@@ -5731,7 +5784,7 @@ class CandidateExtractor(object):
 
				                             'tendereeMoney': 0,
			
 
				                             'tendereeMoneyUnit': ""
			
 
				                         }
			
 
				-                    if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				+                    if len(re.sub('[金额万元（）():：零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分￥整\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符，中断匹配
			
 
				                         break
			
 
				                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")