Procházet zdrojové kódy

优化产品单价数量等属性提取

lsm před 1 rokem
rodič
revize
d157782234
1 změnil soubory, kde provedl 91 přidání a 38 odebrání
  1. 91 38
      BiddingKG/dl/interface/predictor.py

+ 91 - 38
BiddingKG/dl/interface/predictor.py

@@ -2538,8 +2538,8 @@ class ProductPredictor():
 # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
 class ProductAttributesPredictor():
     def __init__(self,):
-        self.p0 = '(品目|类别|类型|物类|目录|^品名|^品类)(名称|$)|(标项|项目|计划|标段|[分子]?包|子目|服务|招标|工程|招标内容)(名称|内容|描述)'
-        self.p1 = '(标的|维修|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名)[\))的]?(名称|内容|描述)'
+        self.p0 = '(品目|类别|类型|物类|目录|类目|分类|^品名(及规格)?|^品类)(名称|$)|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|工程|招标内容)(名称|内容|描述)'
+        self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?(名称|内容|描述)'
         self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称'
         # self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
         # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
@@ -2554,10 +2554,13 @@ class ProductAttributesPredictor():
         if table.find_all(['caption', 'th']) != []:
             return True
         elif len(table.find_all(['form', 'a', 'img'])) > 5:
+            # print('过滤表格:包含链接图片等大于5的为假表格')
             return False
         elif len(table.find_all(['tr'])) < 2:
+            # print('过滤表格:行数小于2的为假表格')
             return False
         elif len(table.find_all(['table'])) >= 1:
+            # print('过滤表格:包含多个表格的为假表格')
             return False
         else:
             return True
@@ -2820,6 +2823,7 @@ class ProductAttributesPredictor():
         :param p2: 第二表头正则
         :return: 表头所在列序号,是否表头,表头内容
         '''
+        items = [re.sub('\s', '', it) for it in items]
         flag = False
         header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
         product = ""  # 产品
@@ -2845,7 +2849,7 @@ class ProductAttributesPredictor():
                 flag = True
                 product = it
                 header_dic['名称'] = i
-                break
+                # break
         # if not flag:
         if product == "":
             for i in range(min(4, len(items))):
@@ -2856,6 +2860,11 @@ class ProductAttributesPredictor():
                     product = it
                     header_dic['名称'] = i
                     break
+        if flag == False and len(items)>3 and re.search('^第[一二三四五六七八九十](包|标段)$', items[0]):
+            product = items[0]
+            header_dic['名称'] = 0
+            flag = True
+
         if flag:
             # for j in range(i + 1, len(items)):
             for j in range(len(items)):
@@ -2891,7 +2900,7 @@ class ProductAttributesPredictor():
                 elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
                     header_dic['时间'] = j
                     order_time = items[j]
-                elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计))?(金额|价格?)', items[j]):
+                elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计))?([总]额|价格?)|最高限价', items[j]):
                     header_dic['总价'] = j
                     total_price = items[j]
 
@@ -2915,6 +2924,8 @@ class ProductAttributesPredictor():
         '''
 
         html = html.replace('<br>', '\n').replace('<br/>', '\n')
+        html = re.sub("<html>|</html>|<body>|</body>","",html)
+        html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
         # flag_yx = True if re.search('采购意向', html) else False
         flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
@@ -2926,12 +2937,15 @@ class ProductAttributesPredictor():
         demand_link = []
         product_set = set()
         total_product_money = 0
-        unit_price_list = []
+        unit_price_list = [] # 单价列表,用于判断是否重复单价,避免多个表格重复提取造成合计产品价格错误。
+        total_price_list = []  # 总价列表,用于判断是否为几行产品合计总价
+        # print('表格数:', len(tables))
         for i in range(len(tables)-1, -1, -1):
             table = tables[i]
             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
                 table.string = table.get_text()
                 table.name = 'turntable'
+                # print('过滤表格:表格父节点为td,且表格td数量小于等于3')
                 continue
             if not self.isTrueTable(table):
                 continue
@@ -2943,6 +2957,7 @@ class ProductAttributesPredictor():
             header_quan_unit = ""  # 数量表头 包含单位
             header_colnum = 0
             if flag_yx:
+                # print('意向公告, 提取意向信息')
                 col0_l = []
                 col1_l = []
                 for tds in inner_table:
@@ -2995,9 +3010,10 @@ class ProductAttributesPredictor():
                         continue
             while i < (len(inner_table)):
                 tds = inner_table[i]
-                not_empty = [it for it in tds if it != ""]
-                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: # 一半列是空的或者小于两列的 继续
+                not_empty = [it for it in tds if re.sub('\s', '', it) != ""]
+                if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总', tds[0])): # 非空列或者不重复内容小于两列的 继续
                     i += 1
+                    # print('表格产品提取:非空列或者不重复内容小于两列的 继续', i, tds)
                     continue
                 product = ""  # 产品
                 quantity = ""  # 数量
@@ -3017,6 +3033,7 @@ class ProductAttributesPredictor():
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
                     if found_header:
                         header_colnum = len(tds) # 保存表头所在行列数
+                        # print('发现表头:', header_colnum, header_dic)
                     if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
                             quantity_header = header_list[1].replace('单位:', '')
                             if re.search('(([\w/]{,5}))', quantity_header):
@@ -3029,10 +3046,12 @@ class ProductAttributesPredictor():
                         headers_demand.append('_'.join(header_list2))
                         header_col.append('_'.join(tds))
                     i += 1
+                    # print('表头数量占行列数0.2倍不做内容匹配', set([re.sub('[::]','',td) for td in tds]) & self.header_set)
                     continue
                 elif found_header:
                     if len(tds) != header_colnum:  # 表头、属性列数不一致跳过
                         i += 1
+                        # print('表头、属性列数不一致跳过', len(tds), header_colnum, tds)
                         continue
                     id0 = header_dic.get('品目', "")
                     id1 = header_dic.get('名称', "")
@@ -3053,11 +3072,13 @@ class ProductAttributesPredictor():
                     for k, v in header_dic.items():
                         if isinstance(v, int):
                             if v >= len(tds) or tds[v] in self.header_set:
+                                # print('内容属性在表头集合里面', tds[v], v >= len(tds))
                                 not_attr = 1
-                                break
-                    if not_attr: # 只要属性里面有一项为表头,停止匹配
+                                # break
+                    if not_attr>=2: # 只要属性里面有两项为表头,停止匹配
                         i += 1
                         found_header = False
+                        # print('只要属性里面有两项为表头,停止匹配')
                         continue
 
                     if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
@@ -3067,17 +3088,20 @@ class ProductAttributesPredictor():
                     if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id0]) == None:
                         category = tds[id0]
-                        product = "%s_%s"%(category, product) if product!="" else category
+                        product = "%s_%s"%(category, product) if product!="" and product!=category else category
 
                     if product != "":
+                        # print('匹配产品内容: ', product)
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
-                                if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]):
-                                    i += 1
-                                    continue
+                                # if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]):  # 254816100 这篇数量很大,貌似正常
+                                #     i += 1
+                                #     print('过滤:数量包含金额单位或值很大类似金额', tds[id2])
+                                #     continue
                                 quantity = tds[id2]
                             elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
                                 i += 1
+                                # print('过滤:数量包含五个字符以上且不包含^详见|^略等字符')
                                 continue
                         if id2_2 != "":
                             if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
@@ -3087,8 +3111,9 @@ class ProductAttributesPredictor():
                                 unitPrice = tds[id3]
                             elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
                                 unitPrice = tds[id3]
-                            elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id3])) > 5:
+                            elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id3])) > 5 and re.search('^详见|^略', tds[id3])==None:
                                 i += 1
+                                # print('过滤:产品单价包含金额外的字符数大于5个')
                                 continue
                         if id4 != "":
                             if re.search('\w', tds[id4]):
@@ -3121,13 +3146,15 @@ class ProductAttributesPredictor():
                                 total_price = tds[id9]
                             elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
                                 total_price = tds[id9]
-                            elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id9])) > 5:
+                            elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', tds[id9])) > 5 and re.search('^详见|^略', tds[id9])==None:
                                 i += 1
+                                # print('过滤:产品总价包含金额外的字符数大于5个')
                                 continue
                         if id10 != "":
                             parameter = tds[id10][:500]
                             if re.match('^详见|^略$', parameter.strip()):
                                 parameter = ""
+                        # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
                             if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
                                 products = re.split('[;;、,\n]', tds[id1])
@@ -3149,13 +3176,14 @@ class ProductAttributesPredictor():
                                             quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
                                         if unitPrice != "":
                                             unitPrice, _money_unit = money_process(unitPrice, header_list[3])
-                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                            unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
                                         if budget != "":
                                             budget, _money_unit = money_process(budget, header_list2[2])
-                                            budget = str(budget) if budget != 0 else ''
+                                            budget = str(budget) if budget != 0 and budget<50000000000 else ''
                                         if total_price != "":
                                             total_price, _money_unit = money_process(total_price, header_list[6])
-                                            total_price = str(total_price) if unitPrice != 0 else ""
+                                            total_price_list.append(total_price)
+                                            total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
                                         link = {'product': product, 'quantity': quantity,
                                                 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                                 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
@@ -3182,8 +3210,9 @@ class ProductAttributesPredictor():
                                                     log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
                                                     link['unitPrice'], link['quantity']))
 
-                            elif len(unitPrice) > 15 or len(product)>100:  # 单价大于15位数或 产品名称长于100字
+                            elif len(product)>100:  # 产品名称长于100字
                                 i += 1
+                                # print('过滤: 产品名称长于100字',)
                                 continue
                             else:
                                 if quantity != "":
@@ -3191,13 +3220,14 @@ class ProductAttributesPredictor():
                                     quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
                                 if unitPrice != "":
                                     unitPrice, _money_unit = money_process(unitPrice, header_list[3])
-                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                    unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
                                 if budget != "":
                                     budget, _money_unit = money_process(budget, header_list2[2])
-                                    budget = str(budget) if budget != 0 else ''
+                                    budget = str(budget) if budget != 0 and budget<50000000000 else ''
                                 if total_price != "":
                                     total_price, _money_unit = money_process(total_price, header_list[6])
-                                    total_price = str(total_price) if unitPrice != 0 else ""
+                                    total_price_list.append(total_price)
+                                    total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
                                 link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                                           'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
 
@@ -3210,8 +3240,8 @@ class ProductAttributesPredictor():
                                 #         except:
                                 #             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
 
-                                if (product, specs, unitPrice, quantity) not in product_set:
-                                    product_set.add((product, specs, unitPrice, quantity))
+                                if (product, unitPrice, quantity) not in product_set:
+                                    product_set.add((product, unitPrice, quantity))
                                     product_link.append(link)
                                     if link['unitPrice'] != "" and link['quantity'] != '':
                                         try:
@@ -3236,6 +3266,18 @@ class ProductAttributesPredictor():
                     i += 1
                 else:
                     i += 1
+        if len(total_price_list)>0 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
+            # print('总价一半以上重复的为多行一个总价,需去掉')
+            for link in product_link:
+                if 'total_price' in link:
+                    link['total_price'] = ""
+        if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
+            # print('如果单价重复率高不算总产品价避免错误')
+            total_product_money = 0
+            for link in product_link:
+                if 'unitPrice' in link:
+                    link['unitPrice'] = ""
+
         if len(product_link)>0:
             attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
         else:
@@ -3244,8 +3286,7 @@ class ProductAttributesPredictor():
             demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
         else:
             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
-        if len(unit_price_list)>0 and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
-            total_product_money = 0
+        # print('表格产品属性提取:', attr_dic)
         return [attr_dic, demand_dic], total_product_money
 
     def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
@@ -3283,6 +3324,7 @@ class ProductAttributesPredictor():
                         'order_begin': order_begin, 'order_end': order_end}
                 _data.append(link)
             product_attrs[1]['demand_info']['data'] = _data
+        # print('predict_without_table: ', product_attrs)
         return product_attrs
 
     def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
@@ -3312,7 +3354,12 @@ class ProductAttributesPredictor():
                     begin_list = [0]
                     for index,head in enumerate(head_list):
                         if head not in loop_list:
-                            loop_list.append(head)
+                            if re.search('第[一二三四五六七八九十](包|标段)', head) and re.search('第[一二三四五六七八九十](包|标段)', '|'.join(loop_list)):
+                                begin_list.append(index)
+                                loop_list = []
+                                loop_list.append(head)
+                            else:
+                                loop_list.append(head)
                         else:
                             begin_list.append(index)
                             loop_list = []
@@ -3454,7 +3501,7 @@ class ProductAttributesPredictor():
                                     if re.match('^详见|^略$', parameter.strip()):
                                         parameter = ""
                                 if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
-                                    if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
+                                    if id1 != "" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
                                             re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[id2])):  # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
                                         products = re.split('[;;、,\n]', deal_list[id1])
                                         quantitys = re.split('[;;、,\n]', deal_list[id2])
@@ -3478,14 +3525,14 @@ class ProductAttributesPredictor():
                                                     quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
                                                 if unitPrice != "":
                                                     unitPrice, _money_unit = money_process(unitPrice, header_list[3])
-                                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                                    unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
                                                 if budget != "":
                                                     budget, _money_unit = money_process(budget, header_list2[2])
-                                                    budget = str(budget) if budget != 0 else ''
+                                                    budget = str(budget) if budget != 0 and budget<50000000000 else ''
                                                 if total_price != "":
                                                     total_price, _money_unit = money_process(total_price,
                                                                                              header_list[6])
-                                                    total_price = str(total_price) if unitPrice != 0 else ""
+                                                    total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
                                                 link = {'product': product, 'quantity': quantity,
                                                         'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                                         'brand': brand[:50], 'specs': specs, 'total_price': total_price,
@@ -3512,13 +3559,13 @@ class ProductAttributesPredictor():
                                             quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
                                         if unitPrice != "":
                                             unitPrice, _money_unit = money_process(unitPrice, header_list[3])
-                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                            unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
                                         if budget != "":
                                             budget, _money_unit = money_process(budget, header_list2[2])
-                                            budget = str(budget) if budget != 0 else ''
+                                            budget = str(budget) if budget != 0 and budget<50000000000 else ''
                                         if total_price != "":
                                             total_price, _money_unit = money_process(total_price, header_list[6])
-                                            total_price = str(total_price) if unitPrice != 0 else ""
+                                            total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
                                         link = {'product': product, 'quantity': quantity,
                                                 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                                 'brand': brand[:50], 'specs': specs, 'total_price': total_price,
@@ -3562,6 +3609,7 @@ class ProductAttributesPredictor():
                         product_attrs[1] = demand_dic
                     if get_product_attrs:
                         break
+        # print('predict_by_text: ', product_attrs)
         return product_attrs
 
 
@@ -3617,6 +3665,7 @@ class DocChannel():
       }
       self.life_dic = {
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+          '采购意向neg': '发布政府采购意向|采购意向公告已于',
           '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
           '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
           '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
@@ -4009,6 +4058,10 @@ class DocChannel():
               elif life_score[k] == max_score and life_score[k] > 0:
                   life_list.append(k)
           if '采购意向' in life_kw_title or '采购意向' in life_list:
+              if '中标信息' in life_kw_title or '中标信息' in life_list:
+                  return '中标信息', msc
+              elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
+                  return '', msc
               return '采购意向', msc
           elif '招标预告' in life_kw_title or '招标预告' in life_list:
               if '中标信息' in life_kw_title or '中标信息' in life_list:
@@ -5395,7 +5448,7 @@ class TablePremExtractor(object):
             prem_dic[package]['name'] = project_name
 
             if budget_ != "":
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     break
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
@@ -5425,7 +5478,7 @@ class TablePremExtractor(object):
                         "serviceTime": ""
                 })
             if tenderer and not same_package:
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     break
 
@@ -5696,7 +5749,7 @@ class CandidateExtractor(object):
                     header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
-                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
+                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
                                       text)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                             break
                         money, money_unit = money_process(text, header)
@@ -5731,7 +5784,7 @@ class CandidateExtractor(object):
                             'tendereeMoney': 0,
                             'tendereeMoneyUnit': ""
                         }
-                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
                         break
                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")