소스 검색

采购意向提取新增‘采购人’字段

znj 1 년 전
부모
커밋
08afe27832
2개의 변경된 파일, 21개의 추가 및 10개의 삭제
  1. 19 8
      BiddingKG/dl/interface/predictor.py
  2. 2 2
      BiddingKG/dl/ratio/re_ratio.py

+ 19 - 8
BiddingKG/dl/interface/predictor.py

@@ -2883,7 +2883,7 @@ class ProductAttributesPredictor():
         '''
         items = [re.sub('\s', '', it) for it in items]
         flag = False
-        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':''}
         product = ""  # 产品
         quantity = ""  # 数量
         quantity_unit = "" # 数量单位
@@ -2896,6 +2896,7 @@ class ProductAttributesPredictor():
         total_price = "" # 总价
         category = "" # 品目
         parameter = "" # 参数
+        tenderee = "" # 采购人
 
         # for i in range(min(6, len(items))):
         for i in range(len(items)):
@@ -2954,11 +2955,13 @@ class ProductAttributesPredictor():
                 elif re.search('参数', items[j]):
                     header_dic['参数'] = j
                     parameter = items[j]
-
+                elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位',items[j]) and len(items[j])<=8:
+                    header_dic['采购人'] = j
+                    tenderee = items[j]
                 elif re.search('需求|服务要求|服务标准', items[j]):
                     header_dic['需求'] = j
                     demand = items[j]
-                elif re.search('预算|控制金额', items[j]):
+                elif re.search('预算|控制金额', items[j]) and not re.search('预算单位',items[j]):
                     header_dic['预算'] = j
                     budget = items[j]
                 elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
@@ -2976,9 +2979,9 @@ class ProductAttributesPredictor():
                 # if num >=2:
                 #     return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
                 if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
-                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
         flag = False
-        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
 
     def predict(self, docid='', html='', page_time=""):
         '''
@@ -3058,7 +3061,9 @@ class ProductAttributesPredictor():
                                 if '万元' in col0_l[i] and '万' not in _budget:
                                     _budget += '万元'
                                 budget = str(getUnifyMoney(_budget))
-
+                        elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            tenderee = re.sub("\s","",col1_l[i])
                         elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
@@ -3072,7 +3077,7 @@ class ProductAttributesPredictor():
                     # print(product,demand,budget,order_begin)
                     if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
                         link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
-                                'order_begin': order_begin, 'order_end': order_end}
+                                'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee}
                         if link not in demand_link:
                             demand_link.append(link)
                             headers_demand.append('_'.join(header_list2))
@@ -3124,6 +3129,7 @@ class ProductAttributesPredictor():
                 order_end = ""
                 total_price = "" # 总金额
                 parameter = "" # 参数
+                tenderee = "" # 采购人
                 if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
@@ -3163,6 +3169,7 @@ class ProductAttributesPredictor():
 
                     id9 = header_dic.get("总价", "")
                     id10 = header_dic.get('参数', "")
+                    id11 = header_dic.get('采购人', "")
 
                     not_attr = 0
                     for k, v in header_dic.items():
@@ -3250,6 +3257,10 @@ class ProductAttributesPredictor():
                             parameter = tds[id10][:500]
                             if re.match('^详见|^详情', parameter.strip()):
                                 parameter = ""
+                        if id11 != "":
+                            tenderee = re.sub("\s","",tds[id11])
+                            if len(tenderee) > 30:
+                                tenderee = ""
                         # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
                             if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
@@ -3363,7 +3374,7 @@ class ProductAttributesPredictor():
                                 order_begin = order_end = ""
                         # print(budget,order_time)
                         if budget != "" and order_time != "":
-                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
+                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee}
                             if link not in demand_link:
                                 demand_link.append(link)
                     i += 1

+ 2 - 2
BiddingKG/dl/ratio/re_ratio.py

@@ -3,9 +3,9 @@ from decimal import Decimal
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
 # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
 
-ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|折扣系数|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
+ratio = re.compile('(([((]?(上浮|下浮)费?(率|系数|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|折扣系数|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
                    '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
-                   '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
+                   '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?(|系数))[))]?)')
 ratio = ratio.pattern
 # print(ratio)