Переглянути джерело

补充采购意向备注及发布日期提取

lsm 1 рік тому
батько
коміт
2ee196dc3b

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -349,7 +349,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-03-11'}
+    version_date = {'version_date': '2024-03-28'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 31 - 6
BiddingKG/dl/interface/predictor.py

@@ -2890,7 +2890,7 @@ class ProductAttributesPredictor():
         '''
         items = [re.sub('\s', '', it) for it in items]
         flag = False
-        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':''}
+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':''}
         product = ""  # 产品
         quantity = ""  # 数量
         quantity_unit = "" # 数量单位
@@ -2904,6 +2904,8 @@ class ProductAttributesPredictor():
         category = "" # 品目
         parameter = "" # 参数
         tenderee = "" # 采购人
+        notes = "" # 备注  2024/3/27 达仁 需求
+        issue_date = ""  # 发布日期 2024/3/27 达仁 需求
 
         # for i in range(min(6, len(items))):
         for i in range(len(items)):
@@ -2977,6 +2979,12 @@ class ProductAttributesPredictor():
                 elif re.search('总价|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价|价格|金额', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None:
                     header_dic['总价'] = j
                     total_price = items[j]
+                elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', items[j]):
+                    header_dic['备注'] = j
+                    notes = items[j]
+                elif re.search('^\w{,4}发布(时间|日期)$', items[j]):
+                    header_dic['发布日期'] = j
+                    issue_date = items[j]
 
             if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "":
                 # num = 0
@@ -2986,9 +2994,9 @@ class ProductAttributesPredictor():
                 # if num >=2:
                 #     return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
                 if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
-                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee, notes,issue_date)
         flag = False
-        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee,notes,issue_date)
 
     def predict(self, docid='', html='', page_time=""):
         '''
@@ -3053,6 +3061,8 @@ class ProductAttributesPredictor():
                     header_list2 = []
                     product = demand = budget = order_begin = order_end = ""
                     tenderee = ""
+                    notes = ''
+                    issue_date = ''
                     for i in range(len(col0_l)):
                         if re.search('项目名称', col0_l[i]):
                             header_list2.append(col0_l[i])
@@ -3078,6 +3088,12 @@ class ProductAttributesPredictor():
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)
+                        elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            notes = col1_l[i].strip()
+                        elif re.search('^\w{,4}发布(时间|日期)$', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            issue_date = self.fix_time(col1_l[i].strip(), '', '')[0]
                     if order_begin != "" and order_end!="":
                         order_begin_year = int(order_begin.split("-")[0])
                         order_end_year = int(order_end.split("-")[0])
@@ -3087,7 +3103,7 @@ class ProductAttributesPredictor():
                     # print(product,demand,budget,order_begin)
                     if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
                         link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
-                                'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee}
+                                'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee, 'notes':notes, 'issue_date':issue_date}
                         if link not in demand_link:
                             demand_link.append(link)
                             headers_demand.append('_'.join(header_list2))
@@ -3140,6 +3156,8 @@ class ProductAttributesPredictor():
                 total_price = "" # 总金额
                 parameter = "" # 参数
                 tenderee = "" # 采购人
+                notes = '' # 备注
+                issue_date = '' # 发布日期
                 if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
@@ -3181,6 +3199,9 @@ class ProductAttributesPredictor():
                     id10 = header_dic.get('参数', "")
                     id11 = header_dic.get('采购人', "")
 
+                    id12 = header_dic.get('备注', "")
+                    id13 = header_dic.get('发布日期', "")
+
                     not_attr = 0
                     for k, v in header_dic.items():
                         if isinstance(v, int):
@@ -3271,6 +3292,10 @@ class ProductAttributesPredictor():
                             tenderee = re.sub("\s","",tds[id11])
                             if len(tenderee) > 30:
                                 tenderee = ""
+                        if id12 != "":
+                            notes = tds[id12].strip()
+                        if id13 != "":
+                            issue_date = self.fix_time(tds[id13].strip(), '', '')[0]
                         # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
                             if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
@@ -3384,7 +3409,7 @@ class ProductAttributesPredictor():
                                 order_begin = order_end = ""
                         # print(budget,order_time)
                         if budget != "" and order_time != "":
-                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee}
+                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee,'notes':notes,'issue_date':issue_date}
                             if link not in demand_link:
                                 demand_link.append(link)
                     i += 1
@@ -5764,7 +5789,7 @@ class DistrictPredictor():
 
         # print('招标人地址',role_addr, tenderee_address)
 
-        project_name = project_name + title if project_name not in title else project_name
+        project_name = project_name + title if project_name not in title else title
         project_name = project_name.replace(tenderee, '')
 
         text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)