Jelajahi Sumber

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

znj 1 tahun lalu
induk
commit
93ef878fee
2 file diubah dengan 37 penambahan dan 8 penghapusan
  1. 1 1
      BiddingKG/dl/interface/extract.py
  2. 36 7
      BiddingKG/dl/interface/predictor.py

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -349,7 +349,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-03-11'}
+    version_date = {'version_date': '2024-03-28'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 36 - 7
BiddingKG/dl/interface/predictor.py

@@ -2890,7 +2890,7 @@ class ProductAttributesPredictor():
         '''
         items = [re.sub('\s', '', it) for it in items]
         flag = False
-        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':''}
+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':''}
         product = ""  # 产品
         quantity = ""  # 数量
         quantity_unit = "" # 数量单位
@@ -2904,6 +2904,8 @@ class ProductAttributesPredictor():
         category = "" # 品目
         parameter = "" # 参数
         tenderee = "" # 采购人
+        notes = "" # 备注  2024/3/27 达仁 需求
+        issue_date = ""  # 发布日期 2024/3/27 达仁 需求
 
         # for i in range(min(6, len(items))):
         for i in range(len(items)):
@@ -2977,6 +2979,12 @@ class ProductAttributesPredictor():
                 elif re.search('总价|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价|价格|金额', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None:
                     header_dic['总价'] = j
                     total_price = items[j]
+                elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', items[j]):
+                    header_dic['备注'] = j
+                    notes = items[j]
+                elif re.search('^\w{,4}发布(时间|日期)$', items[j]):
+                    header_dic['发布日期'] = j
+                    issue_date = items[j]
 
             if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "":
                 # num = 0
@@ -2986,9 +2994,9 @@ class ProductAttributesPredictor():
                 # if num >=2:
                 #     return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
                 if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
-                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee, notes,issue_date)
         flag = False
-        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee,notes,issue_date)
 
     def predict(self, docid='', html='', page_time=""):
         '''
@@ -3053,6 +3061,8 @@ class ProductAttributesPredictor():
                     header_list2 = []
                     product = demand = budget = order_begin = order_end = ""
                     tenderee = ""
+                    notes = ''
+                    issue_date = ''
                     for i in range(len(col0_l)):
                         if re.search('项目名称', col0_l[i]):
                             header_list2.append(col0_l[i])
@@ -3078,6 +3088,12 @@ class ProductAttributesPredictor():
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)
+                        elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            notes = col1_l[i].strip()
+                        elif re.search('^\w{,4}发布(时间|日期)$', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            issue_date = self.fix_time(col1_l[i].strip(), '', '')[0]
                     if order_begin != "" and order_end!="":
                         order_begin_year = int(order_begin.split("-")[0])
                         order_end_year = int(order_end.split("-")[0])
@@ -3087,7 +3103,7 @@ class ProductAttributesPredictor():
                     # print(product,demand,budget,order_begin)
                     if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
                         link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
-                                'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee}
+                                'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee, 'notes':notes, 'issue_date':issue_date}
                         if link not in demand_link:
                             demand_link.append(link)
                             headers_demand.append('_'.join(header_list2))
@@ -3140,6 +3156,8 @@ class ProductAttributesPredictor():
                 total_price = "" # 总金额
                 parameter = "" # 参数
                 tenderee = "" # 采购人
+                notes = '' # 备注
+                issue_date = '' # 发布日期
                 if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
@@ -3181,6 +3199,9 @@ class ProductAttributesPredictor():
                     id10 = header_dic.get('参数', "")
                     id11 = header_dic.get('采购人', "")
 
+                    id12 = header_dic.get('备注', "")
+                    id13 = header_dic.get('发布日期', "")
+
                     not_attr = 0
                     for k, v in header_dic.items():
                         if isinstance(v, int):
@@ -3271,6 +3292,10 @@ class ProductAttributesPredictor():
                             tenderee = re.sub("\s","",tds[id11])
                             if len(tenderee) > 30:
                                 tenderee = ""
+                        if id12 != "":
+                            notes = tds[id12].strip()
+                        if id13 != "":
+                            issue_date = self.fix_time(tds[id13].strip(), '', '')[0]
                         # print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
                         if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
                             if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
@@ -3384,7 +3409,7 @@ class ProductAttributesPredictor():
                                 order_begin = order_end = ""
                         # print(budget,order_time)
                         if budget != "" and order_time != "":
-                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee}
+                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee,'notes':notes,'issue_date':issue_date}
                             if link not in demand_link:
                                 demand_link.append(link)
                     i += 1
@@ -5489,7 +5514,7 @@ class DistrictPredictor():
                 return ''
 
         def get_bid_addr(text):
-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p2, text):
                 return re.search(p2, text).group('addr')
             else:
@@ -5525,6 +5550,8 @@ class DistrictPredictor():
                 if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
                         '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
                     continue
+                if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
+                    continue
                 addr.append((it.group(0), it.start(), it.end()))
                 if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]):
                     addr.append((it.group(0), it.start(), it.end()))
@@ -5532,6 +5559,8 @@ class DistrictPredictor():
 
         def get_pro_city_dis_score(text, text_weight=1):
             text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
+            text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
+            text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
             province_l = find_areas(p_pro, text)
             city_l = find_areas(p_city, text)
             district_l = find_areas(p_dis, text)
@@ -5760,7 +5789,7 @@ class DistrictPredictor():
 
         # print('招标人地址',role_addr, tenderee_address)
 
-        project_name = project_name + title if project_name not in title else project_name
+        project_name = project_name + title if project_name not in title else title
         project_name = project_name.replace(tenderee, '')
 
         text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)