Browse Source

新增合同签订时间、合同起始时间、合同结束时间字段的提取

znj 1 năm trước
mục cha
commit
da85fec0f3
2 tập tin đã thay đổi với 37 bổ sung và 9 xóa
  1. 3 1
      BiddingKG/dl/interface/extract.py
  2. 34 8
      BiddingKG/dl/interface/predictor.py

+ 3 - 1
BiddingKG/dl/interface/extract.py

@@ -180,6 +180,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
+    # 过滤掉Redis里值为0的错误实体
+    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
     # #依赖句子顺序
     # start_time = time.time() # 公告类型/生命周期提取  此处作废 换到后面预测 2022/4/29
     # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
@@ -308,7 +310,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''公告无表格格式时,采购意向预测'''  #依赖 docchannel结果 依赖产品及prem
     '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中'''
-    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time)
+    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
 
     '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
     industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)

+ 34 - 8
BiddingKG/dl/interface/predictor.py

@@ -3235,7 +3235,7 @@ class ProductAttributesPredictor():
             list_sentence = list_sentences[0]
             list_entity = list_entitys[0]
             _data = product_attrs[1]['demand_info']['data']
-            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
+            re_bidding_time = re.compile("(采购|采购实施|预计招标)(时间|月份|日期)[::,].{0,2}$")
             order_times = []
             for entity in list_entity:
                 if entity.entity_type=='time':
@@ -3266,7 +3266,8 @@ class ProductAttributesPredictor():
             product_attrs[1]['demand_info']['data'] = _data
         return product_attrs
 
-    def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
+    def predict_by_text(self,product_attrs,html,list_outlines,product_list,page_time=""):
+        product_entity_list = list(set(product_list))
         list_outline = list_outlines[0]
         get_product_attrs = False
         for _outline in list_outline:
@@ -3281,12 +3282,22 @@ class ProductAttributesPredictor():
                 for key_value in key_value_list:
                     key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value)
                     temp = re.split("[::]",key_value)
-                    key = temp[-2]
+                    if len(temp)>2:
+                        if temp[0] in head_list:
+                            key = temp[0]
+                            value = "".join(temp[1:])
+                        else:
+                            key = temp[-2]
+                            value = temp[-1]
+                    else:
+                        key = temp[0]
+                        value = temp[1]
+
                     key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key)
-                    value = temp[-1]
                     head_list.append(key)
                     head_value_list.append(value)
                 head_set = set(head_list)
+                # print(head_list,head_value_list)
                 # print('head_set',head_set)
                 if len(head_set & self.header_set) > len(head_set)*0.2:
                     loop_list = []
@@ -3298,6 +3309,7 @@ class ProductAttributesPredictor():
                             begin_list.append(index)
                             loop_list = []
                             loop_list.append(head)
+                    # print(begin_list)
                     headers = []
                     headers_demand = []
                     header_col = []
@@ -3324,8 +3336,9 @@ class ProductAttributesPredictor():
                         order_end = ""
                         total_price = ""  # 总金额
                         parameter = ""  # 参数
-
+                        # print(tmp_head_list)
                         header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
+                        # print(found_header,header_list, header_list2)
                         if found_header:
                             headers.append('_'.join(header_list))
                             headers_demand.append('_'.join(header_list2))
@@ -3353,6 +3366,14 @@ class ProductAttributesPredictor():
                                 category = deal_list[id0]
                                 product = "%s_%s" % (category, product) if product != "" else category
 
+                            if product == "":
+                                # print(deal_list[id4],deal_list[id5],tmp_head_list,deal_list)
+                                if deal_list[id4] != "" or deal_list[id5] != "":
+                                    for head,value in zip(tmp_head_list,deal_list):
+                                        if value in product_entity_list:
+                                            product = value
+                                            break
+
                             if product != "":
                                 if id2 != "":
                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
@@ -3422,7 +3443,7 @@ class ProductAttributesPredictor():
                                             if float(budget)>= 100000*10000:
                                                 budget = ""
                                 if id8 != "":
-                                    if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
+                                    if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计招标)(时间|月份|日期)",header_list2[3]):
                                         order_time = deal_list[id8].strip()
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
                                 if id9 != "":
@@ -3529,6 +3550,11 @@ class ProductAttributesPredictor():
                                     if link not in demand_link:
                                         demand_link.append(link)
 
+
+
+
+
+
                     if len(product_link) > 0:
                         attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}
                         get_product_attrs = True
@@ -3549,11 +3575,11 @@ class ProductAttributesPredictor():
 
 
 
-    def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time):
+    def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
         if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
             product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
         if len(product_attrs[0]['product_attrs']['data']) == 0:
-            product_attrs = self.predict_by_text(product_attrs,text,list_outlines,page_time)
+            product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
         if len(product_attrs[1]['demand_info']['data'])>0:
             for d in product_attrs[1]['demand_info']['data']:
                 for product in set(prem[0]['product']):