Parcourir la source

Merge remote-tracking branch 'origin/master'

lsm il y a 1 an
Parent
commit
32ad857d6e

+ 3 - 1
BiddingKG/dl/interface/extract.py

@@ -188,6 +188,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
+    # 过滤掉Redis里值为0的错误实体
+    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
     # #依赖句子顺序
     # start_time = time.time() # 公告类型/生命周期提取  此处作废 换到后面预测 2022/4/29
     # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
@@ -319,7 +321,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''公告无表格格式时,采购意向预测'''  #依赖 docchannel结果 依赖产品及prem
     '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中'''
-    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time)
+    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
 
     '''行业分类提取,需要用标题、项目名称、产品、及prem 里面的角色'''
     industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)

+ 62 - 11
BiddingKG/dl/interface/getAttributes.py

@@ -1552,8 +1552,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
                             continue
                         # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                        if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
+                        if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
                             continue
+                        if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
+                            if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
+                                                                         list_sentence[_object.sentence_index-1].sentence_text[-10:]+
+                                                                         list_sentence[_object.sentence_index].sentence_text[0:_object.wordOffset_begin]):
+                                continue
                         # 角色为中标候选人,排除距离过远的联系人
                         if _subject.label in [2, 3, 4] and distance>=40:
                             continue
@@ -1979,6 +1984,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
                                 if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
+                                if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
+                                    if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
+                                                                                 list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
+                                                                                 list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
+                                        continue
                                 if after_entity.label in [1, 2, 3]:
                                     # distance = (tokens_num_dict[
                                     #                 after_entity.sentence_index] + after_entity.begin_index) - (
@@ -3032,7 +3042,10 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_commencement':[] , #13 开工日期
         'time_completion': [],  # 14 竣工日期
         'time_listingStart': [],  # 15 挂牌开始日期(挂牌时间)
-        'time_listingEnd': []  # 16 挂牌结束日期、挂牌截止日期
+        'time_listingEnd': [],  # 16 挂牌结束日期、挂牌截止日期
+        'time_signContract': [],  # 17 合同签订时间
+        'time_contractStart': [],  # 18 合同开始时间
+        'time_contractEnd': []  # 19 合同结束时间
     }
     last_sentence_index = 0
     last_time_type = ""
@@ -3043,7 +3056,8 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_registrationStart':"time_registrationEnd",
         'time_earnestMoneyStart':"time_earnestMoneyEnd",
         'time_commencement':"time_completion",
-        'time_listingStart':"time_listingEnd"
+        'time_listingStart':"time_listingEnd",
+        'time_contractStart':"time_contractEnd"
     }
     for entity in time_entitys:
         sentence_text = list_sentence[entity.sentence_index].sentence_text
@@ -3174,7 +3188,32 @@ def getTimeAttributes(list_entity,list_sentence):
                     last_sentence_index = entity.sentence_index
                     continue
 
-            if re.search("至|到", entity_left):
+            # 2023/9/13 新增合同相关时间
+            if re.search("合同|服务|履[约行]", entity_left2):
+                if len(extract_time) == 1:
+                    if re.search("(合同.{,2}签[订定署].{,2}|签[订定署].{,2}合同.{,2})(?:时间|日期)|合同签[订定署].{,1}$", entity_left2):
+                        dict_time['time_signContract'].append((extract_time[0], 0.5, in_attachment))
+                        last_time_type = 'time_signContract'
+                    elif re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
+                        if re.search("到|至|截[至止]",entity_left) or re.search("前|止|截止",entity_right) or re.search("前",entity_text[-2:]):
+                            dict_time['time_contractEnd'].append((extract_time[0], 0.5, in_attachment))
+                            last_time_type = 'time_contractEnd'
+                        else:
+                            dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
+                            last_time_type = 'time_contractStart'
+                    elif re.search("(合同|服务|履约|(合同|服务)履行).{,2}(?:起始|开始)(?:时间|日期)", entity_left2):
+                        dict_time['time_contractStart'].append((extract_time[0], 0.55, in_attachment))
+                        last_time_type = 'time_contractStart'
+                    elif re.search("(合同|服务|履约).{,2}(?:完成|截止|结束)(?:时间|日期|时限)", entity_left2):
+                        dict_time['time_contractEnd'].append((extract_time[0], 0.55, in_attachment))
+                        last_time_type = 'time_contractEnd'
+                else:
+                    if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
+                        dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment))
+                        dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment))
+                        last_time_type = ''
+
+            if re.search("至|到|[日\d][-—]$", entity_left):
                 if entity.sentence_index == last_sentence_index:
                     time_type = last_time_index.get(last_time_type)
                     if time_type:
@@ -3349,8 +3388,9 @@ def getOtherAttributes(list_entity):
                 dict_other["moneysource"] = entity.entity_text
                 last_moneysource_prob = entity.prob
         elif entity.entity_type=='serviceTime':
-            if list_serviceTime and entity.in_attachment:
-                continue
+            # print(entity.entity_text)
+            # if list_serviceTime and entity.in_attachment:
+            #     continue
             if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
                 list_serviceTime.append(entity)
         elif entity.entity_type=="person" and entity.label ==4:
@@ -3361,11 +3401,22 @@ def getOtherAttributes(list_entity):
             dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
     if list_serviceTime:
-        list_serviceTime.sort(key=lambda x:x.prob,reverse=True)
-        max_prob = list_serviceTime[0].prob
-        max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
-        max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
-        dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
+        list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
+        list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
+        if not list_serviceTime:
+            list_serviceTime = list_serviceTime_inAtt
+        list_serviceTime.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
+        for _serviceTime in list_serviceTime:
+            # 优先取具体时间
+            if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}",_serviceTime.entity_text):
+                dict_other["serviceTime"] = _serviceTime.entity_text
+                break
+        if not dict_other["serviceTime"]:
+            max_prob = list_serviceTime[0].prob
+            max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
+            max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
+            dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
+
     if dict_other['moneysource']:
         dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
     # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留

+ 14 - 5
BiddingKG/dl/interface/predictor.py

@@ -3358,7 +3358,7 @@ class ProductAttributesPredictor():
             list_sentence = list_sentences[0]
             list_entity = list_entitys[0]
             _data = product_attrs[1]['demand_info']['data']
-            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
+            re_bidding_time = re.compile("(采购|采购实施|预计招标)(时间|月份|日期)[::,].{0,2}$")
             order_times = []
             for entity in list_entity:
                 if entity.entity_type=='time':
@@ -3390,7 +3390,8 @@ class ProductAttributesPredictor():
         # print('predict_without_table: ', product_attrs)
         return product_attrs
 
-    def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
+    def predict_by_text(self,product_attrs,html,list_outlines,product_list,page_time=""):
+        product_entity_list = list(set(product_list))
         list_outline = list_outlines[0]
         get_product_attrs = False
         for _outline in list_outline:
@@ -3491,6 +3492,14 @@ class ProductAttributesPredictor():
                                 category = deal_list[id0]
                                 product = "%s_%s" % (category, product) if product != "" else category
 
+                            if product == "":
+                                # print(deal_list[id4],deal_list[id5],tmp_head_list,deal_list)
+                                if (id4 != "" and deal_list[id4] != "") or (id5 != "" and deal_list[id5] != ""):
+                                    for head,value in zip(tmp_head_list,deal_list):
+                                        if value and value in product_entity_list:
+                                            product = value
+                                            break
+
                             if product != "":
                                 if id2 != "":
                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
@@ -3560,7 +3569,7 @@ class ProductAttributesPredictor():
                                             if float(budget)>= 100000*10000:
                                                 budget = ""
                                 if id8 != "":
-                                    if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
+                                    if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计招标)(时间|月份|日期)",header_list2[3]):
                                         order_time = deal_list[id8].strip()
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
                                 if id9 != "":
@@ -3688,11 +3697,11 @@ class ProductAttributesPredictor():
 
 
 
-    def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time):
+    def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
         if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
             product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
         if len(product_attrs[0]['product_attrs']['data']) == 0:
-            product_attrs = self.predict_by_text(product_attrs,text,list_outlines,page_time)
+            product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
         if len(product_attrs[1]['demand_info']['data'])>0:
             for d in product_attrs[1]['demand_info']['data']:
                 for product in set(prem[0]['product']):

+ 17 - 2
BiddingKG/dl/time/re_servicetime.py

@@ -242,10 +242,24 @@ def re_service_time(text):
     index2word = []
     for i in range(len(all_text_index_list)):
         word = text[all_text_index_list[i][0]:all_text_index_list[i][1]]
+        # print(word,text,all_text_index_list[i][0],all_text_index_list[i][1])
         if i != len(all_text_index_list)-1:
             word = word + " "
         index2word.append(word)
 
+        # 补充“服务期限12个月,自2022年10月1日至2023年9月30日。”类似数据
+        word2 = re.search("^[^。]{,8}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?.{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",text[all_text_index_list[i][1]:])
+        if not re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}",word) and word2:
+            word2 = word2.group()
+            word2 = re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?.{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",word2).group()
+            begin = all_text_index_list[i][1] + text[all_text_index_list[i][1]:].index(word2)
+            end = begin + len(word2)
+            # print(text[begin:end],"|",word2)
+            all_text_index_list.append([begin,end])
+            index2word.append(word2)
+
+
+    # print(index2word, all_text_index_list, prob)
     if TEST_MODE:
         print("index2word all_text_index_list", index2word, all_text_index_list)
     return index2word, all_text_index_list, prob
@@ -355,8 +369,9 @@ def test_from_str():
     # """
     # s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
     # s = "交货,1.交货时间:7天,2.交货地点:广东清远市清城区飞来峡镇人民政府高田应急安置点"
-    s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
-'''
+    s = "本项目服务期限12个月,自2022年10月1日至2023年9月30日。"
+#     s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
+# '''
     print(extract_servicetime(s))
     print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
 

+ 4 - 3
BiddingKG/dl_dev/test/test4.py

@@ -57,6 +57,7 @@ def test(name,content,_url=None):
 
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
     _url = "http://127.0.0.1:15030/content_extract"
+    _url = "http://192.168.2.102:15030/content_extract"
     # _url = "http://192.168.2.102:15030/industry_extract"
     # _url = "http://192.168.2.102:15030/content_extract"
 
@@ -99,16 +100,16 @@ def run_one():
     from BiddingKG.dl.interface.extract import predict
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
     # text = codecs.open("2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
-    # content = "招标人:广州比地数据科技有限公司"
     a = time.time()
     # text = '''
     # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
     # '''
+    print("start")
     _time1 = time.time()
-    print(predict("12", content+"抚顺经济开发区拉古经济区卫生院(抚顺经济开发区拉古经济区预防保健所)","打印机"))
+    print(predict("12", text,""))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-a)