il y a 1 an · 32ad857d6e
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -188,6 +188,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     cost_time["preprocess"] = round(time.time()-start_time,2)
			
 
				     cost_time.update(_cost_time)
			
 
				 
			
 
				+    # 过滤掉Redis里值为0的错误实体
			
 
				+    # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
			
 
				     # #依赖句子顺序
			
 
				     # start_time = time.time() # 公告类型/生命周期提取  此处作废 换到后面预测 2022/4/29
			
 
				     # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
			
@@ -319,7 +321,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     '''公告无表格格式时，采购意向预测'''  #依赖 docchannel结果 依赖产品及prem
			
 
				     '''把产品要素提取结果在项目名称的添加到 采购需求，预算时间，采购时间 要素中'''
			
 
				-    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time)
			
 
				+    predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time)
			
 
				 
			
 
				     '''行业分类提取，需要用标题、项目名称、产品、及prem 里面的角色'''
			
 
				     industry = predictor.getPredictor('industry').predict(title, project=codeName[0]['name'], product=','.join(product_list), prem=prem)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1552,8 +1552,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
			
 
				                             continue
			
 
				                         # 角色为中标候选人，排除"质疑|投诉|监督|受理"相关的联系人
			
 
				-                        if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
			
 
				+                        if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
			
 
				                             continue
			
 
				+                        if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
			
 
				+                            if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
			
 
				+                                                                         list_sentence[_object.sentence_index-1].sentence_text[-10:]+
			
 
				+                                                                         list_sentence[_object.sentence_index].sentence_text[0:_object.wordOffset_begin]):
			
 
				+                                continue
			
 
				                         # 角色为中标候选人，排除距离过远的联系人
			
 
				                         if _subject.label in [2, 3, 4] and distance>=40:
			
 
				                             continue
			
@@ -1979,6 +1984,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                                 # 角色为中标候选人，排除"质疑|投诉|监督|受理"相关的联系人
			
 
				                                 if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
			
 
				                                     break
			
 
				+                                if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
			
 
				+                                    if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
			
 
				+                                                                                 list_sentence[after_entity.sentence_index - 1].sentence_text[-10:] +
			
 
				+                                                                                 list_sentence[after_entity.sentence_index].sentence_text[0:after_entity.wordOffset_begin]):
			
 
				+                                        continue
			
 
				                                 if after_entity.label in [1, 2, 3]:
			
 
				                                     # distance = (tokens_num_dict[
			
 
				                                     #                 after_entity.sentence_index] + after_entity.begin_index) - (
			
@@ -3032,7 +3042,10 @@ def getTimeAttributes(list_entity,list_sentence):
 
				         'time_commencement':[] , #13 开工日期
			
 
				         'time_completion': [],  # 14 竣工日期
			
 
				         'time_listingStart': [],  # 15 挂牌开始日期（挂牌时间）
			
 
				-        'time_listingEnd': []  # 16 挂牌结束日期、挂牌截止日期
			
 
				+        'time_listingEnd': [],  # 16 挂牌结束日期、挂牌截止日期
			
 
				+        'time_signContract': [],  # 17 合同签订时间
			
 
				+        'time_contractStart': [],  # 18 合同开始时间
			
 
				+        'time_contractEnd': []  # 19 合同结束时间
			
 
				     }
			
 
				     last_sentence_index = 0
			
 
				     last_time_type = ""
			
@@ -3043,7 +3056,8 @@ def getTimeAttributes(list_entity,list_sentence):
 
				         'time_registrationStart':"time_registrationEnd",
			
 
				         'time_earnestMoneyStart':"time_earnestMoneyEnd",
			
 
				         'time_commencement':"time_completion",
			
 
				-        'time_listingStart':"time_listingEnd"
			
 
				+        'time_listingStart':"time_listingEnd",
			
 
				+        'time_contractStart':"time_contractEnd"
			
 
				     }
			
 
				     for entity in time_entitys:
			
 
				         sentence_text = list_sentence[entity.sentence_index].sentence_text
			
@@ -3174,7 +3188,32 @@ def getTimeAttributes(list_entity,list_sentence):
 
				                     last_sentence_index = entity.sentence_index
			
 
				                     continue
			
 
				 
			
 
				-            if re.search("至|到", entity_left):
			
 
				+            # 2023/9/13 新增合同相关时间
			
 
				+            if re.search("合同|服务|履[约行]", entity_left2):
			
 
				+                if len(extract_time) == 1:
			
 
				+                    if re.search("(合同.{,2}签[订定署].{,2}|签[订定署].{,2}合同.{,2})(?:时间|日期)|合同签[订定署].{,1}$", entity_left2):
			
 
				+                        dict_time['time_signContract'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                        last_time_type = 'time_signContract'
			
 
				+                    elif re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
			
 
				+                        if re.search("到|至|截[至止]",entity_left) or re.search("前|止|截止",entity_right) or re.search("前",entity_text[-2:]):
			
 
				+                            dict_time['time_contractEnd'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                            last_time_type = 'time_contractEnd'
			
 
				+                        else:
			
 
				+                            dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
			
 
				+                            last_time_type = 'time_contractStart'
			
 
				+                    elif re.search("(合同|服务|履约|(合同|服务)履行).{,2}(?:起始|开始)(?:时间|日期)", entity_left2):
			
 
				+                        dict_time['time_contractStart'].append((extract_time[0], 0.55, in_attachment))
			
 
				+                        last_time_type = 'time_contractStart'
			
 
				+                    elif re.search("(合同|服务|履约).{,2}(?:完成|截止|结束)(?:时间|日期|时限)", entity_left2):
			
 
				+                        dict_time['time_contractEnd'].append((extract_time[0], 0.55, in_attachment))
			
 
				+                        last_time_type = 'time_contractEnd'
			
 
				+                else:
			
 
				+                    if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
			
 
				+                        dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment))
			
 
				+                        dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment))
			
 
				+                        last_time_type = ''
			
 
				+
			
 
				+            if re.search("至|到|[日\d][-—]$", entity_left):
			
 
				                 if entity.sentence_index == last_sentence_index:
			
 
				                     time_type = last_time_index.get(last_time_type)
			
 
				                     if time_type:
			
@@ -3349,8 +3388,9 @@ def getOtherAttributes(list_entity):
 
				                 dict_other["moneysource"] = entity.entity_text
			
 
				                 last_moneysource_prob = entity.prob
			
 
				         elif entity.entity_type=='serviceTime':
			
 
				-            if list_serviceTime and entity.in_attachment:
			
 
				-                continue
			
 
				+            # print(entity.entity_text)
			
 
				+            # if list_serviceTime and entity.in_attachment:
			
 
				+            #     continue
			
 
				             if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
			
 
				                 list_serviceTime.append(entity)
			
 
				         elif entity.entity_type=="person" and entity.label ==4:
			
@@ -3361,11 +3401,22 @@ def getOtherAttributes(list_entity):
 
				             dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
			
 
				             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
			
 
				     if list_serviceTime:
			
 
				-        list_serviceTime.sort(key=lambda x:x.prob,reverse=True)
			
 
				-        max_prob = list_serviceTime[0].prob
			
 
				-        max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
			
 
				-        max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
			
 
				-        dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
			
 
				+        list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
			
 
				+        list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
			
 
				+        if not list_serviceTime:
			
 
				+            list_serviceTime = list_serviceTime_inAtt
			
 
				+        list_serviceTime.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
			
 
				+        for _serviceTime in list_serviceTime:
			
 
				+            # 优先取具体时间
			
 
				+            if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}",_serviceTime.entity_text):
			
 
				+                dict_other["serviceTime"] = _serviceTime.entity_text
			
 
				+                break
			
 
				+        if not dict_other["serviceTime"]:
			
 
				+            max_prob = list_serviceTime[0].prob
			
 
				+            max_prob_serviceTime = [ent for ent in list_serviceTime if ent.prob==max_prob]
			
 
				+            max_prob_serviceTime.sort(key=lambda x:(x.sentence_index,x.begin_index))
			
 
				+            dict_other["serviceTime"] = max_prob_serviceTime[0].entity_text
			
 
				+
			
 
				     if dict_other['moneysource']:
			
 
				         dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
			
 
				     # dict_other["product"] = list(set(dict_other["product"])) # 已在添加时 顺序去重保留
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -3358,7 +3358,7 @@ class ProductAttributesPredictor():
 
				             list_sentence = list_sentences[0]
			
 
				             list_entity = list_entitys[0]
			
 
				             _data = product_attrs[1]['demand_info']['data']
			
 
				-            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[:：，].{0,2}$")
			
 
				+            re_bidding_time = re.compile("(采购|采购实施|预计招标)(时间|月份|日期)[:：，].{0,2}$")
			
 
				             order_times = []
			
 
				             for entity in list_entity:
			
 
				                 if entity.entity_type=='time':
			
@@ -3390,7 +3390,8 @@ class ProductAttributesPredictor():
 
				         # print('predict_without_table: ', product_attrs)
			
 
				         return product_attrs
			
 
				 
			
 
				-    def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
			
 
				+    def predict_by_text(self,product_attrs,html,list_outlines,product_list,page_time=""):
			
 
				+        product_entity_list = list(set(product_list))
			
 
				         list_outline = list_outlines[0]
			
 
				         get_product_attrs = False
			
 
				         for _outline in list_outline:
			
@@ -3491,6 +3492,14 @@ class ProductAttributesPredictor():
 
				                                 category = deal_list[id0]
			
 
				                                 product = "%s_%s" % (category, product) if product != "" else category
			
 
				 
			
 
				+                            if product == "":
			
 
				+                                # print(deal_list[id4],deal_list[id5],tmp_head_list,deal_list)
			
 
				+                                if (id4 != "" and deal_list[id4] != "") or (id5 != "" and deal_list[id5] != ""):
			
 
				+                                    for head,value in zip(tmp_head_list,deal_list):
			
 
				+                                        if value and value in product_entity_list:
			
 
				+                                            product = value
			
 
				+                                            break
			
 
				+
			
 
				                             if product != "":
			
 
				                                 if id2 != "":
			
 
				                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
			
@@ -3560,7 +3569,7 @@ class ProductAttributesPredictor():
 
				                                             if float(budget)>= 100000*10000:
			
 
				                                                 budget = ""
			
 
				                                 if id8 != "":
			
 
				-                                    if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
			
 
				+                                    if re.search('\w', deal_list[id8]) and re.search("(采购|采购实施|预计招标)(时间|月份|日期)",header_list2[3]):
			
 
				                                         order_time = deal_list[id8].strip()
			
 
				                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
			
 
				                                 if id9 != "":
			
@@ -3688,11 +3697,11 @@ class ProductAttributesPredictor():
 
				 
			
 
				 
			
 
				 
			
 
				-    def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time):
			
 
				+    def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
			
 
				         if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
			
 
				             product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
			
 
				         if len(product_attrs[0]['product_attrs']['data']) == 0:
			
 
				-            product_attrs = self.predict_by_text(product_attrs,text,list_outlines,page_time)
			
 
				+            product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
			
 
				         if len(product_attrs[1]['demand_info']['data'])>0:
			
 
				             for d in product_attrs[1]['demand_info']['data']:
			
 
				                 for product in set(prem[0]['product']):
			
--- a/BiddingKG/dl/time/re_servicetime.py
+++ b/BiddingKG/dl/time/re_servicetime.py
@@ -242,10 +242,24 @@ def re_service_time(text):
 
				     index2word = []
			
 
				     for i in range(len(all_text_index_list)):
			
 
				         word = text[all_text_index_list[i][0]:all_text_index_list[i][1]]
			
 
				+        # print(word,text,all_text_index_list[i][0],all_text_index_list[i][1])
			
 
				         if i != len(all_text_index_list)-1:
			
 
				             word = word + " "
			
 
				         index2word.append(word)
			
 
				 
			
 
				+        # 补充“服务期限12个月，自2022年10月1日至2023年9月30日。”类似数据
			
 
				+        word2 = re.search("^[^。]{,8}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?.{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",text[all_text_index_list[i][1]:])
			
 
				+        if not re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}",word) and word2:
			
 
				+            word2 = word2.group()
			
 
				+            word2 = re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?.{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",word2).group()
			
 
				+            begin = all_text_index_list[i][1] + text[all_text_index_list[i][1]:].index(word2)
			
 
				+            end = begin + len(word2)
			
 
				+            # print(text[begin:end],"|",word2)
			
 
				+            all_text_index_list.append([begin,end])
			
 
				+            index2word.append(word2)
			
 
				+
			
 
				+
			
 
				+    # print(index2word, all_text_index_list, prob)
			
 
				     if TEST_MODE:
			
 
				         print("index2word all_text_index_list", index2word, all_text_index_list)
			
 
				     return index2word, all_text_index_list, prob
			
@@ -355,8 +369,9 @@ def test_from_str():
 
				     # """
			
 
				     # s = "5元/年 服务期：交付使用之日起三年； 承诺服务等级"
			
 
				     # s = "交货，1.交货时间：7天，2.交货地点：广东清远市清城区飞来峡镇人民政府高田应急安置点"
			
 
				-    s = '''，莆田市财政局走廊及卫生间吊顶改造工程中标结果公告，莆田市财政局走廊及卫生间吊顶改造工程，工程预算价236878元，发包价194240元，招标编号为：宏福莆招字【2020】H001号，该项目招标方式为：邀请招标。2020年04月07日开标，2020年04月07日评标完成，中标主要结果公示如下：中标人名称，福建省东海伟业建设有限公司，中标价:194240元，评标办法，随机抽取法，资格评审结果，注册建造师：合格：余爱华(注册编号：闽235141578763)，履约保证金(元)：合格：合同金额的10%，施工工期：14日历天，工程质量，备注，被确定为废标、无效标的投标人及原因：合格：无废标，资格审查小组：合格：王宗仙、林慧灵、谢淑青，根据评标结果确定福建省东海伟业建设有限公司为中标人，现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为，可在公示期内向相关主管部门投诉，招标单位：招标代理机构：莆田市财政局，福建省宏福工程管理有限公司，联系电话：0594-2694413，联系电话：15160467775，2020年04月08日，2020年04月08日，
			
 
				-'''
			
 
				+    s = "本项目服务期限12个月，自2022年10月1日至2023年9月30日。"
			
 
				+#     s = '''，莆田市财政局走廊及卫生间吊顶改造工程中标结果公告，莆田市财政局走廊及卫生间吊顶改造工程，工程预算价236878元，发包价194240元，招标编号为：宏福莆招字【2020】H001号，该项目招标方式为：邀请招标。2020年04月07日开标，2020年04月07日评标完成，中标主要结果公示如下：中标人名称，福建省东海伟业建设有限公司，中标价:194240元，评标办法，随机抽取法，资格评审结果，注册建造师：合格：余爱华(注册编号：闽235141578763)，履约保证金(元)：合格：合同金额的10%，施工工期：14日历天，工程质量，备注，被确定为废标、无效标的投标人及原因：合格：无废标，资格审查小组：合格：王宗仙、林慧灵、谢淑青，根据评标结果确定福建省东海伟业建设有限公司为中标人，现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为，可在公示期内向相关主管部门投诉，招标单位：招标代理机构：莆田市财政局，福建省宏福工程管理有限公司，联系电话：0594-2694413，联系电话：15160467775，2020年04月08日，2020年04月08日，
			
 
				+# '''
			
 
				     print(extract_servicetime(s))
			
 
				     print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-～~起至到—]+\d{2,4}[-.年/]', s))
			
 
				 
			
--- a/BiddingKG/dl_dev/test/test4.py
+++ b/BiddingKG/dl_dev/test/test4.py
@@ -57,6 +57,7 @@ def test(name,content,_url=None):
 
				 
			
 
				     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
			
 
				     _url = "http://127.0.0.1:15030/content_extract"
			
 
				+    _url = "http://192.168.2.102:15030/content_extract"
			
 
				     # _url = "http://192.168.2.102:15030/industry_extract"
			
 
				     # _url = "http://192.168.2.102:15030/content_extract"
			
 
				 
			
@@ -99,16 +100,16 @@ def run_one():
 
				     from BiddingKG.dl.interface.extract import predict
			
 
				     # filename = "比地_52_79929693.html"
			
 
				     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
			
 
				-    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
			
 
				+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt","r",encoding="utf8").read()
			
 
				     # text = codecs.open("2.html","r",encoding="utf8").read()
			
 
				     content = str(BeautifulSoup(text).find("div",id="pcontent"))
			
 
				-    # content = "招标人：广州比地数据科技有限公司"
			
 
				     a = time.time()
			
 
				     # text = '''
			
 
				     # 购安装工程二标段，第一中标候选人，投标人名称，南阳市宝琛装饰工程有限责任公司，投标报价:147892
			
 
				     # '''
			
 
				+    print("start")
			
 
				     _time1 = time.time()
			
 
				-    print(predict("12", content+"抚顺经济开发区拉古经济区卫生院（抚顺经济开发区拉古经济区预防保健所）","打印机"))
			
 
				+    print(predict("12", text,""))
			
 
				     # test(12,content)
			
 
				     # test(12,text)
			
 
				     print("takes",time.time()-a)