vor 1 Jahr · 42ae8dd269
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1964,9 +1964,11 @@ def article_limit(soup,limit_words=30000):
 
				         _gap = _count - max_count
			
 
				         _is_skip = False
			
 
				         next_soup = None
			
 
				+        # 跳过层级结构为1的标签，向下取值
			
 
				         while len(_soup.find_all(recursive=False)) == 1 and \
			
 
				                 _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
			
 
				             _soup = _soup.find_all(recursive=False)[0]
			
 
				+        # 无结构的纯文本直接取值
			
 
				         if len(_soup.find_all(recursive=False)) == 0:
			
 
				             _soup.string = str(_soup.get_text())[:max_count-_count]
			
 
				             _count += len(re.sub(sub_space, "", _soup.string))
			
@@ -2000,22 +2002,24 @@ def article_limit(soup,limit_words=30000):
 
				                 have_attachment = True
			
 
				                 break
			
 
				     if not have_attachment:
			
 
				-        # 无附件
			
 
				+        # 无附件，通过get_text()方法与limit_words大小判断是否要限制字数
			
 
				         if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
			
 
				-            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
			
 
				+            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=1000)
			
 
				             while n_soup:
			
 
				-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
			
 
				+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
			
 
				 
			
 
				     else:
			
 
				         # 有附件
			
 
				         _text = re.sub(sub_space, "", soup.get_text())
			
 
				         _text_split = _text.split("##attachment##")
			
 
				+        # 正文部分
			
 
				         if len(_text_split[0])>limit_words:
			
 
				             main_soup = attachment_part.parent
			
 
				             main_text = main_soup.find_all(recursive=False)[0]
			
 
				-            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
			
 
				+            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=1000)
			
 
				             while n_soup:
			
 
				-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
			
 
				+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
			
 
				+        # 附件部分
			
 
				         if len(_text_split[1])>limit_words:
			
 
				             # attachment_html纯文本，无子结构
			
 
				             if len(attachment_part.find_all(recursive=False))==0:
			
@@ -2035,6 +2039,12 @@ def article_limit(soup,limit_words=30000):
 
				                                         attachment_skip = True
			
 
				                                 else:
			
 
				                                     p_part.decompose()
			
 
				+                            # attachment_text_nums, gap, n_part = soup_limit(part, attachment_text_nums,
			
 
				+                            #                                     max_count=limit_words,max_gap=1000)
			
 
				+                            # while n_part:
			
 
				+                            #     attachment_text_nums, gap, n_part = soup_limit(n_part, attachment_text_nums,
			
 
				+                            #                                         max_count=limit_words,max_gap=1000)
			
 
				+                            # print(attachment_text_nums)
			
 
				                         else:
			
 
				                             last_attachment_text_nums = attachment_text_nums
			
 
				                             attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -272,7 +272,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     entityLink.link_entitys(list_entitys)
			
 
				     doctitle_refine = entityLink.doctitle_refine(title)
			
 
				     nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
			
 
				-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
			
 
				+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
			
 
				     log("get attributes done of doc_id%s"%(doc_id))
			
 
				     cost_time["attrs"] = round(time.time()-start_time,2)
			
 
				 
			
@@ -317,7 +317,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
			
 
				     log("get product done of doc_id%s"%(doc_id))
			
 
				     cost_time["product"] = round(time.time()-start_time,2)
			
 
				-    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
			
 
				+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time))
			
 
				 
			
 
				     '''更新单一来源招标公告中标角色为预中标'''
			
 
				     getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -1417,7 +1417,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
				                     last_phone_mask = False
			
 
				                     continue
			
 
				                 # 排除号码实体为时间格式 ，例如：20150515
			
 
				-                if re.search("^20(1[0-9]|2[0-2])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
			
 
				+                if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
			
 
				                     error_numStr_index.append(numStr_index)
			
 
				                     last_phone_mask = False
			
 
				                     continue
			
@@ -2964,9 +2964,12 @@ def turnMoneySource(moneysource):
 
				 
			
 
				 my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
			
 
				 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
			
 
				-import time
			
 
				-def my_timeFormat(_time):
			
 
				-    current_year = time.strftime("%Y",time.localtime())
			
 
				+import time,datetime
			
 
				+def my_timeFormat(_time,page_time):
			
 
				+    if page_time:
			
 
				+        current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
			
 
				+    else:
			
 
				+        current_year = time.strftime("%Y",time.localtime())
			
 
				     all_match = re.finditer(my_time_format_pattern,_time)
			
 
				     time_list = []
			
 
				     for _match in all_match:
			
@@ -2986,10 +2989,10 @@ def my_timeFormat(_time):
 
				                 if re.search("^\d+$", year):
			
 
				                     if len(year) == 2:
			
 
				                         year = "20" + year
			
 
				-                        if int(year) - int(current_year) > 5:
			
 
				+                        if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
			
 
				                             legal = False
			
 
				                     else:
			
 
				-                        if int(year) - int(current_year)>10:
			
 
				+                        if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
			
 
				                             legal = False
			
 
				                 else:
			
 
				                     _year = ""
			
@@ -3035,7 +3038,7 @@ def my_timeFormat(_time):
 
				                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
			
 
				     return time_list
			
 
				 
			
 
				-def getTimeAttributes(list_entity,list_sentence):
			
 
				+def getTimeAttributes(list_entity,list_sentence,page_time):
			
 
				     time_entitys = [i for i in list_entity if i.entity_type=='time']
			
 
				     time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
			
 
				     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
			
@@ -3104,24 +3107,29 @@ def getTimeAttributes(list_entity,list_sentence):
 
				             last_time_type = ""
			
 
				         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
			
 
				         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
			
 
				-        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
			
 
				+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
			
 
				         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
			
 
				         label_prob = entity.values[entity.label]
			
 
				         entity_text = entity.entity_text
			
 
				         in_attachment = entity.in_attachment
			
 
				-        extract_time = my_timeFormat(entity_text)
			
 
				+        extract_time = my_timeFormat(entity_text,page_time)
			
 
				         # print(entity_text,entity_left2)
			
 
				-        # definite_time = "00:00:00"
			
 
				         if extract_time:
			
 
				             definite_time_list = []
			
 
				-            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[:：时点](?P<half_hour>半)?(?P<minute>\d{2})?[:：分]?(?P<second>\d{2})?秒?")
			
 
				+            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[:：时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[:：分]?(?P<second>\d{2})?秒?")
			
 
				             _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
			
 
				+            _entity_text_len = len(_entity_text)
			
 
				+            _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
			
 
				             t_in_word_num = len(re.findall(t,_entity_text))
			
 
				-            t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
			
 
				+            # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
			
 
				             begin_index = 0
			
 
				             for _num in range(t_in_word_num):
			
 
				+                if begin_index> _entity_text_len + 8:
			
 
				+                    break
			
 
				                 t_in_word = re.search(t, _entity_text[begin_index:])
			
 
				                 if t_in_word:
			
 
				+                    if _num==0 and t_in_word.start() > _entity_text_len + 8:
			
 
				+                        break
			
 
				                     begin_index = t_in_word.end()
			
 
				                     # print('t_in_word',entity_text,t_in_word.groupdict())
			
 
				                     day = t_in_word.groupdict().get('day',"")
			
@@ -3151,35 +3159,35 @@ def getTimeAttributes(list_entity,list_sentence):
 
				                     # print(definite_time)
			
 
				                     definite_time_list.append(definite_time)
			
 
				 
			
 
				-            if t_out_of_word:
			
 
				-                # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
			
 
				-                day = t_out_of_word.groupdict().get('day', "")
			
 
				-                hour = t_out_of_word.groupdict().get('hour', "")
			
 
				-                half_hour = t_out_of_word.groupdict().get('half_hour', "")
			
 
				-                minute = t_out_of_word.groupdict().get('minute', "")
			
 
				-                second = t_out_of_word.groupdict().get('second', "")
			
 
				-                if hour:
			
 
				-                    if day == '下午' and int(hour) < 12:
			
 
				-                        hour = str(int(hour) + 12)
			
 
				-                    if int(hour) > 24:
			
 
				-                        continue
			
 
				-                else:
			
 
				-                    hour = "00"
			
 
				-                if not minute:
			
 
				-                    if half_hour:
			
 
				-                        minute = "30"
			
 
				-                    else:
			
 
				-                        minute = "00"
			
 
				-                if int(minute) > 60:
			
 
				-                    continue
			
 
				-                if not second:
			
 
				-                    second = "00"
			
 
				-                if int(second) > 60:
			
 
				-                    continue
			
 
				-                definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
			
 
				-                # print(definite_time)
			
 
				-                definite_time_list.append(definite_time)
			
 
				-
			
 
				+            # if t_out_of_word:
			
 
				+            #     # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
			
 
				+            #     day = t_out_of_word.groupdict().get('day', "")
			
 
				+            #     hour = t_out_of_word.groupdict().get('hour', "")
			
 
				+            #     half_hour = t_out_of_word.groupdict().get('half_hour', "")
			
 
				+            #     minute = t_out_of_word.groupdict().get('minute', "")
			
 
				+            #     second = t_out_of_word.groupdict().get('second', "")
			
 
				+            #     if hour:
			
 
				+            #         if day == '下午' and int(hour) < 12:
			
 
				+            #             hour = str(int(hour) + 12)
			
 
				+            #         if int(hour) > 24:
			
 
				+            #             continue
			
 
				+            #     else:
			
 
				+            #         hour = "00"
			
 
				+            #     if not minute:
			
 
				+            #         if half_hour:
			
 
				+            #             minute = "30"
			
 
				+            #         else:
			
 
				+            #             minute = "00"
			
 
				+            #     if int(minute) > 60:
			
 
				+            #         continue
			
 
				+            #     if not second:
			
 
				+            #         second = "00"
			
 
				+            #     if int(second) > 60:
			
 
				+            #         continue
			
 
				+            #     definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
			
 
				+            #     # print(definite_time)
			
 
				+            #     definite_time_list.append(definite_time)
			
 
				+            #
			
 
				 
			
 
				             min_len = min(len(extract_time),len(definite_time_list))
			
 
				             for i in range(min_len):
			
@@ -3489,7 +3497,7 @@ def getTimeAttributes(list_entity,list_sentence):
 
				     return result_dict
			
 
				 
			
 
				 
			
 
				-def getOtherAttributes(list_entity):
			
 
				+def getOtherAttributes(list_entity,page_time):
			
 
				     dict_other = {"moneysource":"",
			
 
				                   "person_review":[],
			
 
				                   "serviceTime":"",
			
@@ -3535,7 +3543,7 @@ def getOtherAttributes(list_entity):
 
				                 for _serviceTime in list_time:
			
 
				                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
			
 
				                     if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;；]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
			
 
				-                        _extract_time = my_timeFormat(_serviceTime.entity_text)
			
 
				+                        _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
			
 
				                         if _extract_time and len(_extract_time)==2:
			
 
				                             # 排除开始和结束时间一样的错误模板，例：“履约期限：2023年02月15日至2023年02月15日”
			
 
				                             if _extract_time[0]!=_extract_time[1]:
			
@@ -3570,7 +3578,7 @@ def getOtherAttributes(list_entity):
 
				 def getMoneyRange(RoleList):
			
 
				     pass
			
 
				 
			
 
				-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
			
 
				+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
			
 
				     '''
			
 
				     @param:
			
 
				         list_sentence:所有文章的句子list
			
@@ -3581,7 +3589,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
 
				     for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
			
 
				         RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
			
 
				         result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
			
 
				-                           **getTimeAttributes(list_entity, list_sentence),
			
 
				+                           **getTimeAttributes(list_entity, list_sentence,page_time),
			
 
				                            **{"fingerprint": list_article.fingerprint,
			
 
				                               "match_enterprise": list_article.match_enterprise,
			
 
				                               "match_enterprise_type": list_article.match_enterprise_type,