Quellcode durchsuchen

截标时间提取优化、时分秒提取优化

znj vor 1 Jahr
Ursprung
Commit
42ae8dd269

+ 15 - 5
BiddingKG/dl/interface/Preprocessing.py

@@ -1964,9 +1964,11 @@ def article_limit(soup,limit_words=30000):
         _gap = _count - max_count
         _is_skip = False
         next_soup = None
+        # 跳过层级结构为1的标签,向下取值
         while len(_soup.find_all(recursive=False)) == 1 and \
                 _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
             _soup = _soup.find_all(recursive=False)[0]
+        # 无结构的纯文本直接取值
         if len(_soup.find_all(recursive=False)) == 0:
             _soup.string = str(_soup.get_text())[:max_count-_count]
             _count += len(re.sub(sub_space, "", _soup.string))
@@ -2000,22 +2002,24 @@ def article_limit(soup,limit_words=30000):
                 have_attachment = True
                 break
     if not have_attachment:
-        # 无附件
+        # 无附件,通过get_text()方法与limit_words大小判断是否要限制字数
         if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
-            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
+            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=1000)
             while n_soup:
-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
 
     else:
         # 有附件
         _text = re.sub(sub_space, "", soup.get_text())
         _text_split = _text.split("##attachment##")
+        # 正文部分
         if len(_text_split[0])>limit_words:
             main_soup = attachment_part.parent
             main_text = main_soup.find_all(recursive=False)[0]
-            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
+            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=1000)
             while n_soup:
-                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=1000)
+        # 附件部分
         if len(_text_split[1])>limit_words:
             # attachment_html纯文本,无子结构
             if len(attachment_part.find_all(recursive=False))==0:
@@ -2035,6 +2039,12 @@ def article_limit(soup,limit_words=30000):
                                         attachment_skip = True
                                 else:
                                     p_part.decompose()
+                            # attachment_text_nums, gap, n_part = soup_limit(part, attachment_text_nums,
+                            #                                     max_count=limit_words,max_gap=1000)
+                            # while n_part:
+                            #     attachment_text_nums, gap, n_part = soup_limit(n_part, attachment_text_nums,
+                            #                                         max_count=limit_words,max_gap=1000)
+                            # print(attachment_text_nums)
                         else:
                             last_attachment_text_nums = attachment_text_nums
                             attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))

+ 2 - 2
BiddingKG/dl/interface/extract.py

@@ -272,7 +272,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     entityLink.link_entitys(list_entitys)
     doctitle_refine = entityLink.doctitle_refine(title)
     nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
@@ -317,7 +317,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
-    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0],page_time))
 
     '''更新单一来源招标公告中标角色为预中标'''
     getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)

+ 53 - 45
BiddingKG/dl/interface/getAttributes.py

@@ -1417,7 +1417,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     last_phone_mask = False
                     continue
                 # 排除号码实体为时间格式 ,例如:20150515
-                if re.search("^20(1[0-9]|2[0-2])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
+                if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
                     error_numStr_index.append(numStr_index)
                     last_phone_mask = False
                     continue
@@ -2964,9 +2964,12 @@ def turnMoneySource(moneysource):
 
 my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
-import time
-def my_timeFormat(_time):
-    current_year = time.strftime("%Y",time.localtime())
+import time,datetime
+def my_timeFormat(_time,page_time):
+    if page_time:
+        current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
+    else:
+        current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(my_time_format_pattern,_time)
     time_list = []
     for _match in all_match:
@@ -2986,10 +2989,10 @@ def my_timeFormat(_time):
                 if re.search("^\d+$", year):
                     if len(year) == 2:
                         year = "20" + year
-                        if int(year) - int(current_year) > 5:
+                        if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
                             legal = False
                     else:
-                        if int(year) - int(current_year)>10:
+                        if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
                             legal = False
                 else:
                     _year = ""
@@ -3035,7 +3038,7 @@ def my_timeFormat(_time):
                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
     return time_list
 
-def getTimeAttributes(list_entity,list_sentence):
+def getTimeAttributes(list_entity,list_sentence,page_time):
     time_entitys = [i for i in list_entity if i.entity_type=='time']
     time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
     list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
@@ -3104,24 +3107,29 @@ def getTimeAttributes(list_entity,list_sentence):
             last_time_type = ""
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
         entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
-        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
+        entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
-        extract_time = my_timeFormat(entity_text)
+        extract_time = my_timeFormat(entity_text,page_time)
         # print(entity_text,entity_left2)
-        # definite_time = "00:00:00"
         if extract_time:
             definite_time_list = []
-            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
+            t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
             _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
+            _entity_text_len = len(_entity_text)
+            _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
             t_in_word_num = len(re.findall(t,_entity_text))
-            t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
+            # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
             begin_index = 0
             for _num in range(t_in_word_num):
+                if begin_index> _entity_text_len + 8:
+                    break
                 t_in_word = re.search(t, _entity_text[begin_index:])
                 if t_in_word:
+                    if _num==0 and t_in_word.start() > _entity_text_len + 8:
+                        break
                     begin_index = t_in_word.end()
                     # print('t_in_word',entity_text,t_in_word.groupdict())
                     day = t_in_word.groupdict().get('day',"")
@@ -3151,35 +3159,35 @@ def getTimeAttributes(list_entity,list_sentence):
                     # print(definite_time)
                     definite_time_list.append(definite_time)
 
-            if t_out_of_word:
-                # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
-                day = t_out_of_word.groupdict().get('day', "")
-                hour = t_out_of_word.groupdict().get('hour', "")
-                half_hour = t_out_of_word.groupdict().get('half_hour', "")
-                minute = t_out_of_word.groupdict().get('minute', "")
-                second = t_out_of_word.groupdict().get('second', "")
-                if hour:
-                    if day == '下午' and int(hour) < 12:
-                        hour = str(int(hour) + 12)
-                    if int(hour) > 24:
-                        continue
-                else:
-                    hour = "00"
-                if not minute:
-                    if half_hour:
-                        minute = "30"
-                    else:
-                        minute = "00"
-                if int(minute) > 60:
-                    continue
-                if not second:
-                    second = "00"
-                if int(second) > 60:
-                    continue
-                definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
-                # print(definite_time)
-                definite_time_list.append(definite_time)
-
+            # if t_out_of_word:
+            #     # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
+            #     day = t_out_of_word.groupdict().get('day', "")
+            #     hour = t_out_of_word.groupdict().get('hour', "")
+            #     half_hour = t_out_of_word.groupdict().get('half_hour', "")
+            #     minute = t_out_of_word.groupdict().get('minute', "")
+            #     second = t_out_of_word.groupdict().get('second', "")
+            #     if hour:
+            #         if day == '下午' and int(hour) < 12:
+            #             hour = str(int(hour) + 12)
+            #         if int(hour) > 24:
+            #             continue
+            #     else:
+            #         hour = "00"
+            #     if not minute:
+            #         if half_hour:
+            #             minute = "30"
+            #         else:
+            #             minute = "00"
+            #     if int(minute) > 60:
+            #         continue
+            #     if not second:
+            #         second = "00"
+            #     if int(second) > 60:
+            #         continue
+            #     definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
+            #     # print(definite_time)
+            #     definite_time_list.append(definite_time)
+            #
 
             min_len = min(len(extract_time),len(definite_time_list))
             for i in range(min_len):
@@ -3489,7 +3497,7 @@ def getTimeAttributes(list_entity,list_sentence):
     return result_dict
 
 
-def getOtherAttributes(list_entity):
+def getOtherAttributes(list_entity,page_time):
     dict_other = {"moneysource":"",
                   "person_review":[],
                   "serviceTime":"",
@@ -3535,7 +3543,7 @@ def getOtherAttributes(list_entity):
                 for _serviceTime in list_time:
                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
                     if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
-                        _extract_time = my_timeFormat(_serviceTime.entity_text)
+                        _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
                         if _extract_time and len(_extract_time)==2:
                             # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
                             if _extract_time[0]!=_extract_time[1]:
@@ -3570,7 +3578,7 @@ def getOtherAttributes(list_entity):
 def getMoneyRange(RoleList):
     pass
 
-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
     '''
     @param:
         list_sentence:所有文章的句子list
@@ -3581,7 +3589,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
     for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
         RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
         result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
-                           **getTimeAttributes(list_entity, list_sentence),
+                           **getTimeAttributes(list_entity, list_sentence,page_time),
                            **{"fingerprint": list_article.fingerprint,
                               "match_enterprise": list_article.match_enterprise,
                               "match_enterprise_type": list_article.match_enterprise_type,