Ver Fonte

Merge remote-tracking branch 'origin/master'

lsm há 1 ano
pai
commit
9418dc9ae0

+ 3 - 3
BiddingKG/dl/fingerprint/documentFingerprint.py

@@ -13,9 +13,9 @@ def getHtmlText(sourceHtml):
         _href = _a.attrs.get("href","")
         if _href.find("www.bidizhaobiao.com")>0:
             _a.decompose()
-    richText = _soup.find("div",attrs={"class":"richTextFetch"})
-    if richText is not None:
-        richText.decompose()
+    # richText = _soup.find("div",attrs={"class":"richTextFetch"})
+    # if richText is not None:
+    #     richText.decompose()
     _text = _soup.get_text()
 
     _text = re.sub("\s*",'',_text)

+ 4 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -3195,13 +3195,13 @@ def getTimeAttributes(list_entity,list_sentence):
             # 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
             if entity.label in [2,3,9]:
                 if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
-                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+                    dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
                 if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
-                    dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
+                    dict_time['time_bidopen'].append((extract_time[0], label_prob, in_attachment))
                 if entity.label==3 and re.search("报名",entity_left3):
                     dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
                 if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
-                    dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
+                    dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
             if entity.label in [11, 3]:
                 if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
                     dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
@@ -3326,7 +3326,7 @@ def getTimeAttributes(list_entity,list_sentence):
                         dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
                         last_time_type = 'time_bidclose'
                     elif len(extract_time)==2:
-                        dict_time['time_bidstart'].append((extract_time[0], 0.5, in_attachment))
+                        dict_time['time_bidstart'].append((extract_time[0], 0.6, in_attachment))
                         dict_time['time_bidclose'].append((extract_time[1], label_prob, in_attachment))
                         last_time_type = 'time_bidclose'
                 elif entity.label==12 and label_prob>0.5: