|
@@ -2224,6 +2224,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
for article in articles:
|
|
for article in articles:
|
|
doc_id = article[0]
|
|
doc_id = article[0]
|
|
sourceContent = article[1]
|
|
sourceContent = article[1]
|
|
|
|
+ sourceContent_raw = article[1] # 原始html数据,fingerprint计算MD5用
|
|
sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
|
|
sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
|
|
sourceContent = re.sub("##attachment##","",sourceContent)
|
|
sourceContent = re.sub("##attachment##","",sourceContent)
|
|
sourceContent = sourceContent.replace('<br/>', '<br>')
|
|
sourceContent = sourceContent.replace('<br/>', '<br>')
|
|
@@ -2237,6 +2238,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
# sourceContent = sourceContent.replace(br_match,_new,1)
|
|
# sourceContent = sourceContent.replace(br_match,_new,1)
|
|
_send_doc_id = article[3]
|
|
_send_doc_id = article[3]
|
|
_title = article[4]
|
|
_title = article[4]
|
|
|
|
+ _title_raw = article[4]
|
|
page_time = article[5]
|
|
page_time = article[5]
|
|
web_source_no = article[6]
|
|
web_source_no = article[6]
|
|
'''特别数据源对 html 做特别修改'''
|
|
'''特别数据源对 html 做特别修改'''
|
|
@@ -2364,7 +2366,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
#article_processed = article[1]
|
|
#article_processed = article[1]
|
|
_article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title,
|
|
_article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title,
|
|
bidway=bidway)
|
|
bidway=bidway)
|
|
- _article.fingerprint = getFingerprint(_title+sourceContent)
|
|
|
|
|
|
+ _article.fingerprint = getFingerprint(_title_raw+sourceContent_raw)
|
|
_article.page_time = page_time
|
|
_article.page_time = page_time
|
|
list_articles.append(_article)
|
|
list_articles.append(_article)
|
|
return list_articles
|
|
return list_articles
|
|
@@ -2719,7 +2721,19 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
|
|
continue
|
|
continue
|
|
money_list.append((entity_text, start_index, end_index, unit, notes))
|
|
money_list.append((entity_text, start_index, end_index, unit, notes))
|
|
return money_list, found_yeji
|
|
return money_list, found_yeji
|
|
-
|
|
|
|
|
|
def cut_repeat_name(s):
    '''
    Collapse a company name that consists of the same name repeated
    back-to-back, e.g. "ABCDEF" + "ABCDEF" + "ABCDEF" -> "ABCDEF".

    The whole string must be an exact repetition (at least twice) of a unit
    that is at least four characters long; otherwise the input is returned
    unchanged. When several unit lengths qualify, the shortest one is used.

    :param s: entity text to check
    :return: the single repeating unit if ``s`` is a pure repetition of it,
             else ``s`` itself
    '''
    # Units shorter than four characters are never collapsed; this keeps the
    # threshold of the earlier suffix-based implementation, which anchored on
    # the last four characters of the string.
    min_unit_len = 4
    total_len = len(s)
    # A unit repeated at least twice can be at most half of the string, so
    # strings shorter than 2 * min_unit_len fall straight through.
    for unit_len in range(min_unit_len, total_len // 2 + 1):
        if total_len % unit_len:
            continue
        unit = s[:unit_len]
        # Compare against the full tiling rather than locating the unit via
        # its trailing characters: a name may contain its own ending more than
        # once (e.g. "...有限公司管理有限公司"), which would cut the unit short.
        if unit * (total_len // unit_len) == s:
            return unit
    return s
|
|
def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
'''
|
|
'''
|
|
|
|
|
|
@@ -2913,6 +2927,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
# print('公司实体不符合规范:', entity_text)
|
|
# print('公司实体不符合规范:', entity_text)
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
+ entity_text = cut_repeat_name(entity_text) # 20231201 重复名称去重 如:中山大学附属第一医院中山大学附属第一医院中山大学附属第一医院
|
|
|
|
+
|
|
list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
|
|
list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
|
|
# 标记文章末尾的"发布人”、“发布时间”实体
|
|
# 标记文章末尾的"发布人”、“发布时间”实体
|
|
if sentence_index==len(list_sentence)-1 or sentence_index==doctextcon_sentence_len-1:
|
|
if sentence_index==len(list_sentence)-1 or sentence_index==doctextcon_sentence_len-1:
|