@@ -1019,6 +1019,112 @@ def tableToText(soup):
    return soup
    # return list_innerTable

+re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
+num_dict = {
+    "一": 1, "二": 2,
+    "三": 3, "四": 4,
+    "五": 5, "六": 6,
+    "七": 7, "八": 8,
+    "九": 9, "十": 10}
+# Convert Chinese numerals below one hundred to integers
+def change2num(text):
+    result_num = -1
+    # text = text[:6]
+    match = re_num.search(text)
+    if match:
+        _num = match.group()
+        if num_dict.get(_num):
+            return num_dict.get(_num)
+        else:
+            tenths = 1
+            the_unit = 0
+            num_split = _num.split("十")
+            if num_dict.get(num_split[0]):
+                tenths = num_dict.get(num_split[0])
+            if num_dict.get(num_split[1]):
+                the_unit = num_dict.get(num_split[1])
+            result_num = tenths * 10 + the_unit
+    elif re.search(r"\d{1,2}", text):
+        _num = re.search(r"\d{1,2}", text).group()
+        result_num = int(_num)
+    return result_num
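
A quick sketch of what change2num returns (values follow num_dict above; the inputs are made up):

    assert change2num("三、评标办法") == 3      # "三" -> 3
    assert change2num("十五、附件") == 15       # "十五" -> 10 + 5
    assert change2num("二十一、其他") == 21     # "二十一" -> 2*10 + 1
    assert change2num("12.开标时间") == 12      # Arabic-digit fallback
    assert change2num("无编号") == -1           # no match -> -1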
+# Handle outline segmentation
+def get_preprocessed_outline(soup):
+    pattern_0 = re.compile(r"^(?:[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十])[、.\.]")
+    pattern_1 = re.compile(r"^[\((]?(?:[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十])[\))]")
+    pattern_2 = re.compile(r"^\d{1,2}[、.\.](?=[^\d]{1,2}|$)")
+    pattern_3 = re.compile(r"^[\((]?\d{1,2}[\))]")
+    pattern_list = [pattern_0, pattern_1, pattern_2, pattern_3]
+
+    body = soup.find("body")
+    body_child = body.find_all(recursive=False)
+    deal_part = body
+    # print(body_child[0]['id'])
+    if 'id' in body_child[0].attrs:
+        if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+            deal_part = body_child[0]
+    if len(deal_part.find_all(recursive=False)) > 2:
+        deal_part = deal_part.parent
+    skip_tag = ['turntable', 'tbody', 'th', 'tr', 'td', 'table', 'thead', 'tfoot']
+    for part in deal_part.find_all(recursive=False):
+        # Locate the main trunk of the parsed text
+        is_main_text = False
+        through_text_num = 0
+        while not is_main_text and part.find_all(recursive=False):
+            while len(part.find_all(recursive=False)) == 1 and part.get_text(strip=True) == \
+                    part.find_all(recursive=False)[0].get_text(strip=True):
+                part = part.find_all(recursive=False)[0]
+            max_len = len(part.get_text(strip=True))
+            is_main_text = True
+            for t_part in part.find_all(recursive=False):
+                if t_part.name not in skip_tag and t_part.get_text(strip=True) != "":
+                    through_text_num += 1
+                if t_part.get_text(strip=True) != "" and len(t_part.get_text(strip=True))/max_len >= 0.65:
+                    if t_part.name not in skip_tag:
+                        is_main_text = False
+                        part = t_part
+                        break
+                    else:
+                        while len(t_part.find_all(recursive=False)) == 1 and t_part.get_text(strip=True) == \
+                                t_part.find_all(recursive=False)[0].get_text(strip=True):
+                            t_part = t_part.find_all(recursive=False)[0]
+                        if through_text_num > 2:
+                            is_table = True
+                            for _t_part in t_part.find_all(recursive=False):
+                                if _t_part.name not in skip_tag:
+                                    is_table = False
+                                    break
+                            if not is_table:
+                                is_main_text = False
+                                part = t_part
+                                break
+                        else:
+                            is_main_text = False
+                            part = t_part
+                            break
+        is_find = False
+        for _pattern in pattern_list:
+            last_index = 0
+            handle_list = []
+            for _part in part.find_all(recursive=False):
+                if _part.name not in skip_tag and _part.get_text(strip=True) != "":
+                    # print('text:', _part.get_text(strip=True))
+                    re_match = re.search(_pattern, _part.get_text(strip=True))
+                    if re_match:
+                        outline_index = change2num(re_match.group())
+                        if last_index < outline_index:
+                            # _part.insert_before("##split##")
+                            handle_list.append(_part)
+                            last_index = outline_index
+            if len(handle_list) > 1:
+                is_find = True
+                for _part in handle_list:
+                    _part.insert_before("##split##")
+            if is_find:
+                break
+    # print(soup)
+    return soup
+
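
A minimal sketch of the effect on a toy document (hypothetical HTML; real inputs come from the crawled announcements):

    from bs4 import BeautifulSoup

    html = ("<body><div id='pcontent'>"
            "<p>一、项目概况</p><p>内容A</p>"
            "<p>二、评标办法</p><p>内容B</p></div></body>")
    soup = get_preprocessed_outline(BeautifulSoup(html, "lxml"))
    # Both numbered headings now carry a marker:
    # ##split##一、项目概况 ... ##split##二、评标办法
    print(soup.get_text())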
# Data cleaning
def segment(soup,final=True):
    # print("==")
@@ -1071,6 +1177,11 @@ def segment(soup,final=True):
            child.insert_after("。")
        if child.name in commaList:
            child.insert_after(",")
+        if child.name == 'div' and 'class' in child.attrs:
+            # Add the "attachment" marker for attachment blocks
+            if "richTextFetch" in child['class']:
+                child.insert_before("##attachment##")
+                # print(child.parent)
        # if child.name in subspaceList:
        #     child.insert_before("#subs"+str(child.name)+"#")
        #     child.insert_after("#sube"+str(child.name)+"#")
@@ -1164,7 +1275,8 @@ def segment(soup,final=True):
        _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
        LOOP_BEGIN += LOOP_LEN
    text = _text
-
+    # Turn the character before the attachment marker into a full stop,
+    # so body text and attachment content are not run together
+    text = re.sub("[^。](?=##attachment##)","。",text)
    return text
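
Note the design choice in the substitution above: the character immediately before ##attachment## is replaced by the full stop, not kept. A before/after sketch:

    import re

    text = "正文最后一行##attachment##附件内容"
    text = re.sub("[^。](?=##attachment##)", "。", text)
    print(text)  # 正文最后一。##attachment##附件内容 ("行" is consumed)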

'''
@@ -1467,12 +1579,12 @@ def get_preprocessed(articles,useselffool=False):
    '''
    cost_time = dict()
    list_articles = get_preprocessed_article(articles,cost_time)
-    list_sentences = get_preprocessed_sentences(list_articles,True,cost_time)
+    list_sentences,list_outlines = get_preprocessed_sentences(list_articles,True,cost_time)
    list_entitys = get_preprocessed_entitys(list_sentences,True,cost_time)

    calibrateEnterprise(list_articles,list_sentences,list_entitys)

-    return list_articles,list_sentences,list_entitys,cost_time
+    return list_articles,list_sentences,list_entitys,list_outlines,cost_time


def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
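
The call contract changes accordingly; a sketch of the new usage (Outline fields as used further down in this patch):

    list_articles, list_sentences, list_entitys, list_outlines, cost_time = \
        get_preprocessed(articles)
    for article_outlines in list_outlines:   # one list per article
        for outline in article_outlines:
            print(outline.outline_index, outline.outline_summary)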
@@ -1486,13 +1598,36 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
        doc_id = article[0]
        sourceContent = article[1]
        sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
+
+        sourceContent = sourceContent.replace('<br/>', '<br>')
+        sourceContent = re.sub(r"<br>(\s{0,}<br>)+","<br>",sourceContent)
+        for br_match in re.findall(r"[^>]+?<br>",sourceContent):
+            _new = re.sub("<br>","",br_match)
+            # Replace <br> tags with <p> tags
+            if not re.search(r"^\s+$",_new):
+                _new = '<p>'+_new + '</p>'
+            # print(br_match,_new)
+            sourceContent = sourceContent.replace(br_match,_new,1)
+
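
A self-contained sketch of the <br> -> <p> rewrite (same regexes, whitespace check omitted):

    import re

    s = "<div>第一行<br>第二行<br><br>第三行</div>"
    s = re.sub(r"<br>(\s{0,}<br>)+", "<br>", s)     # collapse repeated <br>
    for br_match in re.findall(r"[^>]+?<br>", s):   # text ending in <br>
        s = s.replace(br_match, "<p>" + br_match[:-4] + "</p>", 1)
    print(s)  # <div><p>第一行</p><p>第二行</p>第三行</div>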
        _send_doc_id = article[3]
        _title = article[4]
        page_time = article[5]
        # Table handling
        key_preprocess = "tableToText"
        start_time = time.time()
-        article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+        # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
+        article_processed = BeautifulSoup(sourceContent,"lxml")
+        # article_processed = preprocessed_html(article_processed,"")
+        for _soup in article_processed.descendants:
+            # Detect bare text nodes (text without a tag) and wrap them in <p> tags
+            if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
+                # print(_soup.parent.string,_soup.string.strip())
+                _soup.wrap(article_processed.new_tag("p"))
+        # print(article_processed)
+        article_processed = get_preprocessed_outline(article_processed)
+        article_processed = tableToText(article_processed)
+        # print(article_processed)
+        article_processed = segment(article_processed)
        article_processed = article_processed.replace('.','.')  # 2021/12/01 fix OCR misreading of decimal points in PDFs
        article_processed = article_processed.replace('报价限价', '招标限价')  # 2021/12/17 renamed because '报价限价' was being predicted as the winning bid amount
        article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 corrected to the winning bid price
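
Taken together, the unrolled pipeline above now runs in this order (a sketch, not verbatim code):

    soup = BeautifulSoup(sourceContent, "lxml")   # 1. parse
    # 2. wrap bare text nodes in <p> (loop above)
    soup = get_preprocessed_outline(soup)         # 3. insert ##split## markers
    soup = tableToText(soup)                      # 4. flatten tables to text
    text = segment(soup)                          # 5. clean and join into text

The outline markers have to be inserted before tableToText/segment flatten the tree, which is why the old one-liner was unrolled.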
@@ -1547,6 +1682,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
    :return: list_sentences
    '''
    list_sentences = []
+    list_outlines = []
    for article in list_articles:
        list_sentences_temp = []
        list_entitys_temp = []
@@ -1557,7 +1693,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article.content
-
+        attachment_begin_index = -1
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
@@ -1572,13 +1708,66 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
        for _iter in re.finditer(split_patten,article_processed):
            _sen = article_processed[_begin:_iter.span()[1]]
            if len(_sen)>0 and _sen not in sentences_set:
+                # Mark sentences that fall inside the attachment
+                if re.search("##attachment##",_sen):
+                    attachment_begin_index = len(sentences)
+                    # _sen = re.sub("##attachment##","",_sen)
                sentences.append(_sen)
                sentences_set.add(_sen)
            _begin = _iter.span()[1]
        _sen = article_processed[_begin:]
+        if re.search("##attachment##", _sen):
+            # _sen = re.sub("##attachment##", "", _sen)
+            attachment_begin_index = len(sentences)
        if len(_sen)>0 and _sen not in sentences_set:
            sentences.append(_sen)
            sentences_set.add(_sen)
+        # Parse outline segments from the ##split## markers
+        outline_list = []
+        if re.search("##split##",article.content):
+            temp_sentences = []
+            last_sentence_index = (-1,-1)
+            outline_index = 0
+            for sentence_index in range(len(sentences)):
+                sentence_text = sentences[sentence_index]
+                for _ in re.findall("##split##", sentence_text):
+                    _match = re.search("##split##", sentence_text)
+                    if last_sentence_index[0] > -1:
+                        sentence_begin_index,wordOffset_begin = last_sentence_index
+                        sentence_end_index = sentence_index
+                        wordOffset_end = _match.start()
+                        if sentence_begin_index<attachment_begin_index and sentence_end_index>=attachment_begin_index:
+                            outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,attachment_begin_index-1,wordOffset_begin,len(sentences[attachment_begin_index-1])))
+                        else:
+                            outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,sentence_end_index,wordOffset_begin,wordOffset_end))
+                        outline_index += 1
+                    sentence_text = re.sub("##split##", "", sentence_text, count=1)
+                    last_sentence_index = (sentence_index, _match.start())
+                temp_sentences.append(sentence_text)
+            if attachment_begin_index > -1 and last_sentence_index[0] < attachment_begin_index:
+                outline_list.append(Outline(doc_id,outline_index,'',last_sentence_index[0],attachment_begin_index-1,last_sentence_index[1],len(temp_sentences[attachment_begin_index-1])))
+            else:
+                outline_list.append(Outline(doc_id,outline_index,'',last_sentence_index[0],len(sentences)-1,last_sentence_index[1],len(temp_sentences[-1])))
+            sentences = temp_sentences
+            # Resolve outline_text for each Outline
+            for _outline in outline_list:
+                if _outline.sentence_begin_index == _outline.sentence_end_index:
+                    _text = sentences[_outline.sentence_begin_index][_outline.wordOffset_begin:_outline.wordOffset_end]
+                else:
+                    _text = ""
+                    for idx in range(_outline.sentence_begin_index, _outline.sentence_end_index+1):
+                        if idx == _outline.sentence_begin_index:
+                            _text += sentences[idx][_outline.wordOffset_begin:]
+                        elif idx == _outline.sentence_end_index:
+                            _text += sentences[idx][:_outline.wordOffset_end]
+                        else:
+                            _text += sentences[idx]
+                _outline.outline_text = _text
+                _outline_summary = re.split("[::,]", _text, 1)[0]
+                if len(_outline_summary) < 20:
+                    _outline.outline_summary = _outline_summary
+                # print(_outline.outline_index,_outline.outline_text)
+
        article.content = "".join(sentences)
        # sentences.append(article_processed[_begin:])
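
A self-contained sketch of the ##split## bookkeeping above, with plain tuples standing in for the Outline objects:

    import re

    sentences = ["头部##split##一、概况。", "详情。", "##split##二、要求。"]
    outlines, last, temp = [], (-1, -1), []
    for i, s in enumerate(sentences):
        for _ in re.findall("##split##", s):
            m = re.search("##split##", s)
            if last[0] > -1:
                # (begin_sentence, end_sentence, begin_offset, end_offset)
                outlines.append((last[0], i, last[1], m.start()))
            s = re.sub("##split##", "", s, count=1)
            last = (i, m.start())
        temp.append(s)
    outlines.append((last[0], len(sentences) - 1, last[1], len(temp[-1])))
    print(outlines)  # [(0, 2, 2, 0), (2, 2, 0, 5)]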
@@ -1603,9 +1792,10 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
            cost_time[key_nerToken] = 0
        cost_time[key_nerToken] += round(time.time()-start_time,2)
-
+        in_attachment = False
        for sentence_index in range(len(sentences)):
-
+            if sentence_index == attachment_begin_index:
+                in_attachment = True
            sentence_text = sentences[sentence_index]
            tokens = tokens_all[sentence_index]
@@ -1614,12 +1804,12 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
            ner_entitys = ""

-            list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys))
-
+            list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys,in_attachment=in_attachment))
        if len(list_sentences_temp)==0:
            list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=0,sentence_text="sentence_text",tokens=[],pos_tags=[],ner_tags=""))
        list_sentences.append(list_sentences_temp)
-    return list_sentences
+        list_outlines.append(outline_list)
+    return list_sentences,list_outlines
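
With in_attachment on Sentences, downstream consumers can split body text from attachment text, e.g. (hypothetical usage):

    body = [s for s in list_sentences_temp if not s.in_attachment]
    attachment = [s for s in list_sentences_temp if s.in_attachment]

Note the empty-article fallback Sentences above is still built without in_attachment, so the constructor presumably defaults it (to False).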

def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
    '''
@@ -1666,6 +1856,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
        sentence_text = list_sentence[sentence_index].sentence_text
        tokens = list_sentence[sentence_index].tokens
        doc_id = list_sentence[sentence_index].doc_id
+        in_attachment = list_sentence[sentence_index].in_attachment
        list_tokenbegin = []
        begin = 0
        for i in range(0,len(tokens)):
@@ -1739,7 +1930,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                # Strip punctuation marks
                entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
-                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]))
+                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
            # Mark the "publisher" and "publish time" entities at the end of the article
            if sentence_index==len(list_sentence)-1:
                if len(list_sentence_entitys[-2:])>2:
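
The same in_attachment=in_attachment argument is repeated at every Entity construction site below; a hypothetical helper (not part of this patch) would make that explicit:

    def make_entity(*args, **kwargs):
        # closes over in_attachment from the enclosing sentence loop
        kwargs.setdefault("in_attachment", in_attachment)
        return Entity(*args, **kwargs)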
@@ -1974,7 +2165,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                        _exists = True
                    if not _exists:
                        if float(entity_text)>1:
-                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
+                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
                            list_sentence_entitys[-1].notes = notes  # 2021/7/20 added amount notes
                            list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 added amount unit
                            # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
@@ -2026,7 +2217,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = person['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Funding source extraction, added 2020/12/30
            list_moneySource = extract_moneySource(sentence_text)
@@ -2050,7 +2241,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = moneySource['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Email extraction, added 2021/11/04
            list_email = extract_email(sentence_text)
@@ -2074,7 +2265,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = email['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Service period extraction, added 2020/12/30
            list_servicetime = extract_servicetime(sentence_text)
@@ -2098,7 +2289,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = servicetime['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Bidding method extraction, added 2020/12/30
            # list_bidway = extract_bidway(sentence_text, )
@@ -2140,7 +2331,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = ratio['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            list_sentence_entitys.sort(key=lambda x:x.begin_index)
            list_entitys_temp = list_entitys_temp+list_sentence_entitys