123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- from BiddingKG.dl.interface.Preprocessing import get_preprocessed_article,get_preprocessed_sentences
- import pandas as pd
- import re
- from BiddingKG.dl.common.nerUtils import getTokens
def preprocess(text):
    """Clean raw attachment text and return it as one flat list of tokens.

    Cleaning steps, in order: newline runs become a comma; whitespace and
    question marks are removed; runs of two or more dots, middle dots or
    underscores become a comma; remaining underscores are dropped; the
    result is truncated to 1800 characters.  The cleaned text is split on
    "。", empty pieces are discarded, and the sentences are passed to
    ``getTokens``.

    Args:
        text: Raw text to clean (must be a ``str``).

    Returns:
        A flat list with the tokens of every sentence in order, or ``[]``
        when no non-empty sentence remains after cleaning.
    """
    text = re.sub(r"\n+", ',', text)
    # The previous pattern "\s+|?+" was not a valid regex (a bare "?" has
    # nothing to repeat), so every call raised re.error.  Escaping it strips
    # literal question marks.
    # NOTE(review): the "?" may stand in for a mis-encoded character in the
    # original data - confirm which character was intended.
    text = re.sub(r"\s+|\?+", '', text)
    text = re.sub(r"[\.·_]{2,}", ',', text)
    text = re.sub("_", '', text)
    text = text[:1800]
    sentences = [s for s in text.split("。") if s]
    if not sentences:
        return []
    # presumably getTokens returns one token sequence per sentence - verify
    tokens = getTokens(sentences)
    return [tok for sent_tokens in tokens for tok in sent_tokens]
def data_process1():
    """Add a ``text`` column holding the article content of each attachment.

    Reads attachment_data01.csv, runs ``get_preprocessed_article`` on every
    value of the ``attachmenthtml`` column, keeps the ``content`` attribute
    of the first returned article, and writes the frame to
    attachment_data_process.csv.  A 1-based progress counter is printed
    after each row.
    """
    frame = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data01.csv", index_col=0)
    contents = []
    for count, html in enumerate(frame['attachmenthtml'], start=1):
        articles = get_preprocessed_article([[0, html, "", "", ""]])
        contents.append(articles[0].content)
        print(count)
    frame['text'] = contents
    frame.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv")
def data_process2():
    """Add a ``tokens`` column holding the flattened tokens of each row's text.

    Reads attachment_data_process.csv, and for every value of the ``text``
    column runs ``get_preprocessed_article`` followed by
    ``get_preprocessed_sentences``; the ``tokens`` of every sentence of the
    first article are concatenated into one list.  The frame is written back
    to the same CSV file.  A 1-based progress counter is printed after each
    row.
    """
    csv_path = "C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv"
    frame = pd.read_csv(csv_path, index_col=0)
    all_tokens = []
    for count, content in enumerate(frame['text'], start=1):
        articles = get_preprocessed_article([[0, content, "", "", ""]])
        sentences = get_preprocessed_sentences(articles, True)
        row_tokens = [tok for sent in sentences[0] for tok in sent.tokens]
        all_tokens.append(row_tokens)
        print(count)
    frame['tokens'] = all_tokens
    frame.to_csv(csv_path)
# Maps each attachment class name to its integer label.
classes_dict = {
    "其他": 0,
    "招标文件": 1,
    "限价(控制价)": 2,
    "工程量清单": 3,
    "采购清单": 4,
    "评标办法": 5,
}
# Ordered (pattern, matched rule name, classes_dict key) rules; the first
# pattern found in the text wins.  Class keys are resolved at call time.
_RELABEL_RULES = (
    (re.compile("中标人?公[示告]|候选人公[示告]|成交公[示告]|中标结果公示"), "中标候选人公示", '其他'),
    (re.compile("招标文件"), "招标文件", '招标文件'),
    (re.compile("限价|控制价"), "限价(控制价)", '限价(控制价)'),
    (re.compile('工程量清单'), "工程量清单", '工程量清单'),
    (re.compile("采购.{0,2}清单"), "采购清单", '采购清单'),
    (re.compile('评标办法'), "评标办法", '评标办法'),
)


def _relabel(text):
    """Return ``(rule_name, label)`` for the first rule matching *text*, else ``('', '')``."""
    for pattern, rule_name, class_name in _RELABEL_RULES:
        if pattern.search(text):
            return rule_name, classes_dict[class_name]
    return '', ''


def _clean_html(html):
    """Drop ``<div> </div>`` blocks and runs of 2+ newlines, then keep the first 4000 chars."""
    if not isinstance(html, str):
        # Non-string cells (pandas yields NaN for empty CSV cells) are left
        # untouched rather than raising AttributeError.
        return html
    return re.sub('\n{2,}', '', html.replace("<div> </div>", ''))[:4000]


def data_process3():
    """Relabel attachments with keyword rules and split them into two CSV files.

    Reads attachment_data01.csv.  Each ``attachmentcon`` value is converted
    to ``str``, tokenized with ``preprocess``, cut to the first 512 tokens,
    re-joined, and has runs of two or more dots / middle dots / ellipsis
    characters replaced by a comma.  The cleaned text is matched against the
    ordered keyword rules; the matched rule name goes to ``re_str`` and the
    label to both ``re_label`` and ``new_label`` (empty strings when nothing
    matches).  ``attachmentcon`` is overwritten with the cleaned text and
    ``attachmenthtml`` with its cleaned, truncated form.

    Rows with a match are written to attachment_data_relabel01.csv and rows
    without one to attachment_data_nolabel01.csv.  A 0-based progress counter
    is printed for each row.
    """
    data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data01.csv", index_col=0)
    re_str = []
    re_label = []
    new_text = []
    for idx, raw in enumerate(data['attachmentcon']):
        print(idx)
        text = "".join(preprocess(str(raw))[:512])
        text = re.sub(r"[\.·…]{2,}", ',', text)
        new_text.append(text)
        rule_name, label = _relabel(text)
        re_str.append(rule_name)
        re_label.append(label)
    data['re_str'] = re_str
    data['re_label'] = re_label
    data['new_label'] = re_label
    data['attachmentcon'] = new_text
    # The previous code called i.replace("<div> </div>", '', i); the third
    # argument of str.replace is an integer count, so passing the string
    # itself raised TypeError on the first row.
    data['attachmenthtml'] = [_clean_html(i) for i in data['attachmenthtml']]
    label_data = data[data['re_str'] != '']
    label_data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_relabel01.csv")
    no_label_data = data[data['re_str'] == '']
    no_label_data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_nolabel01.csv")
if __name__ == "__main__":
    # Manual entry point: nothing runs by default.  Uncomment the processing
    # step to execute.
    # NOTE(review): data_process4 is not defined in the visible part of this
    # file - confirm it exists before enabling it.
    # data_process1()
    # data_process2()
    # data_process3()
    # data_process4()
    pass
|