from BiddingKG.dl.interface.Preprocessing import get_preprocessed_article,get_preprocessed_sentences import pandas as pd import re from BiddingKG.dl.common.nerUtils import getTokens def preprocess(text): text = re.sub("\n+",',',text) text = re.sub("\s+|?+",'',text) text = re.sub("[\.·_]{2,}", ',', text) text = re.sub("_", '', text) text = text[:1800] sentences = text.split("。") sentences = [s for s in sentences if s] if not sentences: return [] tokens = getTokens(sentences) new_tokens = [] for t in tokens: new_tokens.extend(t) return new_tokens def data_process1(): data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data01.csv",index_col=0) text = [] idx = 1 for html_text in data['attachmenthtml']: res = get_preprocessed_article([[0,html_text,"","",""]]) text.append(res[0].content) print(idx) idx += 1 data['text'] = text data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv") def data_process2(): data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv",index_col=0) tokens = [] idx = 1 for html_text in data['text']: _tokens = [] list_articles = get_preprocessed_article([[0,html_text,"","",""]]) list_sentences = get_preprocessed_sentences(list_articles, True) for sent in list_sentences[0]: _tokens.extend(sent.tokens) tokens.append(_tokens) # print(_tokens) print(idx) idx += 1 data['tokens'] = tokens data.to_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_process.csv") classes_dict = { '其他':0, '招标文件':1, '限价(控制价)':2, '工程量清单':3, '采购清单':4, '评标办法':5 } def data_process3(): data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data01.csv",index_col=0) re_str = [] re_label = [] new_text = [] idx = 0 for text in data['attachmentcon']: print(idx) idx+=1 text = str(text) # text = re.sub("\n+", ',', text) # text = re.sub("\s+", '', text) # text = re.sub("?+", '', text) text_tokens = preprocess(text) text_tokens = text_tokens[:512] text = "".join(text_tokens) text = re.sub("[\.·…]{2,}", ',', text) # text = text[:800] new_text.append(text) if re.search("中标人?公[示告]",text) or re.search('候选人公[示告]',text) or re.search('成交公[示告]',text) or re.search('中标结果公示',text): re_str.append("中标候选人公示") re_label.append(classes_dict['其他']) elif re.search("招标文件",text): re_str.append("招标文件") re_label.append(classes_dict['招标文件']) elif re.search("限价",text) or re.search('控制价',text): re_str.append("限价(控制价)") re_label.append(classes_dict['限价(控制价)']) elif re.search('工程量清单',text): re_str.append("工程量清单") re_label.append(classes_dict['工程量清单']) elif re.search("采购.{0,2}清单",text): re_str.append("采购清单") re_label.append(classes_dict['采购清单']) elif re.search('评标办法',text): re_str.append("评标办法") re_label.append(classes_dict['评标办法']) else: re_str.append("") re_label.append('') data['re_str'] = re_str data['re_label'] = re_label data['new_label'] = re_label data['attachmentcon'] = new_text # data =data.drop(columns=['attachmenthtml']) data['attachmenthtml'] = [re.sub('\n{2,}','',i.replace("