from collections import Counter
from contextlib import contextmanager

import pandas as pd
import pymysql

# NOTE(review): credentials are committed in source. Consider loading them from
# the environment or a secrets store and rotating this password.
_DB_SETTINGS = {
    "host": 'rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com',
    "port": 3306,
    "db": 'bxkc',
    "user": 'bxkc_read',
    "passwd": 'bxkc_20RE18AD',
}

# Tables and channels that both export functions sample from.
_TABLES = (
    "sys_document_20",
    "sys_document_19",
    "sys_document_18",
    "sys_document_17",
    "sys_document_16",
)
_CHANNELS = (52, 101)

# Selection limits applied by getDatasToLabel().
_MAX_DOCS_PER_SOURCE = 20
_MAX_HTML_LENGTH = 100000


@contextmanager
def _open_cursor():
    """Yield a cursor on a fresh connection; close both on exit, even on error."""
    connection = pymysql.Connect(**_DB_SETTINGS)
    try:
        cursor = connection.cursor()
        try:
            yield cursor
        finally:
            cursor.close()
    finally:
        connection.close()


def _select_documents(rows, source_counts, quota):
    """Pick up to *quota* ``(docid, html)`` pairs from one query result.

    Args:
        rows: iterable of ``(docid, dochtmlcon, web_source_no)`` tuples.
        source_counts: ``Counter`` of rows seen per ``web_source_no``. It is
            updated in place so the per-source cap spans all queries.
        quota: maximum number of documents to take from *rows*.

    Returns:
        A list of ``(docid, html)`` tuples in the order they were encountered.
    """
    picked = []
    for docid, html, source_no in rows:
        # Every row counts toward its source, including rows rejected below.
        source_counts[source_no] += 1
        if source_counts[source_no] > _MAX_DOCS_PER_SOURCE:
            continue
        # A NULL dochtmlcon arrives as None; skip it rather than crash on len().
        if html is None or len(html) > _MAX_HTML_LENGTH:
            continue
        picked.append((docid, html))
        if len(picked) >= quota:
            break
    return picked


def getTitles():
    """Write de-duplicated document titles to ``titles.txt``.

    Queries each table/channel pair for an equal share of 100000 rows, keeps
    the first row seen for every distinct title, and writes one
    ``"<docid> <doctitle>"`` line per kept row.
    """
    ALL_count = 100000
    every_count = ALL_count // (len(_TABLES) * len(_CHANNELS))

    seen_titles = set()
    list_id_title = []
    with _open_cursor() as cursor:
        for _table in _TABLES:
            for _channel in _CHANNELS:
                sql = "select docid,doctitle from %s where docchannel=%d limit %d " % (
                    _table, _channel, every_count)
                print(sql)
                cursor.execute(sql)
                for docid, doctitle in cursor.fetchall():
                    if doctitle not in seen_titles:
                        seen_titles.add(doctitle)
                        list_id_title.append((docid, doctitle))

    with open("titles.txt", "w", encoding="utf8") as f:
        f.writelines("%s %s\n" % (_id, _title) for _id, _title in list_id_title)


def getDatasToLabel():
    """Export a sample of document HTML to ``article_20000.csv``.

    For each table/channel pair, fetches ten times the per-pair quota and keeps
    rows until the quota is reached. A row is skipped when its
    ``web_source_no`` has already been seen more than 20 times, or when its
    HTML is missing or longer than 100000 characters. The result is written
    with columns ``document_id`` and ``document_text``.
    """
    ALL_count = 20000
    every_count = ALL_count // (len(_TABLES) * len(_CHANNELS))

    list_docid = []
    list_htmlcon = []
    source_counts = Counter()
    with _open_cursor() as cursor:
        for _table in _TABLES:
            for _channel in _CHANNELS:
                # Over-fetch so the quota can still be met after filtering.
                sql = "select docid,dochtmlcon,web_source_no from %s where docchannel=%d limit %d" % (
                    _table, _channel, every_count * 10)
                print(sql)
                cursor.execute(sql)
                for docid, html in _select_documents(
                        cursor.fetchall(), source_counts, every_count):
                    list_docid.append(docid)
                    list_htmlcon.append(html)

    print("len:", len(list_docid))
    df = pd.DataFrame({"document_id": list_docid, "document_text": list_htmlcon})
    df.to_csv("article_20000.csv", columns=["document_id", "document_text"],
              index=False, encoding="utf8")


if __name__ == "__main__":
    # getTitles() is left disabled; call it here to regenerate titles.txt.
    getDatasToLabel()