1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- import pymysql
- import pandas as pd
def getTitles():
    """Export de-duplicated (docid, doctitle) pairs to ``titles.txt``.

    Runs one query per (table, channel) combination over a fixed list of
    ``sys_document_*`` tables and channel ids, fetching at most
    ``every_count`` rows per query.  Only the first row seen for each
    distinct ``doctitle`` is kept.  The kept rows are written to
    ``titles.txt`` in the current directory as ``"<docid> <doctitle>"``,
    one per line, UTF-8 encoded.

    Returns:
        None.  The output is the file written to disk.
    """
    # NOTE(review): credentials are hard-coded in source; they should be
    # loaded from configuration or the environment instead.
    connect = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD')
    ALL_count = 100000
    tables = ["sys_document_20", "sys_document_19", "sys_document_18", "sys_document_17", "sys_document_16"]
    channels = [52, 101]
    # Split the overall budget evenly across every (table, channel) query.
    every_count = ALL_count // (len(tables) * len(channels))
    set_doctitle = set()
    list_id_title = []
    try:
        with connect.cursor() as cursor:
            for _table in tables:
                for _channel in channels:
                    # Table names cannot be bound as query parameters; all
                    # interpolated values are the constants defined above.
                    sql = "select docid,doctitle from %s where docchannel=%d limit %d " % (_table, _channel, every_count)
                    print(sql)
                    cursor.execute(sql)
                    for docid, doctitle in cursor.fetchall():
                        # Keep only the first document seen for each title.
                        if doctitle not in set_doctitle:
                            set_doctitle.add(doctitle)
                            list_id_title.append([docid, doctitle])
    finally:
        # Always release the connection, even if a query fails.
        connect.close()
    with open("titles.txt", "w", encoding="utf8") as f:
        f.writelines("%s %s\n" % (str(_id), str(_title)) for _id, _title in list_id_title)
def getDatasToLabel():
    """Sample document HTML for labelling and save it to ``article_20000.csv``.

    Runs one query per (table, channel) combination over a fixed list of
    ``sys_document_*`` tables and channel ids.  Each query fetches up to
    ``every_count * 10`` candidate rows, from which at most ``every_count``
    are accepted.  A row is rejected when:

    * its ``web_source_no`` has already been seen more than 20 times
      (counted across all queries, including rows rejected for other
      reasons), or
    * its ``dochtmlcon`` is NULL or longer than 100000 characters.

    Accepted rows are written to ``article_20000.csv`` with the columns
    ``document_id`` and ``document_text``.

    Returns:
        None.  The output is the CSV file written to disk.
    """
    # NOTE(review): credentials are hard-coded in source; they should be
    # loaded from configuration or the environment instead.
    connect = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD')
    ALL_count = 20000
    tables = ["sys_document_20", "sys_document_19", "sys_document_18", "sys_document_17", "sys_document_16"]
    channels = [52, 101]
    # Per-query quota of accepted rows.
    every_count = ALL_count // len(tables) // len(channels)
    list_docid = []
    list_htmlcon = []
    dict_web_source_no = {}
    try:
        with connect.cursor() as cursor:
            for _table in tables:
                for _channel in channels:
                    # Over-fetch by 10x so the quota can still be met after
                    # filtering.  Table names cannot be bound as parameters;
                    # all interpolated values are the constants above.
                    sql = "select docid,dochtmlcon,web_source_no from %s where docchannel=%d limit %d" % (_table, _channel, every_count * 10)
                    print(sql)
                    cursor.execute(sql)
                    _count = 0
                    for docid, htmlcon, web_source_no in cursor.fetchall():
                        seen = dict_web_source_no.get(web_source_no, 0) + 1
                        dict_web_source_no[web_source_no] = seen
                        # Limit how many rows any single source contributes.
                        if seen > 20:
                            continue
                        # Skip rows with no content (NULL would make len()
                        # raise) and rows with oversized content.
                        if htmlcon is None or len(htmlcon) > 100000:
                            continue
                        list_docid.append(docid)
                        list_htmlcon.append(htmlcon)
                        _count += 1
                        if _count >= every_count:
                            break
    finally:
        # Always release the connection, even if a query fails.
        connect.close()
    print("len:", len(list_docid))
    df = pd.DataFrame({"document_id": list_docid, "document_text": list_htmlcon})
    df.to_csv("article_20000.csv", columns=["document_id", "document_text"], index=False, encoding="utf8")
if __name__ == "__main__":
    # Script entry point: build the labelling CSV.  To export the title
    # list instead, call getTitles() here.
    getDatasToLabel()
|