# getDataFromAliyun.py — pull document titles / HTML bodies from the Aliyun RDS MySQL instance.
  1. import pymysql
  2. import pandas as pd
  3. def getTitles():
  4. connect = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD')
  5. cursor = connect.cursor()
  6. ALL_count = 100000
  7. tables = ["sys_document_20","sys_document_19","sys_document_18","sys_document_17","sys_document_16"]
  8. channels = [52,101]
  9. every_count = ALL_count//(len(tables)*len(channels))
  10. set_doctitle = set()
  11. list_id_title = []
  12. for _table in tables:
  13. for _channel in channels:
  14. sql = "select docid,doctitle from %s where docchannel=%d limit %d "%(_table,_channel,every_count)
  15. print(sql)
  16. cursor.execute(sql)
  17. for row in cursor.fetchall():
  18. docid,doctitle = row
  19. if doctitle not in set_doctitle:
  20. set_doctitle.add(doctitle)
  21. list_id_title.append([docid,doctitle])
  22. with open("titles.txt","w",encoding="utf8") as f:
  23. for _id,_title in list_id_title:
  24. f.write("%s %s"%(str(_id),str(_title)))
  25. f.write("\n")
  26. def getDatasToLabel():
  27. connect = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD')
  28. cursor = connect.cursor()
  29. ALL_count = 20000
  30. tables = ["sys_document_20","sys_document_19","sys_document_18","sys_document_17","sys_document_16"]
  31. channels = [52,101]
  32. every_count = ALL_count//len(tables)//len(channels)
  33. list_docid = []
  34. list_htmlcon = []
  35. dict_web_source_no = {}
  36. for _table in tables:
  37. for _channel in channels:
  38. sql = "select docid,dochtmlcon,web_source_no from %s where docchannel=%d limit %d"%(_table,_channel,every_count*10)
  39. print(sql)
  40. cursor.execute(sql)
  41. _count = 0
  42. for row in cursor.fetchall():
  43. if row[2] not in dict_web_source_no:
  44. dict_web_source_no[row[2]] = 0
  45. dict_web_source_no[row[2]] += 1
  46. if dict_web_source_no[row[2]]>20:
  47. continue
  48. else:
  49. if len(row[1])>100000:
  50. continue
  51. list_docid.append(row[0])
  52. list_htmlcon.append(row[1])
  53. _count += 1
  54. if _count>=every_count:
  55. break
  56. print("len:",len(list_docid))
  57. df = pd.DataFrame({"document_id":list_docid,"document_text":list_htmlcon})
  58. df.to_csv("article_20000.csv",columns=["document_id","document_text"],index=False,encoding="utf8")
  59. if __name__=="__main__":
  60. # getTitles()
  61. getDatasToLabel()