demo_1.py

'''
Credit scoring analysis on the Kaggle "Give Me Some Credit" dataset
'''
import pandas as pd

# Load the data
data = pd.read_csv(r"C:\Users\Suface\Desktop\Python_code\Give_me_some_credit\Data\cs-training.csv", engine="python")
# Describe the data
pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.max_rows', None)     # show all rows
print("\033[7;37;41m\t Detailed data description: \033[0m")
print(data.describe())
data = data.iloc[:, 1:]  # drop the first column (a useless row index)
print("\033[7;37;41m\t Brief data description: \033[0m")
data.info()  # inspect the dataset's attributes (info() prints directly and returns None)
# Use a random forest to impute missing values (for MonthlyIncome, which has a high missing rate)
from sklearn.ensemble import RandomForestRegressor

def set_missing(df):
    # Take the existing numeric features; MonthlyIncome is column 5, NumberOfDependents is column 10
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # Split into rows where the feature is known and rows where it is missing
    known = process_df[process_df.MonthlyIncome.notnull()].values
    unknown = process_df[process_df.MonthlyIncome.isnull()].values
    X = known[:, 1:]  # feature values
    y = known[:, 0]   # target values (MonthlyIncome)
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing monthly incomes with the fitted model
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    # Fill the original missing values with the predictions
    df.loc[df.MonthlyIncome.isnull(), 'MonthlyIncome'] = predicted
    return df
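# A simpler hedged alternative (illustrative, not used in this pipeline): plain
# median imputation for MonthlyIncome in one line.
# data['MonthlyIncome'] = data['MonthlyIncome'].fillna(data['MonthlyIncome'].median())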
# Fill the larger block of missing values with the random forest
data = set_missing(data)
# Drop the relatively few remaining missing values (NumberOfDependents),
# or fill them using the commented-out approach below
data = data.dropna()
# # Fill the missing values in NumberOfDependents instead:
# import seaborn as sns
# import matplotlib.pyplot as plt
#
# print("\033[7;37;41m\t Inspect NumberOfDependents: \033[0m")
# print(data.NumberOfDependents.value_counts())
# sns.countplot(x='NumberOfDependents', data=data)
# plt.show()
#
# Dependents = pd.Series([0, 1, 2, 3, 4])  # replace missing values with 0-4
# for i in data['NumberOfDependents'][data['NumberOfDependents'].isnull()].index:
#     data['NumberOfDependents'][i] = Dependents.sample(1)
data.loc[data['NumberOfDependents'] > 8, 'NumberOfDependents'] = 8  # cap household size at 8 (common sense)
data = data.drop_duplicates()  # drop duplicate rows
print("\033[7;37;41m\t Data after filling missing values: \033[0m")
data.info()
# Draw box plots to check three attributes for outliers:
# NumberOfTime30-59DaysPastDueNotWorse: times 30-59 days past due,
# NumberOfTimes90DaysLate: times 90 or more days late,
# NumberOfTime60-89DaysPastDueNotWorse: times 60-89 days past due
import matplotlib.pyplot as plt
train_box = data.iloc[:, [3, 7, 9]]
train_box.boxplot(figsize=(10, 4))
plt.show()
# Two outlier points are visible; drop them. Age should also be greater than 0 and less than 100.
data = data[data['age'] < 100]
data = data[data['age'] > 0]
data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90]
data['SeriousDlqin2yrs'] = 1 - data['SeriousDlqin2yrs']  # make good customers 1 and bad customers 0, for easier counting
# Exploratory analysis: look at the distributions
import seaborn as sns
age = data['age']
sns.histplot(age, kde=True)
plt.show()
mi = data[data['MonthlyIncome'] < 50000]['MonthlyIncome']
sns.histplot(mi, kde=True)
plt.show()  # both age and income look roughly normally distributed
# Split the data into training and test sets, 7:3
from sklearn.model_selection import train_test_split
Y = data['SeriousDlqin2yrs']
X = data.iloc[:, 1:]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
trainDf = pd.concat([Y_train, X_train], axis=1)
testDf = pd.concat([Y_test, X_test], axis=1)
clasTest = testDf.groupby('SeriousDlqin2yrs')['SeriousDlqin2yrs'].count()
# Variable binning
import numpy as np
from scipy import stats

# Optimal binning
def mono_bin(Y, X, n):
    r = 0  # initial Spearman correlation
    good = Y.sum()          # number of good customers
    bad = Y.count() - good  # number of bad customers
    # The core of the binning is letting the machine pick the optimal split points
    while np.abs(r) < 1:  # keep reducing n until the bucket means are perfectly monotonic
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        # Note pd.qcut here: Bucket splits X into n equal-frequency segments, with n driven by the Spearman correlation
        d2 = d1.groupby('Bucket', as_index=True)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)  # Spearman correlation as the stopping criterion
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns=['min'])
    d3['min'] = d2.min().X  # the bin minima are the split points
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()  # information value
    d4 = d3.sort_values(by='min').reset_index(drop=True)  # sort bins by their minimum value
    pd.set_option('display.max_columns', None)  # show all columns
    pd.set_option('display.max_rows', None)     # show all rows
    print("\033[7;37;41m\t Binning result: \033[0m")
    print("=" * 60)
    print(d4)
    woe = list(d4['woe'].round(3))
    cut = []                   # cut holds the bin edges
    cut.append(float('-inf'))  # prepend -inf
    for i in range(1, n + 1):  # n was decremented after the final split, so this yields the n interior edges of n+1 bins
        qua = X.quantile(i / (n + 1))  # each quantile is a bin edge
        cut.append(round(qua, 4))      # keep 4 decimal places
    cut.append(float('inf'))   # append +inf
    return d4, iv, cut, woe
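# Illustrative sketch (toy data, not part of the pipeline): one pass of the idea above.
# pd.qcut buckets a feature into equal-frequency bins, and stats.spearmanr then measures
# how monotonically the per-bucket means of X and Y are related; |r| == 1 ends the loop.
_rs = np.random.RandomState(0)
_x = pd.Series(_rs.rand(1000))
_y = pd.Series((_x + 0.5 * _rs.rand(1000) > 0.75).astype(int))  # label loosely increasing in x
_means = pd.DataFrame({'X': _x, 'Y': _y}).groupby(pd.qcut(_x, 5)).mean()
_r, _p = stats.spearmanr(_means.X, _means.Y)  # |_r| close to 1 means nearly monotonic buckets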
# For features the method above cannot split sensibly, use manual (unsupervised) binning
# x1: ratio of revolving credit in use to total credit limits
dfx1, ivx1, cutx1, woex1 = mono_bin(trainDf.SeriousDlqin2yrs, trainDf.RevolvingUtilizationOfUnsecuredLines, n=10)
dfx2, ivx2, cutx2, woex2 = mono_bin(trainDf.SeriousDlqin2yrs, trainDf.age, n=10)            # age
dfx4, ivx4, cutx4, woex4 = mono_bin(trainDf.SeriousDlqin2yrs, trainDf.DebtRatio, n=20)      # debt ratio
dfx5, ivx5, cutx5, woex5 = mono_bin(trainDf.SeriousDlqin2yrs, trainDf.MonthlyIncome, n=10)  # monthly income
pinf = float('inf')   # positive infinity
ninf = float('-inf')  # negative infinity
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, 7, 9, pinf]  # 7 and 9 added
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0, 1, 2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
def self_bin(Y, X, cat):  # manual binning with user-supplied cut points
    good = Y.sum()          # number of good customers
    bad = Y.count() - good  # number of bad customers
    d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, cat)})  # X: feature, Y: good/bad label, Bucket: assigned bin
    d2 = d1.groupby('Bucket', as_index=True)  # group by bin, with the bin as index
    d3 = pd.DataFrame(d2.X.min(), columns=['min'])  # create the frame; the column is filled properly on the next line
    d3['min'] = d2.min().X      # minimum of X (e.g. household size) within each bin
    d3['max'] = d2.max().X      # maximum of X within each bin
    d3['sum'] = d2.sum().Y      # number of good customers in each bin
    d3['total'] = d2.count().Y  # total number of customers in each bin
    d3['rate'] = d2.mean().Y    # good-customer rate
    # WOE stands for "Weight of Evidence", an encoding of the raw variable computed in preparation for the IV.
    # To WOE-encode a variable it must first be grouped (discretization and binning mean the same thing).
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good                # share of all good customers falling in each bin
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad  # share of all bad customers falling in each bin
    # IV stands for Information Value; loosely, it measures the variable's predictive power.
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    d4 = d3.sort_values(by='min')  # sort bins by their minimum value, ascending
    woe = list(d4['woe'].round(3))
    return d4, iv, woe
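# Hedged worked example of the WOE/IV formulas above on made-up counts (toy numbers,
# not from the data): 100 good and 100 bad customers overall; bin A holds 80 goods
# and 20 bads, bin B the remaining 20 goods and 80 bads.
_woe_a = np.log((80 / 100) / (20 / 100))           # bin odds 4:1 vs overall odds 1:1 -> ln(4) ≈ 1.386
_woe_b = np.log((20 / 100) / (80 / 100))           # -ln(4)
_iv = (0.8 - 0.2) * _woe_a + (0.2 - 0.8) * _woe_b  # ≈ 1.66, i.e. a very strong variable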
# Bin these features manually:
dfx3, ivx3, woex3 = self_bin(trainDf.SeriousDlqin2yrs, trainDf['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)
dfx6, ivx6, woex6 = self_bin(trainDf.SeriousDlqin2yrs, trainDf['NumberOfOpenCreditLinesAndLoans'], cutx6)
dfx7, ivx7, woex7 = self_bin(trainDf.SeriousDlqin2yrs, trainDf['NumberOfTimes90DaysLate'], cutx7)
dfx8, ivx8, woex8 = self_bin(trainDf.SeriousDlqin2yrs, trainDf['NumberRealEstateLoansOrLines'], cutx8)
dfx9, ivx9, woex9 = self_bin(trainDf.SeriousDlqin2yrs, trainDf['NumberOfTime60-89DaysPastDueNotWorse'], cutx9)
dfx10, ivx10, woex10 = self_bin(trainDf.SeriousDlqin2yrs, trainDf['NumberOfDependents'], cutx10)
# Correlation analysis
corr = trainDf.corr()  # correlation matrix of the variables
xticks = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']  # x-axis labels
yticks = list(corr.index)  # y-axis labels
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 12, 'weight': 'bold', 'color': 'black'})  # correlation heatmap
ax1.set_xticklabels(xticks, rotation=0, fontsize=14)
ax1.set_yticklabels(yticks, rotation=0, fontsize=14)
plt.savefig('correlation_heatmap.png', dpi=200)
plt.show()
# IV-based feature selection. Rule of thumb for judging predictive power by IV:
# below 0.02: unpredictive; 0.02 to 0.1: weak; 0.1 to 0.3: medium; 0.3 to 0.5: strong
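# Hedged helper encoding the rule of thumb above (illustrative only; the pipeline
# below just plots the IVs and the selection is done by eye):
def iv_strength(iv):
    # Map an Information Value to its qualitative predictive-power label.
    if iv < 0.02:
        return 'unpredictive'
    elif iv < 0.1:
        return 'weak'
    elif iv < 0.3:
        return 'medium'
    else:
        return 'strong'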
ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]  # IV of each variable
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']   # x-axis labels
fig1 = plt.figure(1, figsize=(8, 5))
ax1 = fig1.add_subplot(1, 1, 1)
x = np.arange(len(index)) + 1
ax1.bar(x, ivlist, width=.4)  # bar chart of the IVs
ax1.set_xticks(x)
ax1.set_xticklabels(index, rotation=0, fontsize=15)
ax1.set_ylabel('IV', fontsize=16)  # IV (Information Value)
# Add value labels on top of the bars
for a, b in zip(x, ivlist):
    plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=12)
plt.show()
# Before modelling, convert the selected variables to their WoE values for the credit score
# WoE substitution function
def trans_woe(var, var_name, woe, cut):
    woe_name = var_name + '_woe'
    for i in range(len(woe)):  # len(woe) is the number of WoE values, one per bin
        if i == 0:
            # first bin: assign woe[0] to everything up to the first interior cut point
            var.loc[(var[var_name] <= cut[i + 1]), woe_name] = woe[i]
        elif (i > 0) and (i <= len(woe) - 2):
            # interior bins: values between consecutive cut points
            var.loc[((var[var_name] > cut[i]) & (var[var_name] <= cut[i + 1])), woe_name] = woe[i]
        else:
            # last bin: everything above the last interior cut point (the final edge is +inf)
            var.loc[(var[var_name] > cut[len(woe) - 1]), woe_name] = woe[len(woe) - 1]
    return var
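# Hedged sanity check of trans_woe (toy cut points and WoE values, not from the model):
# with edges [-inf, 10, inf] and WoE values [0.5, -0.5], values <= 10 map to 0.5
# and values > 10 map to -0.5.
_toy = pd.DataFrame({'v': [3, 42]})
_toy = trans_woe(_toy, 'v', [0.5, -0.5], [float('-inf'), 10, float('inf')])
assert list(_toy['v_woe']) == [0.5, -0.5]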
x1_name = 'RevolvingUtilizationOfUnsecuredLines'
x2_name = 'age'
x3_name = 'NumberOfTime30-59DaysPastDueNotWorse'
x7_name = 'NumberOfTimes90DaysLate'
x9_name = 'NumberOfTime60-89DaysPastDueNotWorse'
trainDf = trans_woe(trainDf, x1_name, woex1, cutx1)
trainDf = trans_woe(trainDf, x2_name, woex2, cutx2)
trainDf = trans_woe(trainDf, x3_name, woex3, cutx3)
trainDf = trans_woe(trainDf, x7_name, woex7, cutx7)
trainDf = trans_woe(trainDf, x9_name, woex9, cutx9)
Y = trainDf['SeriousDlqin2yrs']  # dependent variable
# Independent variables: keep only the five WoE columns just appended, which drops the
# variables with little influence on the target (DebtRatio, MonthlyIncome,
# NumberOfOpenCreditLinesAndLoans, NumberRealEstateLoansOrLines, NumberOfDependents)
X = trainDf.iloc[:, -5:]
X.head()
# Logistic regression
import statsmodels.api as sm
X1 = sm.add_constant(X)
logit = sm.Logit(Y, X1)
result = logit.fit()
print("\033[7;37;41m\t Regression summary: \033[0m")
print(result.summary())
# Model evaluation: check the model's predictive power on the test data held out at the start,
# using the ROC curve and the AUC to assess the fit.
# In Python, sklearn.metrics makes it easy to compare classifiers and computes ROC and AUC automatically.
testDf = trans_woe(testDf, x1_name, woex1, cutx1)
testDf = trans_woe(testDf, x2_name, woex2, cutx2)
testDf = trans_woe(testDf, x3_name, woex3, cutx3)
testDf = trans_woe(testDf, x7_name, woex7, cutx7)
testDf = trans_woe(testDf, x9_name, woex9, cutx9)
# Build the test-set features and labels
test_X = testDf.iloc[:, -5:]  # test features
test_Y = testDf.iloc[:, 0]    # test labels
# Evaluation
from sklearn import metrics
X3 = sm.add_constant(test_X)
resu = result.predict(X3)  # predicted probabilities
fpr, tpr, threshold = metrics.roc_curve(test_Y, resu)  # evaluate the classifier
rocauc = metrics.auc(fpr, tpr)  # compute the AUC
print("\033[7;37;41m\t Evaluate the model on the held-out test set: \033[0m")
print("\033[7;37;41m\t Model AUC: \033[0m")
print('AUC = ', rocauc)
plt.figure(figsize=(8, 5))  # the figure size can only be set here
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % rocauc)
plt.legend(loc='lower right', fontsize=14)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel('TPR (true positive rate)', fontsize=16)
plt.xlabel('FPR (false positive rate)', fontsize=16)
plt.show()
# Set the scorecard parameters: base score, PDO (points to double the odds), and the good/bad odds.
# Here the base score is 600, PDO is 20 (every 20 points doubles the good/bad odds), and the odds O is 20.
p = 20 / np.log(2)                     # factor (scale)
q = 600 - 20 * np.log(20) / np.log(2)  # offset
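# Hedged worked check of the parameterisation (a score is offset + factor * ln(odds)):
# good/bad odds of 20:1 map exactly to the 600-point base, and doubling the odds
# adds PDO = 20 points.
assert abs((q + p * np.log(20)) - 600) < 1e-6
assert abs(p * np.log(2) - 20) < 1e-6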
x_coe = [2.6084, 0.6327, 0.5151, 0.5520, 0.5747, 0.4074]  # regression coefficients
baseScore = round(q + p * x_coe[0], 0)
# Total personal score = base score + the score of each part
def get_score(coe, woe, factor):
    scores = []
    for w in woe:
        score = round(coe * w * factor, 0)
        scores.append(score)
    return scores
# Score contribution of each feature
x1_score = get_score(x_coe[1], woex1, p)
x2_score = get_score(x_coe[2], woex2, p)
x3_score = get_score(x_coe[3], woex3, p)
x7_score = get_score(x_coe[4], woex7, p)
x9_score = get_score(x_coe[5], woex9, p)
def compute_score(series, cut, score):
    # Map each value in `series` to the score of the bin it falls into.
    result_scores = []
    i = 0
    while i < len(series):
        value = series[i]
        j = len(cut) - 2
        m = len(cut) - 2
        while j >= 0:
            if value >= cut[j]:
                j = -1  # found the bin; leave the inner loop
            else:
                j -= 1
                m -= 1
        result_scores.append(score[m])
        i += 1
    return result_scores
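# Hedged sanity check of compute_score (toy cut points and scores, not from the model):
# with a single interior edge at 1, values below it get the first bin's score and
# values at or above it get the second's.
assert compute_score(pd.Series([0.5, 5.0]), [float('-inf'), 1, float('inf')], [10, 20]) == [10, 20]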
test1 = pd.read_csv(r"C:\Users\Suface\Desktop\Python_code\Give_me_some_credit\Data\cs-test.csv")
test1['BaseScore'] = np.zeros(len(test1)) + baseScore
test1['x1'] = compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1_score)
test1['x2'] = compute_score(test1['age'], cutx2, x2_score)
test1['x3'] = compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3_score)
test1['x7'] = compute_score(test1['NumberOfTimes90DaysLate'], cutx7, x7_score)
test1['x9'] = compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9_score)
test1['Score'] = test1['x1'] + test1['x2'] + test1['x3'] + test1['x7'] + test1['x9'] + baseScore
scoretable1 = test1.iloc[:, [1, -7, -6, -5, -4, -3, -2, -1]]  # keep only the label and the score columns
scoretable1.head()
scoretable1.to_csv('ScoreData_simplified.csv')  # column names abbreviated as xn
print("\033[7;37;41m\t Score the test set with the scorecard (first 10 rows shown): \033[0m")
print(scoretable1[:10])
colNameDict = {'x1': 'RevolvingUtilizationOfUnsecuredLines', 'x2': 'age',
               'x3': 'NumberOfTime30-59DaysPastDueNotWorse',
               'x7': 'NumberOfTimes90DaysLate', 'x9': 'NumberOfTime60-89DaysPastDueNotWorse'}
scoretable2 = scoretable1.rename(columns=colNameDict, inplace=False)
scoretable2.to_csv('ScoreData.csv')
print('Done!')