import sys
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['KaiTi']  # render the Chinese plot labels
mpl.rcParams['font.serif'] = ['KaiTi']
import warnings
warnings.filterwarnings("ignore")
from scipy import stats
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb


def get_data():
    data = pd.read_csv("../Data/cs-training.csv", index_col=0)
    # print(data.info())
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    # print("\033[7;37;41m\t Detailed description of the data: \033[0m")
    # print(data.describe())
    print("get data!")
    return data


def data_process(data):
    def set_missing(df):
        # Gather the numeric features; MonthlyIncome sits in column 5 and
        # NumberOfDependents in column 10, so put MonthlyIncome first.
        process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
        # Split into rows where MonthlyIncome is known and rows where it is missing
        known = process_df[process_df.MonthlyIncome.notnull()].values
        unknown = process_df[process_df.MonthlyIncome.isnull()].values
        X = known[:, 1:]  # feature matrix
        y = known[:, 0]   # target: MonthlyIncome
        rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3)
        rfr.fit(X, y)
        print(rfr.feature_importances_)
        # Predict the missing monthly incomes with the fitted model
        predicted = rfr.predict(unknown[:, 1:]).round(0)
        # Fill the original missing entries with the predictions
        df.loc[df.MonthlyIncome.isnull(), 'MonthlyIncome'] = predicted
        return df

    # Use a random forest to impute the feature with many missing values
    data = set_missing(data)
    data = data.dropna()
    data = data.drop_duplicates()
    states = {'SeriousDlqin2yrs': '好坏客户',
              'RevolvingUtilizationOfUnsecuredLines': '可用额度比值',  # revolving utilization of unsecured lines
              'age': '年龄',
              'NumberOfTime30-59DaysPastDueNotWorse': '逾期30-59天笔数',
              'DebtRatio': '负债率',
              'MonthlyIncome': '月收入',
              'NumberOfOpenCreditLinesAndLoans': '信贷数量',
              'NumberOfTimes90DaysLate': '逾期90天笔数',
              'NumberRealEstateLoansOrLines': '固定资产贷款量',
              'NumberOfTime60-89DaysPastDueNotWorse': '逾期60-89天笔数',
              'NumberOfDependents': '家属数量'}  # rename map: original -> Chinese column names
    data.rename(columns=states, inplace=True)
    print(data[data['年龄'] > 90].count())
    data = data[data['年龄'] > 0]
    data = data[data['年龄'] < 100]
    data = data[data['逾期30-59天笔数'] < 85]
    data = data[data['家属数量'] < 30]
    data = data[data['负债率'] < 100]
    print(data.info())
    print("data process!")
    return data


def data_analysis(data):
    # train_box = data.iloc[:, [3, 7, 9]]
    # train_box.boxplot(figsize=(10, 4))
    # plt.show()
    fig = plt.figure(figsize=(15, 10))
    a = fig.add_subplot(3, 2, 1)
    b = fig.add_subplot(3, 2, 2)
    c = fig.add_subplot(3, 2, 3)
    d = fig.add_subplot(3, 2, 4)
    e = fig.add_subplot(3, 2, 5)
    f = fig.add_subplot(3, 2, 6)
    a.boxplot(data['可用额度比值'], labels=['可用额度比值'])
    b.boxplot([data['年龄'], data['好坏客户']], labels=['年龄', '好坏客户'])
    c.boxplot([data['逾期30-59天笔数'], data['逾期60-89天笔数'], data['逾期90天笔数']],
              labels=['逾期30-59天笔数', '逾期60-89天笔数', '逾期90天笔数'])
    d.boxplot([data['信贷数量'], data['固定资产贷款量'], data['家属数量']],
              labels=['信贷数量', '固定资产贷款量', '家属数量'])
    e.boxplot(data['月收入'], labels=['月收入'])
    f.boxplot(data['负债率'], labels=['负债率'])
    data.hist(figsize=(20, 15))
    for k in [1, 2, 4, 5]:  # columns to clean with the IQR rule
        q1 = data.iloc[:, k].quantile(0.25)  # lower quartile
        q3 = data.iloc[:, k].quantile(0.75)  # upper quartile
        iqr = q3 - q1
        print(q3)
        low = q1 - 1.5 * iqr
        up = q3 + 1.5 * iqr
        if k == 1:
            data1 = data
        data1 = data1[(data1.iloc[:, k] > low) & (data1.iloc[:, k] < up)]  # keep values inside the whiskers
    data = data1
    data.info()
    data.iloc[:, [1, 2, 4, 5]].boxplot(figsize=(15, 10))
    data = data[data['逾期30-59天笔数'] < 80]
    data = data[data['逾期60-89天笔数'] < 80]
    data = data[data['逾期90天笔数'] < 80]
    data = data[data['固定资产贷款量'] < 50]
    data = data[data['家属数量'] < 15]
    print(data['好坏客户'].sum())
    return data
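
# A minimal, self-contained sketch of the 1.5 * IQR whisker rule used in
# data_analysis above. The helper name `iqr_filter` and the toy Series are
# illustrative assumptions, not part of the original pipeline.
def iqr_filter(s):
    """Keep only the values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return s[(s > q1 - 1.5 * iqr) & (s < q3 + 1.5 * iqr)]

# Example: 200 lies far outside the whiskers and is dropped.
# iqr_filter(pd.Series([1, 2, 3, 4, 5, 200]))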

def binning(data):
    """Optimal binning driven by the Spearman rank correlation.

    The Spearman coefficient ranks both variables before correlating them and
    lies in [-1, 1]; it equals +1 or -1 exactly when the two variables are
    perfectly monotonically related, which is the stopping criterion below.
    """
    def op(y, x, n=20):
        # x: variable to bin; y: the target (好坏客户); n: number of buckets,
        # starting at 20 and decreased until the bucket means are monotonic
        r = 0
        bad = y.sum()           # y == 1 marks a bad customer, so the sum is the bad count
        good = y.count() - bad  # total count minus bad gives the good count
        while np.abs(r) < 1:    # keep shrinking n until |r| reaches 1
            d1 = pd.DataFrame({"x": x, "y": y, "bucket": pd.qcut(x, n)})
            # group d1 by bucket; as_index=True keeps the bucket label as the
            # group index (reset_index() would turn it back into a column)
            d2 = d1.groupby('bucket', as_index=True)
            # Spearman rank correlation (and p-value) between bucket means of x and y
            r, p = stats.spearmanr(d2.mean().x, d2.mean().y)
            n = n - 1  # reduce the number of buckets and retry
        d3 = pd.DataFrame()  # per-bucket summary table
        d3['min'] = d2.min().x
        d3['max'] = d2.max().x
        d3['sum'] = d2.sum().y      # bad customers in the bucket
        d3['total'] = d2.count().y  # customers in the bucket
        d3['rate'] = d2.mean().y    # share of bad customers in the bucket
        d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (bad / good))  # WOE per bucket
        # d3['iv'] =
        d4 = d3.sort_values(by='min').reset_index(drop=True)
        print("=" * 60)
        print(d4)
        return d4

    # Variables that support optimal binning:
    # x2 = op(data['好坏客户'], data['年龄'])
    # x4 = op(data['好坏客户'], data['负债率'])
    # x5 = op(data['好坏客户'], data['月收入'])

    # For variables that cannot use optimal binning, fall back to equal-depth binning
    def funqcut(y, x, n):
        cut1 = pd.qcut(x.rank(method='first'), n)  # equal-depth buckets; rank first to break ties
        data = pd.DataFrame({"x": x, "y": y, "cut1": cut1})
        cutbad = data.groupby(cut1).y.sum()              # bad customers per bucket
        cutgood = data.groupby(cut1).y.count() - cutbad  # good customers per bucket
        bad = data.y.sum()           # total bad customers
        good = data.y.count() - bad  # total good customers
        woe = np.log((cutbad / bad) / (cutgood / good))  # WOE per bucket
        iv = (cutbad / bad - cutgood / good) * woe       # IV contribution per bucket
        cut = pd.DataFrame({"坏客户数": cutbad, "好客户数": cutgood, "woe": woe, "iv": iv})
        print(cut)
        return cut  # per-bucket summary table

    # funqcut(train['好坏客户'], train['年龄'], 6).reset_index()
    x1 = funqcut(data['好坏客户'], data['可用额度比值'], 5).reset_index()
    x2 = funqcut(data['好坏客户'], data['年龄'], 8).reset_index()
    x4 = funqcut(data['好坏客户'], data['负债率'], 4).reset_index()
    x5 = funqcut(data['好坏客户'], data['月收入'], 5).reset_index()
    x6 = funqcut(data['好坏客户'], data['信贷数量'], 6).reset_index()
    x3 = funqcut(data['好坏客户'], data['逾期30-59天笔数'], 5).reset_index()
    x7 = funqcut(data['好坏客户'], data['逾期90天笔数'], 5).reset_index()
    x8 = funqcut(data['好坏客户'], data['固定资产贷款量'], 5).reset_index()
    x9 = funqcut(data['好坏客户'], data['逾期60-89天笔数'], 5).reset_index()
    x10 = funqcut(data['好坏客户'], data['家属数量'], 5).reset_index()

    ivx1 = x1.iv.sum()
    ivx2 = x2.iv.sum()
    ivx3 = x3.iv.sum()
    ivx4 = x4.iv.sum()
    ivx5 = x5.iv.sum()
    ivx6 = x6.iv.sum()
    ivx7 = x7.iv.sum()
    ivx8 = x8.iv.sum()
    ivx9 = x9.iv.sum()
    ivx10 = x10.iv.sum()
    IV = pd.DataFrame({"可用额度比值": ivx1, "年龄": ivx2, "逾期30-59天笔数": ivx3,
                       "负债率": ivx4, "月收入": ivx5, "信贷数量": ivx6,
                       "逾期90天笔数": ivx7, "固定资产贷款量": ivx8,
                       "逾期60-89天笔数": ivx9, "家属数量": ivx10}, index=[0])
    ivplot = IV.plot.bar(figsize=(15, 10))
    ivplot.set_title('特征变量的IV值分布')  # IV value of each feature

    # Bucket each variable with the equal-depth settings above, saving the
    # assignments as cut1, cut2, cut3, ... to match the summary tables
    # (x2/x4/x5 could instead use the optimal-binning results).
    def cutdata(x, n):
        # equal-depth buckets; labels=False returns integer codes 0, 1, 2, ...
        a = pd.qcut(x.rank(method='first'), n, labels=False)
        return a

    # Apply the function to get each continuous variable's bucket codes
    cut1 = cutdata(data['可用额度比值'], 5)
    cut2 = cutdata(data['年龄'], 8)
    cut3 = cutdata(data['逾期30-59天笔数'], 5)
    cut4 = cutdata(data['负债率'], 4)
    cut5 = cutdata(data['月收入'], 5)
    cut6 = cutdata(data['信贷数量'], 6)
    cut7 = cutdata(data['逾期90天笔数'], 5)
    cut8 = cutdata(data['固定资产贷款量'], 5)
    cut9 = cutdata(data['逾期60-89天笔数'], 5)
    cut10 = cutdata(data['家属数量'], 5)

    # Replace each variable's bucket codes with the corresponding WOE values
    def replace_train(cut, cut_woe):
        # cut: bucket codes; cut_woe: the WOE value of each bucket
        a = []
        for i in cut.unique():  # unique keeps one copy of each code
            a.append(i)
        a.sort()  # ascending, i.e. [0, 1, 2, 3, 4]
        for m in range(len(a)):
            # cut_woe is also ordered by bucket, so a[m] pairs with
            # cut_woe.values[m]: code 0 becomes the first bucket's WOE, and
            # the ordering of the codes is preserved
            cut.replace(a[m], cut_woe.values[m], inplace=True)
        return cut

    train_new = pd.DataFrame()  # new frame holding the WOE-encoded data
    train_new['好坏客户'] = data['好坏客户']
    train_new['可用额度比值'] = replace_train(cut1, x1.woe)
    train_new['年龄'] = replace_train(cut2, x2.woe)
    train_new['逾期30-59天笔数'] = replace_train(cut3, x3.woe)
    train_new['负债率'] = replace_train(cut4, x4.woe)
    train_new['月收入'] = replace_train(cut5, x5.woe)
    train_new['信贷数量'] = replace_train(cut6, x6.woe)
    train_new['逾期90天笔数'] = replace_train(cut7, x7.woe)
    train_new['固定资产贷款量'] = replace_train(cut8, x8.woe)
    train_new['逾期60-89天笔数'] = replace_train(cut9, x9.woe)
    train_new['家属数量'] = replace_train(cut10, x10.woe)
    print(train_new.head())
    return train_new
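
# A minimal worked example of the WOE/IV arithmetic that `binning` performs,
# on a hypothetical two-bucket split (all numbers here are illustrative
# assumptions, not outputs of the real data):
def _woe_iv_demo():
    bad = np.array([10, 30])   # bad customers per bucket (total 40)
    good = np.array([90, 70])  # good customers per bucket (total 160)
    woe = np.log((bad / bad.sum()) / (good / good.sum()))  # ≈ [-0.811, 0.539]
    iv = ((bad / bad.sum()) - (good / good.sum())) * woe   # per-bucket IV
    return woe, iv.sum()  # the variable's IV, ≈ 0.42

# The IV bar chart drawn in `binning` ranks features by exactly this per-variable sum.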

def train_SVM(data):
    x = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.8, random_state=1111)
    # standardize using statistics of the training split only
    stand = StandardScaler()
    stand = stand.fit(train_x)
    train_x = stand.transform(train_x)
    test_x = stand.transform(test_x)
    model = SVC(probability=True)
    result = model.fit(train_x, train_y)
    # pred_y = model.predict(test_x)
    score = model.score(test_x, test_y)
    print("test score:", score)
    x1 = test_x[:1]
    print("x1:", x1)
    y1 = model.predict_proba(x1)
    print("y1:", y1)
    save(model, "svc_model.pk")
    return model, stand


def train_xgb(data):
    print("running,", sys._getframe().f_code.co_name)
    x = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.8, random_state=1111)
    # standardize using statistics of the training split only
    stand = StandardScaler()
    stand = stand.fit(train_x)
    train_x = stand.transform(train_x)
    test_x = stand.transform(test_x)
    model = xgb.XGBClassifier(learning_rate=0.1,
                              n_estimators=1000,            # number of boosted trees
                              max_depth=6,                  # depth of each tree
                              min_child_weight=1,           # minimum leaf weight
                              gamma=0.,                     # penalty coefficient on the number of leaves
                              subsample=0.8,                # sample 80% of rows per tree
                              colsample_bytree=0.8,         # sample 80% of features per tree
                              objective='binary:logistic',  # loss function
                              scale_pos_weight=1,           # weight to offset class imbalance
                              random_state=27)              # random seed
    result = model.fit(train_x, train_y)
    score = model.score(test_x, test_y)
    print("test score:", score)
    x1 = test_x[:1]
    print("x1:", x1)
    y1 = model.predict(x1)
    y2 = model.predict_proba(x1)
    print("y1:", y1, y2)
    save(model, "xgb_model.pk")
    return model, stand


'''
Scorecard calibration: given the reference odds v (bad : good), the score Z
assigned at those odds, and PDO (the points that double the odds):
    B = PDO / ln(2)
    A = Z + B * ln(v)
The constants below use Z = 60, PDO = 3 and v = 1/50.
'''
B = 3 / np.log(2)
A = 60 + B * np.log(1 / 50)


def get_score(bad_prob, good_prob, A=A, B=B):
    '''
    Scoring formula:
        score = A - B * ln(bad_prob / good_prob)
    or, equivalently:
        score = A + PDO * log2(good_prob / bad_prob)
    '''
    odds = bad_prob / good_prob
    score = A - B * np.log(odds)
    # print(A + 30 * np.log2(good_prob / bad_prob))
    return score


import pickle


def save(object_to_save, path):
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)


def load(path):
    with open(path, 'rb') as f:
        object1 = pickle.load(f)
    return object1


class AHP:
    """Analytic hierarchy process over a pairwise judgment matrix."""

    def __init__(self, array):
        # the judgment matrix
        self.array = array
        # order of the matrix
        self.n = array.shape[0]
        # RI values for the consistency test, indexed by matrix order
        self.RI_list = [0, 0, 0.52, 0.89, 1.12, 1.26, 1.36, 1.41, 1.46,
                        1.49, 1.52, 1.54, 1.56, 1.58, 1.59]
        # eigenvalues and eigenvectors of the matrix
        self.eig_val, self.eig_vector = np.linalg.eig(self.array)
        # largest eigenvalue (real part, since np.linalg.eig may return complex values)
        self.max_eig_val = np.max(self.eig_val).real
        # eigenvector of the largest eigenvalue
        self.max_eig_vector = self.eig_vector[:, np.argmax(self.eig_val)].real
        # consistency index CI
        self.CI_val = (self.max_eig_val - self.n) / (self.n - 1)
        # consistency ratio CR
        self.CR_val = self.CI_val / (self.RI_list[self.n - 1])

    def test_consist(self):
        """Consistency test."""
        # print the consistency index CI and the consistency ratio CR
        print("CI of the judgment matrix:", self.CI_val)
        print("CR of the judgment matrix:", self.CR_val)
        if self.n == 2:
            # with only two sub-factors there is no consistency problem
            print("Only two sub-factors; no consistency issue.")
        else:
            if self.CR_val < 0.1:  # CR below 0.1 passes the test
                print("CR =", self.CR_val, "- consistency test passed")
                return True
            else:  # CR of 0.1 or above fails the test
                print("CR =", self.CR_val, "- consistency test failed")
                return False

    def cal_weight_by_arithmetic_method(self):
        """Weights by the arithmetic-mean method."""
        col_sum = np.sum(self.array, axis=0)  # column sums
        array_normed = self.array / col_sum   # normalize each column
        array_weight = np.sum(array_normed, axis=1) / self.n  # average across rows
        print("Weights (arithmetic-mean method):\n", array_weight)
        return array_weight

    def cal_weight_by_geometric_method(self):
        """Weights by the geometric-mean method."""
        row_product = np.prod(self.array, axis=1)         # product of each row
        array_power = np.power(row_product, 1 / self.n)   # n-th root of each product
        array_weight = array_power / np.sum(array_power)  # normalize
        print("Weights (geometric-mean method):\n", array_weight)
        return array_weight

    def cal_weight_by_eigenvalue_method(self):
        """Weights by the eigenvalue method."""
        # normalizing the principal eigenvector yields the weights
        array_weight = self.max_eig_vector / np.sum(self.max_eig_vector)
        print("Weights (eigenvalue method):\n", array_weight)
        return array_weight
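
# A quick sanity check of the scorecard calibration, derived only from the
# formulas above: B = 3 / ln 2 ≈ 4.33 and A = 60 + B * ln(1/50) ≈ 43.07, so
#   get_score(0.5, 0.5) -> A ≈ 43.07      (even odds)
#   get_score(1/3, 2/3) -> A + 3 ≈ 46.07  (good odds doubled: +PDO points)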

if __name__ == '__main__':
    # data = get_data()
    # data = data_process(data)
    # # model, stand = train_SVM(data)
    # model, stand = train_xgb(data)
    # test_data = pd.read_csv("Data/cs-test.csv", index_col=0)

    # model = load('svc_model.pk')
    model = load('xgb_model.pk')
    data = pd.read_csv("../Data/cs-training.csv", index_col=0)
    # x = np.array([np.array([-0.01867214, 0.28473352, -0.35965723, -0.12996393, -0.18025808, 0.07713616,
    #                         -0.18793693, -0.01226944, -0.19963244, -0.72204477])])
    # x = np.array([np.array([-0.92867214, 0.98473352, -0.05965723, -0.92996393, -0.08025808, 0.17713616,
    #                         -0.58793693, -0.91226944, -0.99963244, -0.92204477])])
    # print(model.predict(x))
    # a, b = model.predict_proba(x)[0]
    # print(get_score(b, a), '\n')
    print(get_score(0.5, 0.5))

    # Example judgment matrix:
    # b = np.array([[1, 1 / 3, 1 / 8], [3, 1, 1 / 3], [8, 3, 1]])
    '''
    Feature hierarchy:
    "个人信息" (personal info): "年龄" "月收入" "家属数量"
    "信贷信息" (credit info):   "负债率" "逾期30-59天笔数" "逾期60-89天笔数" "逾期90天笔数"
                                "可用额度比值" "信贷数量" "固定资产贷款量"
    '''
    # top level: "个人信息" vs. "信贷信息"
    a = np.array([[1, 1 / 3],
                  [3, 1]])
    ahp1 = AHP(a)
    # weights by the arithmetic-mean method
    weight1 = ahp1.cal_weight_by_arithmetic_method()
    # weight2 = AHP(a).cal_weight_by_geometric_method()
    # weights by the eigenvalue method
    weight3 = ahp1.cal_weight_by_eigenvalue_method()
    ahp1.test_consist()

    # "年龄", "月收入", "家属数量"
    b1 = np.array([[1, 1 / 3, 6],
                   [3, 1, 9],
                   [1 / 6, 1 / 9, 1]])
    ahp2 = AHP(b1)
    weight4 = ahp2.cal_weight_by_arithmetic_method()
    weight5 = ahp2.cal_weight_by_eigenvalue_method()
    ahp2.test_consist()

    # "负债率", "逾期30-59天笔数", "逾期60-89天笔数", "逾期90天笔数", "可用额度比值", "信贷数量", "固定资产贷款量"
    b2 = np.array([[1, 2, 4, 5, 3, 2, 2],
                   [1 / 2, 1, 3, 4, 1, 2, 1],
                   [1 / 4, 1 / 3, 1, 2, 1 / 2, 1, 1 / 2],
                   [1 / 5, 1 / 4, 1 / 2, 1, 1 / 4, 1 / 5, 1 / 3],
                   [1 / 3, 1, 2, 4, 1, 2, 2],
                   [1 / 2, 1 / 2, 1, 5, 1 / 2, 1, 1],
                   [1 / 2, 1, 2, 3, 1 / 2, 1, 1]])
    ahp3 = AHP(b2)
    weight6 = ahp3.cal_weight_by_arithmetic_method()
    weight7 = ahp3.cal_weight_by_eigenvalue_method()
    ahp3.test_consist()
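
    # A minimal sketch (not in the original script) of the standard AHP
    # synthesis step: scale each group's within-group weights by its group
    # weight from `weight1` and concatenate into global feature weights.
    #   global_personal = weight1[0] * weight4   # 年龄, 月收入, 家属数量
    #   global_credit   = weight1[1] * weight6   # 负债率 ... 固定资产贷款量
    #   global_weights  = np.concatenate([global_personal, global_credit])
    #   # global_weights sums to 1 and could rescale per-feature score contributions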