- import sys
- import pandas as pd
- from sklearn.ensemble import RandomForestRegressor
- import matplotlib.pyplot as plt
- import matplotlib as mpl
- mpl.rcParams['font.sans-serif'] = ['KaiTi']
- mpl.rcParams['font.serif'] = ['KaiTi']
- import warnings
- warnings.filterwarnings("ignore")
- from scipy import stats
- import numpy as np
- from sklearn.linear_model import LogisticRegression
- from sklearn.svm import SVC,LinearSVC
- from sklearn.preprocessing import MinMaxScaler,StandardScaler
- from sklearn.model_selection import train_test_split  # moved here from sklearn.cross_validation in scikit-learn 0.20
- import xgboost as xgb
- def get_data():
- data = pd.read_csv("../Data/cs-training.csv", index_col=0)
- # print(data.info())
- pd.set_option('display.max_columns', None)
- pd.set_option('display.max_rows', None)
- # print("\033[7;37;41m\t 数据详细描述: \033[0m")
- # print(data.describe())
- print("get data!")
- return data
- def data_process(data):
- def set_missing(df):
- # pull out the numeric features; MonthlyIncome sits in column 5 and NumberOfDependents in column 10
- process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]  # positional indexing (.ix is no longer available in pandas)
- # split into rows where MonthlyIncome is known and rows where it is missing
- known = process_df[process_df.MonthlyIncome.notnull()].values  # .values replaces the removed .as_matrix()
- unknown = process_df[process_df.MonthlyIncome.isnull()].values
- X = known[:, 1:] # X: the other features
- y = known[:, 0] # y: MonthlyIncome, the value to predict
- rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3)
- rfr.fit(X, y)
- print(rfr.feature_importances_)
- # predict the missing MonthlyIncome values with the fitted model
- predicted = rfr.predict(unknown[:, 1:]).round(0)
- # write the predictions back into the original missing entries
- df.loc[df.MonthlyIncome.isnull(), 'MonthlyIncome'] = predicted
- return df
- # use a random forest to fill the feature with many missing values (MonthlyIncome)
- data = set_missing(data)
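- # A roughly equivalent alternative (a hedged sketch, not the author's method):
- # scikit-learn's IterativeImputer (available since 0.21) can run model-based
- # imputation in one call; note it would also fill NumberOfDependents, which
- # this script drops instead:
- # from sklearn.experimental import enable_iterative_imputer  # noqa: F401
- # from sklearn.impute import IterativeImputer
- # imp = IterativeImputer(estimator=RandomForestRegressor(n_estimators=200, max_depth=3, random_state=0), max_iter=1, random_state=0)
- # data.iloc[:, :] = imp.fit_transform(data)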
- data = data.dropna()
- data = data.drop_duplicates()
- states = {
- 'SeriousDlqin2yrs': '好坏客户',
- 'RevolvingUtilizationOfUnsecuredLines': '可用额度比值', # ratio of revolving credit in use on unsecured lines
- 'age': '年龄',
- 'NumberOfTime30-59DaysPastDueNotWorse': '逾期30-59天笔数',
- 'DebtRatio': '负债率',
- 'MonthlyIncome': '月收入',
- 'NumberOfOpenCreditLinesAndLoans': '信贷数量',
- 'NumberOfTimes90DaysLate': '逾期90天笔数',
- 'NumberRealEstateLoansOrLines': '固定资产贷款量',
- 'NumberOfTime60-89DaysPastDueNotWorse': '逾期60-89天笔数',
- 'NumberOfDependents': '家属数量'
- } # rename map; the Chinese names are kept because the rest of the script indexes columns by them
- data.rename(columns=states, inplace=True)
- print(data[data['年龄']>90].count())
- data = data[data['年龄']>0]
- data = data[data['年龄']<100]
- data = data[data['逾期30-59天笔数']<85]
- data = data[data['家属数量']<30]
- data = data[data['负债率']<100]
- print(data.info())
- print("data process!")
- return data
- def data_analysis(data):
- # train_box = data.iloc[:,[3,7,9]]
- # train_box.boxplot(figsize=(10,4))
- # plt.show()
- fig = plt.figure(figsize=(15, 10))
- a = fig.add_subplot(3, 2, 1)
- b = fig.add_subplot(3, 2, 2)
- c = fig.add_subplot(3, 2, 3)
- d = fig.add_subplot(3, 2, 4)
- e = fig.add_subplot(3, 2, 5)
- f = fig.add_subplot(3, 2, 6)
- a.boxplot(data['可用额度比值'], labels=['可用额度比值'])
- b.boxplot([data['年龄'], data['好坏客户']], labels=['年龄', '好坏客户'])
- c.boxplot([data['逾期30-59天笔数'], data['逾期60-89天笔数'], data['逾期90天笔数']], labels=['逾期30-59天笔数', '逾期60-89天笔数', '逾期90天笔数'])
- d.boxplot([data['信贷数量'], data['固定资产贷款量'], data['家属数量']], labels=['信贷数量', '固定资产贷款量', '家属数量'])
- e.boxplot(data['月收入'], labels=['月收入'])
- f.boxplot(data['负债率'], labels=['负债率'])
- data.hist(figsize=(20, 15))
- for k in [1, 2, 4, 5]: # iterate over the columns to de-outlier
- q1 = data.iloc[:, k].quantile(0.25) # lower quartile (Q1)
- q3 = data.iloc[:, k].quantile(0.75) # upper quartile (Q3)
- iqr = q3 - q1
- print(q3)
- low = q1 - 1.5 * iqr
- up = q3 + 1.5 * iqr
- data = data[(data.iloc[:, k] > low) & (data.iloc[:, k] < up)] # keep only values inside the 1.5*IQR fence
- data.info()
- data.iloc[:, [1, 2, 4, 5]].boxplot(figsize=(15, 10))
- data = data[data['逾期30-59天笔数'] < 80]
- data = data[data['逾期60-89天笔数'] < 80]
- data = data[data['逾期90天笔数'] < 80]
- data = data[data['固定资产贷款量'] < 50]
- data = data[data['家属数量'] < 15]
- print(data['好坏客户'].sum())
- return data
- def binning(data):
- # automatic binning via optimal segmentation
- '''Spearman's rank correlation is used for the check: both variables are ranked
- before correlating, the coefficient lies in [-1, 1], and it is exactly +1 or -1
- when the two variables are perfectly monotonically related.'''
- def op(y, x, n=20): # x: variable to bin; y: target (good/bad flag); n: initial bin count (20), reduced step by step
- r = 0
- bad = y.sum() # the flag is 1 for bad customers, so the sum is the total bad count
- good = y.count() - bad # total customers minus bad gives the good count
- while np.abs(r) < 1: # merge bins until the per-bin means of x and y are perfectly monotonic (|r| == 1)
- d1 = pd.DataFrame({"x": x, "y": y, "bucket": pd.qcut(x, n)})
- d2 = d1.groupby('bucket', as_index=True) # group d1 by bucket; as_index=True keeps the bucket label as the group index
- r, p = stats.spearmanr(d2.mean().x, d2.mean().y) # Spearman rank correlation between the per-bucket means of x and y (coefficient and p-value)
- n = n - 1 # try one bucket fewer on the next pass
- d3 = pd.DataFrame() # per-bucket summary table
- d3['min'] = d2.min().x
- d3['max'] = d2.max().x
- d3['sum'] = d2.sum().y # bad customers in the bucket
- d3['total'] = d2.count().y # total customers in the bucket
- d3['rate'] = d2.mean().y # bad rate of the bucket
- d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) * (good / bad)) # WOE = ln((bad_i/bad)/(good_i/good)), matching funqcut below
- d3['iv'] = (d3['sum'] / bad - (d3['total'] - d3['sum']) / good) * d3['woe'] # per-bucket IV contribution
- d4 = d3.sort_values(by='min').reset_index(drop=True)
- print("=" * 60)
- print(d4)
- return (d4)
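- # Worked example of the stopping rule: if the per-bucket means of x are
- # [1, 2, 3] and the corresponding mean bad rates are [0.30, 0.10, 0.05],
- # stats.spearmanr([1, 2, 3], [0.30, 0.10, 0.05]) returns r = -1.0 (perfectly
- # monotonic decreasing), so the loop stops at that bin count.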
- # apply the function to the continuous variables that admit optimal segmentation:
- # x2=op(data['好坏客户'],data['年龄'])
- # x4=op(data['好坏客户'],data['负债率'])
- # x5=op(data['好坏客户'],data['月收入'])
- # the remaining variables get equal-depth (quantile) binning instead
- def funqcut(y, x, n):
- cut1 = pd.qcut(x.rank(method='first'), n) # equal-depth binning; rank(method='first') breaks ties so qcut gets unique edges
- data = pd.DataFrame({"x": x, "y": y, "cut1": cut1})
- cutbad = data.groupby(cut1).y.sum() # bad customers per bin
- cutgood = data.groupby(cut1).y.count() - cutbad # good customers per bin
- bad = data.y.sum() # total bad customers
- good = data.y.count() - bad # total good customers
- woe = np.log((cutbad / bad) / (cutgood / good)) # WOE per bin
- iv = (cutbad / bad - cutgood / good) * woe # IV contribution per bin
- cut = pd.DataFrame({"坏客户数": cutbad, "好客户数": cutgood, "woe": woe, "iv": iv})
- print(cut)
- return cut # per-bin summary table
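- # Worked example for one bin: with 200 bad and 1600 good customers overall,
- # a bin holding 40 bad and 160 good gives
- # woe = ln((40/200) / (160/1600)) = ln(0.2/0.1) = ln 2 ≈ 0.693
- # iv = (0.2 - 0.1) * 0.693 ≈ 0.069
- # Positive WOE marks a bin with a higher-than-average share of bad customers.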
- # e.g. funqcut(data['好坏客户'], data['年龄'], 6).reset_index()
- x1 = funqcut(data['好坏客户'], data['可用额度比值'], 5).reset_index()
- x2 = funqcut(data['好坏客户'], data['年龄'], 8).reset_index()
- x4 = funqcut(data['好坏客户'], data['负债率'], 4).reset_index()
- x5 = funqcut(data['好坏客户'], data['月收入'], 5).reset_index()
- x6 = funqcut(data['好坏客户'], data['信贷数量'], 6).reset_index()
- x3 = funqcut(data['好坏客户'], data['逾期30-59天笔数'], 5).reset_index()
- x7 = funqcut(data['好坏客户'], data['逾期90天笔数'], 5).reset_index()
- x8 = funqcut(data['好坏客户'], data['固定资产贷款量'], 5).reset_index()
- x9 = funqcut(data['好坏客户'], data['逾期60-89天笔数'], 5).reset_index()
- x10 = funqcut(data['好坏客户'], data['家属数量'], 5).reset_index()
- ivx1 = x1.iv.sum()
- ivx2 = x2.iv.sum()
- ivx3 = x3.iv.sum()
- ivx4 = x4.iv.sum()
- ivx5 = x5.iv.sum()
- ivx6 = x6.iv.sum()
- ivx7 = x7.iv.sum()
- ivx8 = x8.iv.sum()
- ivx9 = x9.iv.sum()
- ivx10 = x10.iv.sum()
- IV = pd.DataFrame({"可用额度比值": ivx1,
- "年龄": ivx2,
- "逾期30-59天笔数": ivx3,
- "负债率": ivx4,
- "月收入": ivx5,
- "信贷数量": ivx6,
- "逾期90天笔数": ivx7,
- "固定资产贷款量": ivx8,
- "逾期60-89天笔数": ivx9,
- "家属数量": ivx10}, index=[0])
- ivplot = IV.plot.bar(figsize=(15, 10))
- ivplot.set_title('IV value of each feature variable')
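- # A common rule of thumb for reading this chart (Siddiqi's scorecard
- # convention): IV < 0.02 is unpredictive, 0.02-0.1 weak, 0.1-0.3 medium,
- # and > 0.3 strong.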
- # Using the same equal-depth settings, store each variable's bin codes as cut1, cut2, cut3 ... so they line up with the summary tables x1, x2, ... (x2/x4/x5 could use the optimal-segmentation results instead)
- def cutdata(x, n):
- a = pd.qcut(x.rank(method='first'), n, labels=False) # equal-depth binning; labels=False returns integer bin codes (0, 1, 2, ...)
- return a
- # each continuous variable is split into the same number of bins as in its summary table above
- # apply the function to get every variable's bin codes
- cut1 = cutdata(data['可用额度比值'], 5)
- cut2 = cutdata(data['年龄'], 8)
- cut3 = cutdata(data['逾期30-59天笔数'], 5)
- cut4 = cutdata(data['负债率'], 4)
- cut5 = cutdata(data['月收入'], 5)
- cut6 = cutdata(data['信贷数量'], 6)
- cut7 = cutdata(data['逾期90天笔数'], 5)
- cut8 = cutdata(data['固定资产贷款量'], 5)
- cut9 = cutdata(data['逾期60-89天笔数'], 5)
- cut10 = cutdata(data['家属数量'], 5)
- # replace each variable's bin codes with the corresponding WOE values
- def replace_train(cut, cut_woe): # cut: a variable's bin codes; cut_woe: WOE value of each bin
- a = []
- for i in cut.unique(): # the distinct bin codes
- a.append(i)
- a.sort() # ascending; the codes are effectively [0, 1, 2, 3, 4]
- for m in range(len(a)):
- cut.replace(a[m], cut_woe.values[m],
- inplace=True) # cut_woe is ordered by bin, so code a[m] maps to the m-th WOE value
- return cut # the WOE-encoded series
- train_new = pd.DataFrame() # new frame holding the WOE-encoded features
- train_new['好坏客户'] = data['好坏客户']
- train_new['可用额度比值'] = replace_train(cut1, x1.woe)
- train_new['年龄'] = replace_train(cut2, x2.woe)
- train_new['逾期30-59天笔数'] = replace_train(cut3, x3.woe)
- train_new['负债率'] = replace_train(cut4, x4.woe)
- train_new['月收入'] = replace_train(cut5, x5.woe)
- train_new['信贷数量'] = replace_train(cut6, x6.woe)
- train_new['逾期90天笔数'] = replace_train(cut7, x7.woe)
- train_new['固定资产贷款量'] = replace_train(cut8, x8.woe)
- train_new['逾期60-89天笔数'] = replace_train(cut9, x9.woe)
- train_new['家属数量'] = replace_train(cut10, x10.woe)
- print(train_new.head())
- return train_new
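- # Presumed call chain (a hedged sketch; the __main__ block below trains on the
- # raw processed frame instead): a scorecard model would normally be fit on the
- # WOE-encoded frame, e.g.
- # train_new = binning(data_process(get_data()))
- # model, stand = train_SVM(train_new)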
- def train_SVM(data):
- x = data.iloc[:, 1:]
- y = data.iloc[:, 0]
- train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.8, random_state=1111)
- # standardize the features (fit on the training split only)
- stand = StandardScaler()
- stand = stand.fit(train_x)
- train_x = stand.transform(train_x)
- test_x = stand.transform(test_x)
- model = SVC(probability=True)
- result = model.fit(train_x,train_y)
- # pred_y = model.predict(test_x)
- score = model.score(test_x,test_y)
- print("test score:",score)
- x1 = test_x[:1]
- print("x1:",x1)
- y1 = model.predict_proba(x1)
- print("y1:",y1)
- save(model,"svc_model.pk")
- return model, stand
- def train_xgb(data):
- print("running,",sys._getframe().f_code.co_name)
- x = data.iloc[:, 1:]
- y = data.iloc[:, 0]
- train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.8, random_state=1111)
- # standardize the features (fit on the training split only)
- stand = StandardScaler()
- stand = stand.fit(train_x)
- train_x = stand.transform(train_x)
- test_x = stand.transform(test_x)
- model = xgb.XGBClassifier(learning_rate=0.1,
- n_estimators=1000, # number of boosting trees
- max_depth=6, # maximum tree depth
- min_child_weight=1, # minimum summed instance weight in a leaf
- gamma=0., # minimum loss reduction required to split (penalizes extra leaves)
- subsample=0.8, # sample 80% of the rows for each tree
- colsample_bytree=0.8, # sample 80% of the features for each tree
- objective='binary:logistic', # loss function
- scale_pos_weight=1, # class-imbalance weight
- random_state=27 # random seed
- )
- result = model.fit(train_x,train_y)
- score = model.score(test_x, test_y)
- print("test score:", score)
- x1 = test_x[:1]
- print("x1:",x1)
- y1 = model.predict(x1)
- y2 = model.predict_proba(x1)
- print("y1:",y1,y2)
- save(model,"xgb_model.pk")
- return model, stand
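- # scale_pos_weight=1 leaves the classes unweighted; roughly 7% of the labels
- # in this data are bad, so a common hedge (an assumption, not the author's
- # setting) is the negative/positive ratio, about 14 here:
- # ratio = float((train_y == 0).sum()) / (train_y == 1).sum()
- # model = xgb.XGBClassifier(scale_pos_weight=ratio, ...)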
- '''
- Score scaling: pick a baseline odds v (bad:good), the score Z awarded at those
- odds, and PDO, the number of points that doubles the odds. Then
- B = PDO / ln(2)
- A = Z + B * ln(v)
- The constants below use Z = 60, v = 1/50 and PDO = 3.
- '''
- B = 3/np.log(2)
- A = 60 + B*np.log(1/50)
- def get_score(bad_prob, good_prob, A=A, B=B):
- ''' scoring formula: score = A - B * ln(bad_prob/good_prob)
- equivalently: score = A + PDO * log2(good_prob/bad_prob)
- '''
- odds = bad_prob/good_prob
- score = A - B * np.log(odds)
- # print(A+30*np.log2(good_prob/bad_prob))
- return score
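- # Worked example: B = 3/ln 2 ≈ 4.328 and A = 60 + B*ln(1/50) ≈ 43.07, so
- # get_score(0.5, 0.5) (even odds) returns ≈ 43.07, and every doubling of the
- # bad:good odds costs exactly PDO = 3 points, e.g. get_score(2/3, 1/3) ≈ 40.07.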
- import pickle
- def save(object_to_save, path):
- with open(path, 'wb') as f:
- pickle.dump(object_to_save, f)
- def load(path):
- with open(path, 'rb') as f:
- object1 = pickle.load(f)
- return object1
- class AHP:
- """
- 相关信息的传入和准备
- """
- def __init__(self, array):
- ## the judgment (pairwise comparison) matrix
- self.array = array
- ## order of the matrix
- self.n = array.shape[0]
- # random consistency index RI, indexed by matrix order, used in the consistency test
- self.RI_list = [0, 0, 0.52, 0.89, 1.12, 1.26, 1.36, 1.41, 1.46, 1.49, 1.52, 1.54, 1.56, 1.58,
- 1.59]
- # eigenvalues and eigenvectors of the matrix
- self.eig_val, self.eig_vector = np.linalg.eig(self.array)
- # largest eigenvalue (np.linalg.eig returns complex values, so compare real parts)
- self.max_eig_val = np.max(self.eig_val.real)
- # eigenvector belonging to the largest eigenvalue
- self.max_eig_vector = self.eig_vector[:, np.argmax(self.eig_val.real)].real
- # consistency index CI = (lambda_max - n) / (n - 1)
- self.CI_val = (self.max_eig_val - self.n) / (self.n - 1)
- # consistency ratio CR = CI / RI (RI is 0 for n <= 2, so CR is only meaningful for n >= 3)
- self.CR_val = self.CI_val / (self.RI_list[self.n - 1])
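- # Worked example: every 2x2 positive reciprocal matrix, such as the
- # [[1, 1/3], [3, 1]] used in __main__, has largest eigenvalue exactly n = 2,
- # so CI = (2 - 2)/(2 - 1) = 0; inconsistency can only appear for n >= 3.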
- """
- 一致性判断
- """
- def test_consist(self):
- # 打印矩阵的一致性指标CI和一致性比例CR
- print("判断矩阵的CI值为:" + str(self.CI_val))
- print("判断矩阵的CR值为:" + str(self.CR_val))
- # 进行一致性检验判断
- if self.n == 2: # 当只有两个子因素的情况
- print("仅包含两个子因素,不存在一致性问题")
- else:
- if self.CR_val < 0.1: # CR值小于0.1,可以通过一致性检验
- print("判断矩阵的CR值为" + str(self.CR_val) + ",通过一致性检验")
- return True
- else: # CR值大于0.1, 一致性检验不通过
- print("判断矩阵的CR值为" + str(self.CR_val) + "未通过一致性检验")
- return False
- """
- 算术平均法求权重
- """
- def cal_weight_by_arithmetic_method(self):
- # 求矩阵的每列的和
- col_sum = np.sum(self.array, axis=0)
- # 将判断矩阵按照列归一化
- array_normed = self.array / col_sum
- # 计算权重向量
- array_weight = np.sum(array_normed, axis=1) / self.n
- # 打印权重向量
- print("算术平均法计算得到的权重向量为:\n", array_weight)
- # 返回权重向量的值
- return array_weight
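- # Worked example on the 2x2 matrix [[1, 1/3], [3, 1]]: the column sums are
- # [4, 4/3]; normalizing gives [[0.25, 0.25], [0.75, 0.75]]; averaging each row
- # yields the weights [0.25, 0.75].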
- """
- 几何平均法求权重
- """
- def cal_weight__by_geometric_method(self):
- # 求矩阵的每列的积
- col_product = np.product(self.array, axis=0)
- # 将得到的积向量的每个分量进行开n次方
- array_power = np.power(col_product, 1 / self.n)
- # 将列向量归一化
- array_weight = array_power / np.sum(array_power)
- # 打印权重向量
- print("几何平均法计算得到的权重向量为:\n", array_weight)
- # 返回权重向量的值
- return array_weight
- """
- 特征值法求权重
- """
- def cal_weight__by_eigenvalue_method(self):
- # 将矩阵最大特征值对应的特征向量进行归一化处理就得到了权重
- array_weight = self.max_eig_vector / np.sum(self.max_eig_vector)
- # 打印权重向量
- print("特征值法计算得到的权重向量为:\n", array_weight)
- # 返回权重向量的值
- return array_weight
- if __name__ == '__main__':
- # data = get_data()
- # data = data_process(data)
- # # model,stand = train_SVM(data)
- # model,stand = train_xgb(data)
- # test_data = pd.read_csv("Data/cs-test.csv", index_col=0)
- # model = load('svc_model.pk')
- model = load('xgb_model.pk')
- data = pd.read_csv("../Data/cs-training.csv", index_col=0)
- # x = np.array([np.array([-0.01867214, 0.28473352, -0.35965723, -0.12996393, -0.18025808, 0.07713616,
- # -0.18793693, -0.01226944, -0.19963244, -0.72204477])])
- # x = np.array([np.array([-0.92867214, 0.98473352, -0.05965723, -0.92996393, -0.08025808, 0.17713616,
- # -0.58793693, -0.91226944, -0.99963244, -0.92204477])])
- # print(model.predict(x))
- # a,b = model.predict_proba(x)[0]
- # print(get_score(b,a),'\n')
- print(get_score(0.5,0.5))
- # judgment matrices for the AHP
- # b = np.array([[1, 1 / 3, 1 / 8], [3, 1, 1 / 3], [8, 3, 1]])
- '''
- Factor hierarchy for the AHP:
- "personal info": "年龄", "月收入", "家属数量"
- "credit info": "负债率", "逾期30-59天笔数", "逾期60-89天笔数",
- "逾期90天笔数", "可用额度比值", "信贷数量", "固定资产贷款量"
- '''
- # "个人信息",“信用信息”
- a = np.array([[1, 1/3],
- [3, 1]
- ])
- ahp1 = AHP(a)
- # weights by the arithmetic mean method
- weight1 = ahp1.cal_weight_by_arithmetic_method()
- # weight2 = AHP(a).cal_weight_by_geometric_method()
- # weights by the eigenvalue method
- weight3 = ahp1.cal_weight_by_eigenvalue_method()
- ahp1.test_consist()
- # "年龄","月收入","家属数量"
- b1 = np.array([
- [1 ,1/3,6 ],
- [3 ,1 ,9 ],
- [1/6,1/9,1]
- ])
- ahp2 = AHP(b1)
- weight4 = ahp2.cal_weight_by_arithmetic_method()
- weight5 = ahp2.cal_weight_by_eigenvalue_method()
- ahp2.test_consist()
- # "负债率","逾期30-59天笔数","逾期60-89天笔数","逾期90天笔数","可用额度比值","信贷数量","固定资产贷款量"
- b2 = np.array([
- [1 ,2 ,4 ,5 ,3 ,2 ,2 ],
- [1/2,1 ,3 ,4 ,1 ,2 ,1 ],
- [1/4,1/3,1 ,2 ,1/2,1 ,1/2],
- [1/5,1/4,1/2,1 ,1/4,1/5,1/3],
- [1/3,1 ,2 ,4 ,1 ,2 ,2 ],
- [1/2,1/2,1 ,5 ,1/2,1 ,1 ],
- [1/2,1 ,2 ,3 ,1/2,1 ,1 ]
- ])
- ahp3 = AHP(b2)
- weight6 = ahp3.cal_weight_by_arithmetic_method()
- weight7 = ahp3.cal_weight_by_eigenvalue_method()
- ahp3.test_consist()
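- # Combining the two AHP levels (a hedged sketch; the script stops at the
- # per-level weights): a leaf criterion's global weight is its local weight
- # times its parent's weight, e.g. with the eigenvalue-method results:
- # personal = weight3[0] * weight5 # "年龄", "月收入", "家属数量"
- # credit = weight3[1] * weight7 # the seven credit features
- # np.concatenate([personal, credit]) then sums to 1 (up to rounding)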
- pass