请问田野老师,计算woe的时候,总是提示argument of type 'int' is not iterable

'''问题,运行以下代码,总是提示argument of type 'int' is not iterable'''

# Function: compute WOE and IV.
# df: data frame containing the feature and the target; col: the feature to
# compute WOE/IV for, usually a categorical variable.
def CalcWOE(df, col, target):
    """Compute the Weight of Evidence (WOE) per category of `col` and the
    overall Information Value (IV) against the binary `target`.

    Parameters
    ----------
    df : pandas.DataFrame containing both `col` and `target`
    col : str, name of the (categorical) feature column
    target : str, name of the binary target column (1 = bad, 0 = good)

    Returns
    -------
    dict with keys:
        'WOE' : {category -> WOE value}
        'IV'  : total information value (float)

    NOTE(review): assumes every category contains at least one good and one
    bad sample, otherwise the log / division below hits 0 — callers are
    expected to merge zero-bad bins first (see MergeBad0).
    """
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()      # target is 0/1, so sum == bad count
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    N = regroup['total'].sum()
    B = regroup['bad'].sum()
    regroup['good'] = regroup['total'] - regroup['bad']
    G = N - B
    # vectorized column arithmetic instead of the original per-row map lambdas
    regroup['bad_pcnt'] = regroup['bad'] * 1.0 / B
    regroup['good_pcnt'] = regroup['good'] * 1.0 / G
    regroup['WOE'] = np.log(regroup['good_pcnt'] * 1.0 / regroup['bad_pcnt'])
    # one flat {category: WOE} dict instead of to_dict(orient='index')
    # followed by a fix-up loop
    WOE_dict = regroup.set_index(col)['WOE'].to_dict()
    IV = ((regroup['good_pcnt'] - regroup['bad_pcnt']) * regroup['WOE']).sum()
    return {"WOE": WOE_dict, 'IV': IV}


# df: data frame containing the feature and the target; col: the feature to
# encode with its bad rate, usually a categorical variable.
def BadRateEncoding(df, col, target):
    """Encode the categorical column `col` by the bad rate of each category.

    Parameters
    ----------
    df : pandas.DataFrame containing both `col` and `target`
    col : str, name of the categorical feature column
    target : str, name of the binary target column (1 = bad, 0 = good)

    Returns
    -------
    dict with keys:
        'encoding' : pandas.Series, df[col] with each value replaced by
                     its category's bad rate
        'br_rate'  : {category -> bad rate} mapping used for the encoding
    """
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()      # target is 0/1, so sum == bad count
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    # vectorized division instead of the original per-row apply
    regroup['bad_rate'] = regroup['bad'] * 1.0 / regroup['total']
    # one flat {category: bad_rate} dict instead of to_dict(orient='index')
    # followed by a fix-up loop
    br_dict = regroup.set_index(col)['bad_rate'].to_dict()
    badRateEncoding = df[col].map(br_dict)
    return {'encoding': badRateEncoding, 'br_rate': br_dict}



# Function: if any category has a bad rate of 0, merge those categories with
# the category that has the smallest non-zero bad rate.
# df: dataset; col: the feature to bin, usually a categorical variable.
def MergeBad0(df, col, target):
    """Build a {category -> 'Bin i'} mapping that folds every category with a
    zero bad rate into the category with the smallest non-zero bad rate;
    all remaining categories keep their own bin.

    Parameters
    ----------
    df : pandas.DataFrame containing both `col` and `target`
    col : str, name of the categorical feature column
    target : str, name of the binary target column (1 = bad, 0 = good)

    Returns
    -------
    dict mapping every original category to a bin label 'Bin 0', 'Bin 1', ...
    """
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()      # target is 0/1, so sum == bad count
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    regroup['bad_rate'] = regroup['bad'] * 1.0 / regroup['total']
    # Sort ascending so the zero-bad-rate categories come first.
    # BUG FIX: reset the index after sorting -- the original kept the
    # pre-sort labels, so `regroup['bad_rate'][i+1]` looked up the row whose
    # *original label* was i+1 (not the row at sorted position i+1), which
    # checked the wrong bad rate and could overrun col_regroup (IndexError).
    regroup = regroup.sort_values(by='bad_rate').reset_index(drop=True)
    col_regroup = [[c] for c in regroup[col]]
    # Bound at shape[0]-1: when every bad rate is 0 we merge everything into
    # a single bin instead of indexing past the end of col_regroup.
    for i in range(regroup.shape[0] - 1):
        # fold the current smallest-rate group into the next one
        col_regroup[1] = col_regroup[0] + col_regroup[1]
        col_regroup.pop(0)
        # stop once the group we merged into has a positive bad rate
        if regroup['bad_rate'].iloc[i + 1] > 0:
            break
    newGroup = {}
    for i in range(len(col_regroup)):
        for g2 in col_regroup[i]:
            newGroup[g2] = 'Bin ' + str(i)
    return newGroup
    

# Categorical-feature preprocessing driver.
# Relies on names defined elsewhere in the file/session: trainData,
# categorical_var, numerical_var, deleted_features, encoded_features,
# merged_features, var_WOE, var_IV, MaximumBinPcnt.
# BUG FIX: iterate over a *copy* of categorical_var -- the loop body calls
# categorical_var.remove(col), and removing from the list being iterated
# silently skips the element that follows the removed one.
# Also converted Python-2 print statements to print() calls (valid in 2 and 3).
for col in list(categorical_var):
    print('we are processing {}'.format(col))
    if len(set(trainData[col])) > 5:
        # too many levels to bin directly: encode by bad rate instead
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col) + '_encoding'
        # (1) compute the bad rate and encode the original values with it
        encoding_result = BadRateEncoding(trainData, col, 'target')
        trainData[col0], br_encoding = encoding_result['encoding'], encoding_result['br_rate']
        # (2) treat the bad-rate-encoded column as numerical from now on
        numerical_var.append(col0)
        # (3) save the encoding result: new column name + bad-rate mapping
        encoded_features[col] = [col0, br_encoding]
        # (4) mark the original column for deletion
        deleted_features.append(col)
    else:
        maxPcnt = MaximumBinPcnt(trainData, col)
        if maxPcnt > 0.9:
            # one bin dominates -> the feature carries almost no information
            print('{} is deleted because of large percentage of single bin'.format(col))
            deleted_features.append(col)
            categorical_var.remove(col)
            continue
        bad_bin = trainData.groupby([col])['target'].sum()
        if min(bad_bin) == 0:
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'
            # (1) decide how to merge the zero-bad categories
            mergeBin = MergeBad0(trainData, col, 'target')
            # (2) map the original values onto the merged bins
            trainData[col1] = trainData[col].map(mergeBin)
            maxPcnt = MaximumBinPcnt(trainData, col1)
            if maxPcnt > 0.9:
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del trainData[col]
                continue
            # (3) merged data is acceptable: keep it and compute WOE/IV
            merged_features[col] = [col1, mergeBin]
            WOE_IV = CalcWOE(trainData, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            deleted_features.append(col)
        else:
            WOE_IV = CalcWOE(trainData, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']

山羊Thegoat

赞同来自:

可以试下将 regroup.reset_index(level=0, inplace=True) 改为 regroup.reset_index(level=[0, ], inplace=True)。我是在另外一个计算违约率的函数 BinBadRate 里遇到同样的错误,一个同学告诉我这个方法,改完就好了。同时也遇到了跟上面这位同学同样的错误,还没来得及试。我是在 python 2.7.13 里运行的。

jursey

赞同来自:

运行 deleted_features = []  #delete the categorical features in one of its single bin occupies more than 90% encoded_features = {} merged_features = {} var_IV = {}  #save the IV values for binned features var_WOE = {} for col in categorical_var:     print( 'we are processing {}'.format(col))     if len(set(trainData[col]))>5:         print( '{} is encoded with bad rate'.format(col))         col0 = str(col)+'_encoding'         #(1), calculate the bad rate and encode the original value using bad rate         encoding_result = BadRateEncoding(trainData, col, 'target')         trainData[col0], br_encoding = encoding_result['encoding'],encoding_result['br_rate']         #(2), push the bad rate encoded value into numerical varaible list         numerical_var.append(col0)         #(3), save the encoding result, including new column name and bad rate         encoded_features[col] = [col0, br_encoding]         #(4), delete the original value         #del trainData[col]         deleted_features.append(col)     else:         maxPcnt = MaximumBinPcnt(trainData, col)         if maxPcnt > 0.9:             print ('{} is deleted because of large percentage of single bin'.format(col))             deleted_features.append(col)             categorical_var.remove(col)             #del trainData[col]             continue         bad_bin = trainData.groupby([col])['target'].sum()         if min(bad_bin) == 0:             print( '{} has 0 bad sample!'.format(col))             col1 = str(col) + '_mergeByBadRate'             #(1), determine how to merge the categories             mergeBin = MergeBad0(trainData, col, 'target')             #(2), convert the original data into merged data             trainData[col1] = trainData[col].map(mergeBin)             maxPcnt = MaximumBinPcnt(trainData, col1)             if maxPcnt > 0.9:                 print( '{} is deleted because of large percentage of single bin'.format(col))                 deleted_features.append(col)                 
categorical_var.remove(col)                 del trainData[col]                 continue             #(3) if the merged data satisify the requirement, we keep it             merged_features[col] = [col1, mergeBin]             WOE_IV = CalcWOE(trainData, col1, 'target')             var_WOE[col1] = WOE_IV['WOE']             var_IV[col1] = WOE_IV['IV']             #del trainData[col]             deleted_features.append(col)         else:             WOE_IV = CalcWOE(trainData, col, 'target')             var_WOE[col] = WOE_IV['WOE']             var_IV[col] = WOE_IV['IV']          出现报错  --------------------------------------------------------------------------- TypeError                                 Traceback (most recent call last) <ipython-input-29-f416b2c7521f> in <module>()      10         col0 = str(col)+'_encoding'      11         #(1), calculate the bad rate and encode the original value using bad rate ---> 12         encoding_result = BadRateEncoding(trainData, col, 'target')      13         trainData[col0], br_encoding = encoding_result['encoding'],encoding_result['br_rate']      14         #(2), push the bad rate encoded value into numerical varaible list C:\data\code\scorecard_functions.py in BadRateEncoding(df, col, target)     107     bad = pd.DataFrame({'bad': bad})     108     regroup = total.merge(bad, left_index=True, right_index=True, how='left') --> 109     regroup.reset_index(level[0,], inplace=True)     110     regroup['bad_rate'] = regroup.apply(lambda x: x.bad*1.0/x.total,axis = 1)     111     br_dict = regroup[[col,'bad_rate']].set_index([col]).to_dict(orient='index') C:\Users\admin\Anaconda3\lib\site-packages\pandas\core\frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)    3053                 # to ndarray and maybe infer different dtype    3054                 level_values = _maybe_casted_values(lev, lab) -> 3055                 if level is None or i in level:    3056                     new_obj.insert(0, name, 
level_values)    3057 TypeError: argument of type 'int' is not iterable

要回复问题请先登录注册