异动分析之Gini和贡献度拆解
异动分析之Gini和贡献度拆解异动分析之Gini和贡献度拆解异动分析之Gini和贡献度拆解
import pandas as pd
import numpy as np
import cal_abs_contribute as cal
import downloaddata as dd
import uploaddata as ud
import time
import datetime
import jaydebeapi
from hdfs.client import InsecureClient
import pandas as pd
from hdfs import HdfsError
import itertools
today = datetime.date.today()
sqlstr = '''
select
*
from tmp_traffic.mdapp_campus_ontime_yidong_1214156_20240506 where pdate ='{0}'
'''.format(today)
df_all = dd.downloaddata(sqlstr)
#重命名列
columns=['dt','current_hour','hour_shike','scname','sodptcityname','soarrcityname','sodptstationname','soarrstationname',
'success_order_num','success_user_num','cancel_order_num','cancel_user_num',
'd14_success_ordercnt','d14_success_membercnt','d14_cancel_ordercnt','d14_cancel_membercnt','pdate'
]
df_all.columns=columns
df_all.head()
def cal_abs_contribute(df,base_col,cont_col,X,base_y,cont_y):
#df-数据集,只包含两期数据;period-日期字段;base_date-基期日期;cont_date-对照组日期;X-维度列;y-指标列
df=df.copy()
#构建基期与对照期数据集
df_base=base_col
df_cont=cont_col
#计算变化值
all_change=df_cont[cont_y].sum()-df_base[base_y].sum()
#计算所有组合的贡献度
df_contribute=pd.DataFrame(columns=['contribute','dim','dim_name','dim_value','all_base','base','all_change','change','var'])
for i in range(5): #2 表示循环2维,可最大维度数量,运行效率极慢
comb=itertools.combinations(X,i+1)
for j in comb:
#计算贡献值
dc=df_cont.groupby(list(j))[cont_y].sum()-df_base.groupby(list(j))[base_y].sum()
#计算贡献百分比
dr=dc/all_change
df_temp=pd.DataFrame(dr)
df_temp['dim']=i+1
df_temp['dim_name']=str(dr.index.names)
df_temp['dim_value']=dr.index
df_temp['all_base']=df_base[base_y].sum()
df_temp['base']=df_base.groupby(list(j))[base_y].sum().values
df_temp['all_change']=all_change
df_temp['change']=dc.values
df_temp['var']=df_temp['change']/df_temp['base']
df_temp.columns=['contribute','dim','dim_name','dim_value','all_base','base','all_change','change','var']
df_contribute=df_contribute.append(df_temp)
df_result=df_contribute[['dim','dim_name','dim_value','all_base','base','all_change','change','contribute','var']]
return df_result
#删除不需要计算的列
df=df_all.drop(['dt','current_hour','success_user_num','soarrcityname','sodptstationname','sodptcityname','soarrstationname','cancel_order_num','cancel_user_num','d14_success_membercnt','d14_cancel_ordercnt','d14_cancel_membercnt','pdate'], axis=1)
#将数据清洗为固定的格式
base_col=df[['hour_shike','scname','success_order_num']]
cont_col=df[['hour_shike','scname','d14_success_ordercnt']]
X=df.columns[0:-2]
base_y=df.columns[-2]
cont_y=df.columns[-1]
## 得到计算结果
df_result = cal_abs_contribute(df,base_col,cont_col,X,base_y,cont_y).sort_values(by='contribute',ascending=False)
df_result['con2'] = df_result['contribute']*df_result['contribute']
## 处理掉逗号
df_result['dim_name'] =df_result['dim_name'].str.replace(',', '&')
def tuple_to_string(t):
return '&'.join(t)
df_result.loc[df_result['dim']==2,'dim_value'] =df_result['dim_value'].apply(tuple_to_string)
df_result['dim_value'] =df_result['dim_value'].str.replace(',', '-')
## 计算简易基尼系数
df_dimension = df_result.groupby(['dim_name'],as_index=False)['con2'].sum()
df_dimension['gini'] = 1-df_dimension['con2']
df_dimension = df_dimension[['dim_name','gini']]
df_merge = pd.merge(df_result, df_dimension, on='dim_name', how='left')
df_merge = df_merge[['dim','dim_name','dim_value','all_base','base','all_change','change','contribute','var','gini']]
df_merge.head(10)
df_result=df_merge
# 处理掉逗号
df_result['dim_name'] =df_result['dim_name'].str.replace(',', '&')
df_result = df_result.reset_index(drop=True)
print(df_result.head())
## 保存成csv 文件
filenamecsv = "campus_successorder_change_result_dpt_{0}.csv".format(today)
df_result.to_csv(filenamecsv,header=None,index= None,encoding = 'utf-8')
## 上传数据库
def uploaddb(**kwargs):
print('HDFS')
#连接数据库
try:
client = InsecureClient(url="http://nn2.ns12.vip.bigdata.ly:15821",user="traffic_train")
print('r1',client.list('/'))
except HdfsError as e:
if "Operation category READ is not supported in state standby" in str(e):
client = InsecureClient(url="http://nn1.ns12.vip.bigdata.ly:15821",user="traffic_train")
print('r2',client.list('/'))
else:
print("其他 HdfsError 异常:", e)
#分区表和非分区表
if kwargs['partition'] :
#删除当日分区中所有数据
client.delete('/ns-traffic/ads/train/{0}/pdate={1}/'.format(kwargs['hivetable'],kwargs['pdate']),True)
print('删除分区','/ns-traffic/ads/train/{0}/pdate={1}/'.format(kwargs['hivetable'],kwargs['pdate']))
#上传新数据
client.upload('/ns-traffic/ads/train/{0}/pdate={1}/{2}'.format(kwargs['hivetable'],kwargs['pdate'],kwargs['file_name'].split('.')[0]),kwargs['file_name'])
print('上传表','/ns-traffic/ads/train/{0}/pdate={1}/{2}'.format(kwargs['hivetable'],kwargs['pdate'],kwargs['file_name'].split('.')[0]))
print('上传完成')
else :
client.delete('/ns-traffic/ads/train/{0}/'.format(kwargs['hivetable']),True)
print('删除分区','/ns-traffic/ads/train/{0}/'.format(kwargs['hivetable']))
client.upload('/ns-traffic/ads/train/{0}/{1}'.format(kwargs['hivetable'],kwargs['file_name'].split('.')[0]),kwargs['file_name'])
print('上传表','/ns-traffic/ads/train/{0}/{1}'.format(kwargs['hivetable'],kwargs['file_name'].split('.')[0]))
print('上传完成')
## 上传数据库
uploaddb(hivetable='campus_successorder_change_result_dpt',pdate=today,file_name=filenamecsv,partition=1)
关注公众号「水沐教育科技」,在手机上阅读所有教程,随时随地都能学习。内含一款搜索神器,免费下载全网书籍和视频。

微信扫码关注公众号