# 使用机器学习生成的预测模型确定倾向分数

## 快速入门

``````python=3.6.7
psycopg2
sklearn
pandas
matplotlib
numpy
tqdm
``````

## 为机器学习准备数据 prepare-data-for-machine-learning

``````#define the target label for prediction
df['target'] = (df['analytic_action'] == 'productPurchase').astype(int)
#remove columns that are dependent on the label
df.drop(['analytic_action','purchase_value'],axis=1,inplace=True)
``````

1. 将表示数字的列转换为数字列。 为此，请显式转换`dataframe`中的数据类型。
2. 还将分类列转换为数字列。
``````#convert columns that represent numbers
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')
``````

``````from sklearn.preprocessing import OneHotEncoder

#get the categorical columns
cat_columns = list(set(df.columns) - set(num_cols + ['target']))

#get the dataframe with categorical columns only
df_cat = df.loc[:,cat_columns]

#initialize sklearn's OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

#fit the data into the encoder
enc.fit(df_cat)

#define OneHotEncoder's columns names
ohc_columns = [[c+'='+c_ for c_ in cat] for c,cat in zip(cat_columns,enc.categories_)]
ohc_columns = [item for sublist in ohc_columns for item in sublist]

#finalize the data input to the ML models
X = pd.DataFrame( np.concatenate((enc.transform(df_cat).toarray(),df[num_cols]),axis=1),
columns =  ohc_columns + num_cols)

#define target column
y = df['target']
``````

``````from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

clf = LogisticRegression(max_iter=2000, random_state=0).fit(X_train, y_train)

print("Test data accuracy: {}".format(clf.score(X_test, y_test)))
``````

``````from matplotlib import pyplot as plt

#get feature importance as a sorted list of columns
feature_importance = np.argsort(-clf.coef_[0])
top_10_features_purchase_names = X.columns[feature_importance[:10]]
top_10_features_purchase_values = clf.coef_[0][feature_importance[:10]]
top_10_features_not_purchase_names = X.columns[feature_importance[-10:]]
top_10_features_not_purchase_values = clf.coef_[0][feature_importance[-10:]]

#plot the figures
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(10,5))

ax1.bar(np.arange(10),top_10_features_purchase_values)
ax1.set_xticks(np.arange(10))
ax1.set_xticklabels(top_10_features_purchase_names,rotation = 90)
ax1.set_ylim([np.min(clf.coef_[0])-0.1,np.max(clf.coef_[0])+0.1])
ax1.set_title("Top 10 features to define \n a propensity to purchase")

ax2.bar(np.arange(10),top_10_features_not_purchase_values, color='#E15750')
ax2.set_xticks(np.arange(10))
ax2.set_xticklabels(top_10_features_not_purchase_names,rotation = 90)
ax2.set_ylim([np.min(clf.coef_[0])-0.1,np.max(clf.coef_[0])+0.1])
ax2.set_title("Top 10 features to define \n a propensity to NOT purchase")

plt.show()
``````

## 使用查询服务应用训练后的模型 use-query-service-to-apply-trained-model

• 首先，SQL必须取代Logistics Regression模块才能获得预测标签的概率。 物流回归创建的模型生成了回归模型`y = wX + c`，其中权重`w`和截距`c`是模型的输出。 SQL特征可用于将权重相乘以获得概率。

• 其次，在Python中用一种热编码实现的工程过程也必须合并到SQL中。 例如，在原始数据库中，我们有`geo_county`列用于存储县，但该列已转换为`geo_county=Bexar``geo_county=Dallas``geo_county=DeKalb`。 以下SQL语句执行相同的转换，其中`w1``w2``w3`可以用从Python中的模型学习到的权重替代：

``````SELECT  CASE WHEN geo_state = 'Bexar' THEN FLOAT(w1) ELSE 0 END AS f1,
CASE WHEN geo_state = 'Dallas' THEN FLOAT(w2) ELSE 0 END AS f2,
CASE WHEN geo_state = 'Bexar' THEN FLOAT(w3) ELSE 0 END AS f3,
``````

``````SELECT FLOAT(purchase_num) * FLOAT(w4) AS f4,
``````

``````SELECT CASE WHEN 1 / (1 + EXP(- (f1 + f2 + f3 + f4 + FLOAT(intercept)))) > 0.5 THEN 1 ELSE 0 END AS Prediction;
``````

### 端到端示例

``````y = 0.1 * "c1=category 1"+ 0.2 * "c1=category 2" +0.3 * c2+0.4
``````

SQL中的等效项如下：

``````SELECT
CASE WHEN 1 / (1 + EXP(- (f1 + f2 + f3 + FLOAT(0.4)))) > 0.5 THEN 1 ELSE 0 END AS Prediction
FROM
(
SELECT
CASE WHEN c1 = 'Cateogry 1' THEN FLOAT(0.1) ELSE 0 END AS f1,
CASE WHEN c1 = 'Cateogry 2' THEN FLOAT(0.2) ELSE 0 END AS f2,
FLOAT(c2) * FLOAT(0.3) AS f3
FROM TABLE
)
``````

``````def generate_lr_inference_sql(ohc_columns, num_cols, clf, db):
features_sql = []
category_sql_text = "case when {col} = '{val}' then float({coef}) else 0 end as f{name}"
numerical_sql_text = "float({col}) * float({coef}) as f{name}"
for i, (column, coef) in enumerate(zip(ohc_columns+num_cols, clf.coef_[0])):
if i < len(ohc_columns):
col,val = column.split('=')
val = val.replace("'","%''%")
sql = category_sql_text.format(col=col,val=val,coef=coef,name=i+1)
else:
sql = numerical_sql_text.format(col=column,coef=coef,name=i+1)
features_sql.append(sql)
features_sum = '+'.join(['f{}'.format(i) for i in range(1,len(features_sql)+1)])
final_sql = '''
select case when 1/(1 + EXP(-({features} + float({intercept})))) > 0.5 then 1 else 0 end as Prediction
from
(select {cols}
from {db})
'''.format(features=features_sum,cols=",".join(features_sql),intercept=clf.intercept_[0],db=db)
return final_sql
``````

``````sql = generate_lr_inference_sql(ohc_columns, num_cols, clf, "fdu_luma_raw")
cur.execute(sql)
samples = [r for r in cur]
colnames = [desc[0] for desc in cur.description]
pd.DataFrame(samples,columns=colnames)
``````

## 处理采样数据：引导 working-on-sampled-data

``````def end_to_end_pipeline(df):

#define the target label for prediction
df['target'] = (df['analytic_action'] == 'productPurchase').astype(int)
#remove columns that are dependent on the label
df.drop(['analytic_action','purchase_value'],axis=1,inplace=True)

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

#get the categorical columns
cat_columns = list(set(df.columns) - set(num_cols + ['target']))

#get the dataframe with categorical columns only
df_cat = df.loc[:,cat_columns]

#initialize sklearn's One Hot Encoder
enc = OneHotEncoder(handle_unknown='ignore')

#fit the data into the encoder
enc.fit(df_cat)

#define one hot encoder's columns names
ohc_columns = [[c+'='+c_ for c_ in cat] for c,cat in zip(cat_columns,enc.categories_)]
ohc_columns = [item for sublist in ohc_columns for item in sublist]

#finalize the data input to the ML models
X = pd.DataFrame( np.concatenate((enc.transform(df_cat).toarray(),df[num_cols]),axis=1),
columns =  ohc_columns + num_cols)

#define target column
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

clf = LogisticRegression(max_iter=2000,random_state=0).fit(X_train, y_train)

return clf.score(X_test, y_test)
``````

``````from tqdm import tqdm

bootstrap_accuracy = []
for i in tqdm(range(100)):

#sample data from QS
cur.execute('''SELECT *
FROM fdu_luma_raw
ORDER BY random()
LIMIT 1000
''')
samples = [r for r in cur]
colnames = [desc[0] for desc in cur.description]
df_samples = pd.DataFrame(samples,columns=colnames)
df_samples.fillna(0,inplace=True)

#train the propensity model with sampled data and output its accuracy
bootstrap_accuracy.append(end_to_end_pipeline(df_samples))

bootstrap_accuracy = np.sort(bootstrap_accuracy)
``````

recommendation-more-help
ccf2b369-4031-483f-af63-a93b5ae5e3fb