# 使用機器學習產生的預測性模型來判斷傾向分數

- 主題：Queries（檢視有關此主題的更多資訊）
- 建立對象：Developer、User

## 快速入門

```text
# Python 3.6.7
psycopg2
sklearn
pandas
matplotlib
numpy
tqdm
```

## 為機器學習準備資料

``````#define the target label for prediction
df['target'] = (df['analytic_action'] == 'productPurchase').astype(int)
#remove columns that are dependent on the label
df.drop(['analytic_action','purchase_value'],axis=1,inplace=True)
``````

1. 將代表數字的欄位明確轉換為數值型別。
2. 將分類欄位轉換為數值欄位（使用 one-hot 編碼）。
``````#convert columns that represent numbers
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')
``````

``````from sklearn.preprocessing import OneHotEncoder

#get the categorical columns
cat_columns = list(set(df.columns) - set(num_cols + ['target']))

#get the dataframe with categorical columns only
df_cat = df.loc[:,cat_columns]

#initialize sklearn's OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

#fit the data into the encoder
enc.fit(df_cat)

#define OneHotEncoder's columns names
ohc_columns = [[c+'='+c_ for c_ in cat] for c,cat in zip(cat_columns,enc.categories_)]
ohc_columns = [item for sublist in ohc_columns for item in sublist]

#finalize the data input to the ML models
X = pd.DataFrame( np.concatenate((enc.transform(df_cat).toarray(),df[num_cols]),axis=1),
columns =  ohc_columns + num_cols)

#define target column
y = df['target']
``````

``````from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

clf = LogisticRegression(max_iter=2000, random_state=0).fit(X_train, y_train)

print("Test data accuracy: {}".format(clf.score(X_test, y_test)))
``````

``````from matplotlib import pyplot as plt

#get feature importance as a sorted list of columns
feature_importance = np.argsort(-clf.coef_[0])
top_10_features_purchase_names = X.columns[feature_importance[:10]]
top_10_features_purchase_values = clf.coef_[0][feature_importance[:10]]
top_10_features_not_purchase_names = X.columns[feature_importance[-10:]]
top_10_features_not_purchase_values = clf.coef_[0][feature_importance[-10:]]

#plot the figures
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(10,5))

ax1.bar(np.arange(10),top_10_features_purchase_values)
ax1.set_xticks(np.arange(10))
ax1.set_xticklabels(top_10_features_purchase_names,rotation = 90)
ax1.set_ylim([np.min(clf.coef_[0])-0.1,np.max(clf.coef_[0])+0.1])
ax1.set_title("Top 10 features to define \n a propensity to purchase")

ax2.bar(np.arange(10),top_10_features_not_purchase_values, color='#E15750')
ax2.set_xticks(np.arange(10))
ax2.set_xticklabels(top_10_features_not_purchase_names,rotation = 90)
ax2.set_ylim([np.min(clf.coef_[0])-0.1,np.max(clf.coef_[0])+0.1])
ax2.set_title("Top 10 features to define \n a propensity to NOT purchase")

plt.show()
``````

## 使用查詢服務來套用經過訓練的模型

- 首先，必須以 SQL 取代 Logistic Regression 中用於計算預測標籤機率的部分。Logistic Regression 訓練出的迴歸模型為 `y = wX + c`，其中權重 `w` 與截距 `c` 是模型的輸出。可利用 SQL 函數將特徵值乘上權重並加總，以取得機率。

- 其次，在 Python 中以 one-hot 編碼完成的特徵工程也必須轉換為 SQL。例如，原始資料庫中的 `geo_county` 欄位儲存縣/市名稱，經編碼後會轉換為 `geo_county=Bexar`、`geo_county=Dallas`、`geo_county=DeKalb` 等欄位。下列 SQL 陳述式會執行相同的轉換，其中 `w1`、`w2` 與 `w3` 可替換為模型學習到的權重：

```sql
-- One-hot features: emit the learned weight when the row's county matches,
-- otherwise 0. w1..w3 are the learned weights for each category.
SELECT  CASE WHEN geo_county = 'Bexar'  THEN FLOAT(w1) ELSE 0 END AS f1,
        CASE WHEN geo_county = 'Dallas' THEN FLOAT(w2) ELSE 0 END AS f2,
        CASE WHEN geo_county = 'DeKalb' THEN FLOAT(w3) ELSE 0 END AS f3,
```

```sql
-- Numeric feature: multiply the column value by its learned weight w4.
SELECT FLOAT(purchase_num) * FLOAT(w4) AS f4,
```

```sql
-- Sigmoid of the weighted sum plus intercept; threshold at 0.5 for a 0/1 label.
SELECT CASE WHEN 1 / (1 + EXP(- (f1 + f2 + f3 + f4 + FLOAT(intercept)))) > 0.5 THEN 1 ELSE 0 END AS Prediction;
```

### 端對端範例

```text
y = 0.1 * "c1=category 1" + 0.2 * "c1=category 2" + 0.3 * c2 + 0.4
```

SQL中的對等項如下：

```sql
SELECT
    -- Sigmoid of the weighted features plus the intercept (0.4), thresholded at 0.5.
    CASE WHEN 1 / (1 + EXP(- (f1 + f2 + f3 + FLOAT(0.4)))) > 0.5 THEN 1 ELSE 0 END AS Prediction
FROM
(
    SELECT
        -- Category values must match the model's labels exactly ("category 1" / "category 2").
        CASE WHEN c1 = 'category 1' THEN FLOAT(0.1) ELSE 0 END AS f1,
        CASE WHEN c1 = 'category 2' THEN FLOAT(0.2) ELSE 0 END AS f2,
        FLOAT(c2) * FLOAT(0.3) AS f3
    FROM TABLE
)
```

``````def generate_lr_inference_sql(ohc_columns, num_cols, clf, db):
features_sql = []
category_sql_text = "case when {col} = '{val}' then float({coef}) else 0 end as f{name}"
numerical_sql_text = "float({col}) * float({coef}) as f{name}"
for i, (column, coef) in enumerate(zip(ohc_columns+num_cols, clf.coef_[0])):
if i < len(ohc_columns):
col,val = column.split('=')
val = val.replace("'","%''%")
sql = category_sql_text.format(col=col,val=val,coef=coef,name=i+1)
else:
sql = numerical_sql_text.format(col=column,coef=coef,name=i+1)
features_sql.append(sql)
features_sum = '+'.join(['f{}'.format(i) for i in range(1,len(features_sql)+1)])
final_sql = '''
select case when 1/(1 + EXP(-({features} + float({intercept})))) > 0.5 then 1 else 0 end as Prediction
from
(select {cols}
from {db})
'''.format(features=features_sum,cols=",".join(features_sql),intercept=clf.intercept_[0],db=db)
return final_sql
``````

``````sql = generate_lr_inference_sql(ohc_columns, num_cols, clf, "fdu_luma_raw")
cur.execute(sql)
samples = [r for r in cur]
colnames = [desc[0] for desc in cur.description]
pd.DataFrame(samples,columns=colnames)
``````

## 使用抽樣資料：啟動程式

``````def end_to_end_pipeline(df):

#define the target label for prediction
df['target'] = (df['analytic_action'] == 'productPurchase').astype(int)
#remove columns that are dependent on the label
df.drop(['analytic_action','purchase_value'],axis=1,inplace=True)

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

#get the categorical columns
cat_columns = list(set(df.columns) - set(num_cols + ['target']))

#get the dataframe with categorical columns only
df_cat = df.loc[:,cat_columns]

#initialize sklearn's One Hot Encoder
enc = OneHotEncoder(handle_unknown='ignore')

#fit the data into the encoder
enc.fit(df_cat)

#define one hot encoder's columns names
ohc_columns = [[c+'='+c_ for c_ in cat] for c,cat in zip(cat_columns,enc.categories_)]
ohc_columns = [item for sublist in ohc_columns for item in sublist]

#finalize the data input to the ML models
X = pd.DataFrame( np.concatenate((enc.transform(df_cat).toarray(),df[num_cols]),axis=1),
columns =  ohc_columns + num_cols)

#define target column
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

clf = LogisticRegression(max_iter=2000,random_state=0).fit(X_train, y_train)

return clf.score(X_test, y_test)
``````

``````from tqdm import tqdm

bootstrap_accuracy = []
for i in tqdm(range(100)):

#sample data from QS
cur.execute('''SELECT *
FROM fdu_luma_raw
ORDER BY random()
LIMIT 1000
''')
samples = [r for r in cur]
colnames = [desc[0] for desc in cur.description]
df_samples = pd.DataFrame(samples,columns=colnames)
df_samples.fillna(0,inplace=True)

#train the propensity model with sampled data and output its accuracy
bootstrap_accuracy.append(end_to_end_pipeline(df_samples))

bootstrap_accuracy = np.sort(bootstrap_accuracy)
``````