Random Forest with Embeddings
So far we have built both a random forest and a neural network for tabular modeling. One interesting property of the neural network is that it learns embeddings for the categorical variables. Why not feed those embeddings into the random forest as features? Will it improve the forest? Let's find out!
from fastai.tabular.all import * #also brings in pd, np, Path, torch, tensor, etc.
from sklearn.ensemble import RandomForestRegressor
import math, zipfile

z = zipfile.ZipFile('bluebook-for-bulldozers.zip') #unzip the Kaggle download first
z.extractall() #extract
df_nn = pd.read_csv(Path()/'TrainAndValid.csv', low_memory=False) #load the data
#Set the ordinal variable's levels in a meaningful order
sizes = 'Large','Large / Medium','Medium','Small','Mini','Compact'
df_nn['ProductSize'] = df_nn['ProductSize'].astype('category')
df_nn['ProductSize'] = df_nn['ProductSize'].cat.set_categories(sizes, ordered=True) #inplace=True is deprecated in recent pandas
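We can quickly confirm the ordering took effect:
df_nn['ProductSize'].cat.categories #returns the levels in the order we specified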
dep_var = 'SalePrice'
df_nn[dep_var] = np.log(df_nn[dep_var]) #Kaggle scores this competition on RMSLE, so modeling log(price) lets us optimize plain RMSE
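A quick aside to see why the log helps: the competition's RMSLE is just RMSE computed on log prices (up to the +1 sometimes added inside the log, negligible at these price scales). A toy check with made-up numbers:
p, t = np.array([10_000., 20_000.]), np.array([12_000., 18_000.]) #hypothetical predictions and targets
np.sqrt(((np.log(p) - np.log(t))**2).mean()) #RMSE in log space ~= RMSLE on the raw prices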
df_nn = add_datepart(df_nn, 'saledate') #feature engineering: expand the sale date into many columns
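add_datepart drops the raw saledate column and replaces it with engineered columns (year, month, day of week, elapsed time, and so on). A quick way to see everything it created:
' '.join(c for c in df_nn.columns if c.startswith('sale')) #all the new sale* columns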
cont_nn,cat_nn = cont_cat_split(df_nn, max_card=9000, dep_var=dep_var) #any column with more than 9000 levels is treated as continuous
cont_nn
Notice that saleElapsed is missing from cont_nn. We need to add it, since we want this column treated as continuous (and remove it from the categorical list).
cont_nn.append('saleElapsed')
cont_nn
cat_nn.remove('saleElapsed')
df_nn.dtypes['saleElapsed'] #it's stored as object, which would cause an error later, so we cast it to int
df_nn['saleElapsed'] = df_nn['saleElapsed'].astype('int')
cond = (df_nn.saleYear<2011) | (df_nn.saleMonth<10)
train_idx = np.where(cond)[0] #train on sales before October 2011
valid_idx = np.where(~cond)[0] #validate on the most recent sales, mimicking Kaggle's future test set
splits = (list(train_idx),list(valid_idx))
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(df_nn, procs_nn, cat_nn, cont_nn, splits=splits, y_names=dep_var)
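As a reminder of what the procs do: Categorify maps each categorical level to an integer code, FillMissing fills missing continuous values with the median (adding an _na indicator column), and Normalize standardizes the continuous columns. For example, we can peek at the vocabulary Categorify learned for ProductSize via the classes attribute:
to_nn.classes['ProductSize'] #category-to-code mapping, with #na# at position 0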
dls = to_nn.dataloaders(1024) #mini-batches of 1024 rows
y = to_nn.train.y
y.min(),y.max() #check the target's range so we can pick y_range for the sigmoid-scaled output
learn = tabular_learner(dls, y_range=(8,12), layers=[500,250],
                        n_out=1, loss_func=F.mse_loss)
learn.lr_find() #find best lr
learn.fit_one_cycle(5, 1e-2) #train
def r_mse(pred,y): return round(math.sqrt(((pred-y)**2).mean()), 6)
def m_rmse(m, xs, y): return r_mse(m.predict(xs), y)
preds,targs = learn.get_preds()
r_mse(preds,targs)
This is a very good result. Now let's train a plain random forest on the same data for comparison.
procs = [Categorify, FillMissing] #no Normalize: tree splits don't care about the scale of the inputs
rf_to = TabularPandas(df_nn, procs, cat_nn, cont_nn, y_names=dep_var, splits=splits)
xs,y = rf_to.train.xs,rf_to.train.y
valid_xs,valid_y = rf_to.valid.xs,rf_to.valid.y
def rf(xs, y, n_estimators=40, max_samples=200_000, max_features=0.5, min_samples_leaf=5, **kwargs):
    #40 trees, each grown on at most 200k bootstrap rows, considering half the features at each split
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators,
                                 max_samples=max_samples, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
m = rf(xs, y) #Fitting
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)
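As an aside, since we fit with oob_score=True we can also score the forest on its out-of-bag predictions, an almost-free validation estimate computed from the training data alone:
r_mse(m.oob_prediction_, y) #each row is predicted only by trees that never saw it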
So our random forest performed worse than the NN. Let's try to improve it by adding the NN embeddings!
learn.model.embeds[:5] #the first few embedding layers inside the NN, one per categorical variable
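To make concrete what an embedding layer is, here is a standalone toy example (the sizes are made up, not taken from our model): it's just a learned lookup table from a category code to a vector.
toy_emb = nn.Embedding(num_embeddings=7, embedding_dim=4) #7 levels -> 4-dim vectors
toy_emb(tensor([0, 3])) #rows 0 and 3 of the learned lookup table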
The function below swaps each categorical column for the activations of its corresponding embedding layer, so the random forest sees the learned representation instead of raw category codes.
def embed_features(learner, xs):
    xs = xs.copy()
    for i, feature in enumerate(learner.dls.cat_names): #one embedding layer per categorical column
        emb = learner.model.embeds[i].cpu()
        with torch.no_grad(): #detach from autograd so the activations can go into a DataFrame
            vals = emb(tensor(xs[feature], dtype=torch.int64))
        new_feat = pd.DataFrame(vals, index=xs.index,
                                columns=[f'{feature}_{j}' for j in range(emb.embedding_dim)])
        xs.drop(columns=feature, inplace=True) #replace the raw codes with the embedding columns
        xs = xs.join(new_feat)
    return xs
embedded_xs = embed_features(learn, learn.dls.train.xs)
xs_valid = embed_features(learn, learn.dls.valid.xs)
embedded_xs.shape, xs_valid.shape
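The embedded frame is much wider, since each categorical column became embedding_dim new columns. As a sanity check, the number of continuous columns plus the summed embedding widths should equal the new column count:
n_emb = sum(e.embedding_dim for e in learn.model.embeds) #total width of all embeddings
len(learn.dls.cont_names) + n_emb #should equal embedded_xs.shape[1]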
m = rf(embedded_xs, y) #fit the same forest on the embedded features
m_rmse(m, embedded_xs, y), m_rmse(m, xs_valid, valid_y)
It seems that adding the NN embeddings improves the random forest!
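One follow-up worth exploring: the forest's feature importances are now spread across individual embedding dimensions (ProductSize_0, ProductSize_1, ...), so to interpret them per original column we can sum the pieces back together. A rough sketch, assuming the column naming used by embed_features above:
fi = pd.Series(m.feature_importances_, index=embedded_xs.columns)
orig_col = lambda c: c.rsplit('_', 1)[0] if c.rsplit('_', 1)[-1].isdigit() else c #strip the _j suffix
fi.groupby(orig_col).sum().sort_values(ascending=False).head(10) #importance per original column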