Applying fastai's tabular approach to the Lesson 8 (MovieLens) collaborative-filtering data
from fastai.collab import *
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
# Download the MovieLens 100k dataset (cached by fastai) and get its local path.
path = untar_data(URLs.ML_100k)
# u.data is tab-separated with no header row; assign the four column names explicitly.
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
names=['user','movie','rating','timestamp'])
ratings.head()
# Pull column names from the frame so later code is not hard-coded to strings.
user_name = ratings.columns[0] #user col
item_name = ratings.columns[1] #movie col
rating_name = ratings.columns[2] #label (rating col)
cat_names = [user_name,item_name] #category col
# Random train/validation split over the row indices of `ratings`.
splits = RandomSplitter()(range_of(ratings)) #split
# Preprocessing for the neural-net version: categorify ids, fill missing, normalize.
procs = [Categorify, FillMissing, Normalize]
# TabularCollab treats the (user, movie) pair as categorical features and the
# rating as a plain continuous target (TransformBlock = no extra target transform).
to = TabularCollab(ratings, procs, cat_names, y_names=[rating_name], y_block=TransformBlock(), splits=splits, reduce_memory=False)
dls = to.dataloaders()
dls.show_batch()
# Neural-net model over the tabular data. y_range slightly exceeds the max rating
# (5) because sigmoid output never quite reaches its upper bound; n_out=1 for a
# single regression target, trained with flattened MSE loss.
learn = tabular_learner(dls, y_range=(0,5.5), layers=[500,250],
n_out=1, loss_func=MSELossFlat())
learn.lr_find() #find best lr
# 4 epochs of one-cycle training with weight decay for regularization.
learn.fit_one_cycle(4, 1e-3, wd=.01)
If you compare these results with Lesson 8, you will notice they are similar.
def r_mse(pred,y): return round(math.sqrt(((pred-y)**2).mean()), 6)
def m_rmse(m, xs, y): return r_mse(m.predict(xs), y)
The function below creates our random forest and fits it.
def rf(xs, y, n_estimators=40, max_samples=80000, max_features=0.5, min_samples_leaf=5, **kwargs):
    """Create and fit a RandomForestRegressor on (xs, y).

    Parameters mirror the common sklearn knobs; any extra keyword arguments
    (e.g. random_state, max_depth) are forwarded to RandomForestRegressor.

    Bug fix: the original accepted **kwargs but never passed them on, so
    caller-supplied overrides were silently ignored.
    """
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, oob_score=True, **kwargs).fit(xs, y)
# For the random forest, skip Normalize — tree models are insensitive to
# feature scaling, so only Categorify and FillMissing are needed.
procs = [Categorify, FillMissing]
# Reuse the same `splits` as the neural net so both models see identical folds.
to = TabularCollab(ratings, procs, cat_names, y_names=[rating_name], y_block=TransformBlock(), splits=splits, reduce_memory=False)
# Processed feature frames and targets for train/validation.
xs,y = to.train.xs, to.train.y
valid_xs,valid_y = to.valid.xs, to.valid.y
m = rf(xs, y) #Fitting
# Report train and validation RMSE side by side to gauge overfitting.
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)