Lesson 8 - FastAI
from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
names=['user','movie','rating','timestamp'])
ratings.head()
#Sci-fi, action, old
last_skywalker = np.array([0.98,0.9,-0.9])
user1 = np.array([0.9,0.8,-0.6])
(user1*last_skywalker).sum()
A positive value suggests the user will probably like it
casablanca = np.array([-0.99,-0.3,0.8])
(user1*casablanca).sum()
A negative value suggests the user probably won't like it
movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1',
usecols=(0,1), names=('movie','title'), header=None)
movies.head()
ratings = ratings.merge(movies)
ratings.head()
dls = CollabDataLoaders.from_df(ratings, user_name = 'user', item_name='title', bs=64) #must pass the correct columns
dls.show_batch()
dls.classes #We have the user and title classes
len(dls.classes['user']), len(dls.classes['title'])
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5 #Number of latent factors
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)
one_hot_3 = one_hot(3, n_users).float()
one_hot_3[:10]
user_factors[3] #latent factor values stored at this index
user_factors.t() @ one_hot_3
Notice we get the same values: indexing into the factor matrix is equivalent to multiplying by a one-hot-encoded vector, just far cheaper
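As a quick sanity check (my own addition, not in the original notebook), we can confirm the two results match:
torch.allclose(user_factors.t() @ one_hot_3, user_factors[3]) #expected: True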
class DotProduct(Module): #extends fastai's Module class
    def __init__(self, n_users, n_movies, n_factors):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
    def forward(self, x): #called automatically whenever the module is applied to a batch
        users = self.user_factors(x[:,0]) #user IDs
        movies = self.movie_factors(x[:,1]) #movie IDs
        return (users * movies).sum(dim=1) #dim=0 is the minibatch, so we sum over dim=1 (the factors)
x,y = dls.one_batch()
x.shape
x[:3] #user ID, movie ID
y[:3] #ratings
model = DotProduct(n_users, n_movies, 50) #our model
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)
Not bad, but we can do better!
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return sigmoid_range((users * movies).sum(dim=1), *self.y_range)
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)
Didn't really improve, but that's OK
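As an aside, sigmoid_range just squashes the raw activations into the requested rating range; it behaves roughly like this sketch (written from memory, not copied from the fastai source):
def sigmoid_range_sketch(x, lo, hi):
    #sigmoid maps x into (0,1); scale and shift that into (lo,hi)
    return torch.sigmoid(x) * (hi - lo) + lo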
class DotProductBias(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        res = (users * movies).sum(dim=1, keepdim=True)
        #add the bias terms
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
        return sigmoid_range(res, *self.y_range)
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)
Loss not improving
It seems like our loss is not improving no matter what changes we make. But look again: the model does better during the earlier epochs (2 or 3) and then gets worse, which means it is overfitting. So how can we train for more epochs without overfitting? This is where weight decay (also called weight regularization) comes in.
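In rough pseudocode (following the book's formulation; loss, wd, and parameters here are stand-ins, not variables defined in this notebook):
#Weight decay adds the sum of the squared weights to the loss...
loss_with_wd = loss + wd * (parameters**2).sum()
#...which, as far as the gradients are concerned, is the same as:
parameters.grad += wd * 2 * parameters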
x = np.linspace(-2,2,100)
a_s = [1,2,5,10,50]
ys = [a * x**2 for a in a_s]
_,ax = plt.subplots(figsize=(8,6))
for a,y in zip(a_s,ys): ax.plot(x,y, label=f'a={a}')
ax.set_ylim([0,5])
ax.legend();
The plot above illustrates the idea behind weight decay: the larger the coefficient a, the sharper and narrower the parabola. Penalizing large weights therefore pushes the model toward smoother functions, which tend to generalize better.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1) #Pass wd
Nice, our validation loss dropped to about 0.82! Also notice that train_loss increased; this is because wd is preventing the model from overfitting
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(5, 5e-3, wd=0.1)
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]
class T(Module):
    def __init__(self): self.a = torch.ones(3)

L(T().parameters()) #calling the parameters method from the Module class; a plain tensor is not picked up

class T(Module):
    def __init__(self): self.a = nn.Parameter(torch.ones(3)) #must wrap it with nn.Parameter()

L(T().parameters())

class T(Module):
    def __init__(self): self.a = nn.Linear(1, 3, bias=False) #nn.Linear already wraps its weights in nn.Parameter

t = T()
L(t.parameters())
def create_params(size):
    return nn.Parameter(torch.zeros(*size).normal_(0, 0.01))
This is all we need to create our own embedding
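A quick, hypothetical check (not from the original notebook) that the result is registered as a trainable parameter:
p = create_params([5, 3])
type(p), p.shape #expected: nn.Parameter with shape torch.Size([5, 3])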
class DotProductBias(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors[x[:,0]]
        movies = self.movie_factors[x[:,1]]
        res = (users*movies).sum(dim=1)
        res += self.user_bias[x[:,0]] + self.movie_bias[x[:,1]]
        return sigmoid_range(res, *self.y_range)
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)
Notice the similar performance
movie_bias = learn.model.movie_bias.squeeze() #Grab movie by bias
idxs = movie_bias.argsort()[:5] #Sort by least bias
[dls.classes['title'][i] for i in idxs]
Movies with the lowest bias
idxs = movie_bias.argsort(descending=True)[:5] #Sort by most bias
[dls.classes['title'][i] for i in idxs]
Movies with the highest bias
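For comparison (my own aside, not in the original notebook), we could rank movies by their plain average rating; the bias ranking differs because it also accounts for the latent factors:
ratings.groupby('title')['rating'].mean().sort_values(ascending=False).head() #average rating per movie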
g = ratings.groupby('title')['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_idxs = tensor([learn.dls.classes['title'].o2i[m] for m in top_movies])
movie_w = learn.model.movie_factors[top_idxs].cpu().detach()
movie_pca = movie_w.pca(3)
fac0,fac1,fac2 = movie_pca.t()
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(12,12))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()
Notice that similar movies have been clumped together
movie_factors = learn.model.movie_factors
idx = dls.classes['title'].o2i['Forrest Gump (1994)']
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]
embs = get_emb_sz(dls)
embs
class CollabNN(Module):
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1))
        self.y_range = y_range
    def forward(self, x):
        embs = self.user_factors(x[:,0]), self.item_factors(x[:,1])
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range(x, *self.y_range)
model = CollabNN(*embs)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
learn.fit_one_cycle(5, 5e-3, wd=0.1)
Notice the similar results
type(learn.model)
@delegates(TabularModel)
class EmbeddingNN(TabularModel):
    def __init__(self, emb_szs, layers, **kwargs):
        super().__init__(emb_szs, layers=layers, n_cont=0, out_sz=1, **kwargs)
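Since EmbeddingNN simply delegates to TabularModel, we could also build and train one by hand (a sketch assuming the embs sizes computed earlier; collab_learner normally does this for us when use_nn=True):
model = EmbeddingNN(emb_szs=embs, layers=[100,50], y_range=(0,5.5))
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)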
- What problem does collaborative filtering solve?
It obtains the latent factors needed to provide a good recommendation.
- How does it solve it?
It learns the latent factors via gradient descent, which clumps similar kinds of factors together.
- Why might a collaborative filtering predictive model fail to be a very useful recommendation system?
If there is a lack of data from users, it cannot provide useful recommendations.
- What does a crosstab representation of collaborative filtering data look like?
A crosstab is where the columns are users, the rows are items, and the values are filled in with each user's rating of each item.
- Write the code to create a crosstab representation of the MovieLens data (you might need to do some web searching!).
One possible answer is sketched after this questionnaire.
- What is a latent factor? Why is it "latent"?
Latent factors are the factors used to determine a prediction. They are "latent" because they are learned by the model, not given to it.
- What is a dot product? Calculate a dot product manually using pure Python with lists.
The dot product is the sum of the products of the corresponding elements:
a = [1,2,3]
b = [1,2,3]
sum(i[0]*i[1] for i in zip(a,b))
- What does pandas.DataFrame.merge do?
It merges two DataFrames together.
- What is an embedding matrix?
It is the matrix of latent factors that a one-hot-encoded input gets multiplied by (in practice we simply index into it).
- What is the relationship between an embedding and a matrix of one-hot-encoded vectors?
Indexing into an embedding gives the same result as multiplying a matrix of one-hot-encoded vectors by the embedding matrix, but it is more computationally efficient.
- Why do we need Embedding if we could use one-hot-encoded vectors for the same thing?
It is more computationally efficient and faster.
- What does an embedding contain before we start training (assuming we're not using a pretrained model)?
It is randomly initialized.
- Create a class (without peeking, if possible!) and use it.
class Name:
    def __init__(self): pass
    def func_name(self): pass
Name().func_name()
- What does x[:,0] return?
The user IDs.
- Rewrite the DotProduct class (without peeking, if possible!) and train a model with it.
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return sigmoid_range((users * movies).sum(dim=1), *self.y_range)
- What is a good loss function to use for MovieLens? Why?
Mean squared error loss. It measures how far a prediction is from the label.
- What would happen if we used cross-entropy loss with MovieLens? How would we need to change the model?
We would need the model to output one prediction per possible rating; only then could we pass its output to cross-entropy. A rough sketch appears after this questionnaire.
- What is the use of bias in a dot product model?
Some users rate everything high or low, and some movies are rated higher or lower overall; a bias term lets the model capture these skews separately from the latent factors.
- What is another name for weight decay?
L2 regularization.
- Write the equation for weight decay (without peeking!).
loss_with_wd = loss + wd * (parameters**2).sum()
- Write the equation for the gradient of weight decay. Why does it help reduce weights?
parameters.grad += wd * 2 * parameters. By penalizing large weights it pushes the weights toward smaller values, which prevents overfitting.
- Why does reducing weights lead to better generalization?
Smaller weights produce a smoother function with less sharp surfaces, which tends to generalize better.
- What does argsort do in PyTorch?
It returns the indices that would sort the tensor.
- Does sorting the movie biases give the same result as averaging overall movie ratings by movie? Why/why not?
No. The bias takes into account the other latent factors that also influence the ratings.
- How do you print the names and details of the layers in a model?
learn.model
- What is the "bootstrapping problem" in collaborative filtering?
The model cannot make useful recommendations for new users or new items because it has no data about them yet.
- How could you deal with the bootstrapping problem for new users? For new movies?
Have new users complete a questionnaire about their preferences.
- How can feedback loops impact collaborative filtering systems?
They may cause the model to suffer from bias, for example when a small group of very active users dominates the ratings the model learns from.
- When using a neural network in collaborative filtering, why can we have different numbers of factors for movies and users?
Because we are not taking a dot product; we concatenate the embedding matrices instead, so a different number of factors for each is fine.
- Why is there an nn.Sequential in the CollabNN model?
It couples multiple layers (linear, ReLU, linear) together, which is what creates the nonlinearity.
- What kind of model should we use if we want to add metadata about users and items, or information such as date and time, to a collaborative filtering model?
A tabular model.
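For the crosstab exercise above, one possible answer (my own sketch, with items as rows and users as columns to match the description):
pd.crosstab(ratings['title'], ratings['user'], values=ratings['rating'], aggfunc='mean')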
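And for the cross-entropy question, a rough, untested sketch of how the model could change: output one activation per possible rating class and train with CrossEntropyLossFlat (the targets would also need to be mapped to 0-based class indices, e.g. rating - 1):
class DotProductCE(Module):
    def __init__(self, n_users, n_movies, n_factors, n_ratings=5):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.head = nn.Linear(n_factors, n_ratings) #one activation per rating class
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return self.head(users * movies)
#learn = Learner(dls, DotProductCE(n_users, n_movies, 50), loss_func=CrossEntropyLossFlat())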
Further Research
- Take a look at all the differences between the Embedding version of DotProductBias and the create_params version, and try to understand why each of those changes is required. If you're not sure, try reverting each change to see what happens. (NB: even the type of brackets used in forward has changed!)
- Find three other areas where collaborative filtering is being used, and find out what the pros and cons of this approach are in those areas.
- Complete this notebook using the full MovieLens dataset, and compare your results to online benchmarks. See if you can improve your accuracy. Look on the book's website and the fast.ai forum for ideas. Note that there are more columns in the full dataset—see if you can use those too (the next chapter might give you ideas).
- Create a model for MovieLens that works with cross-entropy loss, and compare it to the model in this chapter.
Completed, see here https://usama280.github.io/PasteBlogs/