In this post we will tackle an interesting classification problem: predicting a movie's genre from its title alone. Being able to make such a prediction would be very useful; for example, it could be used to cluster movies by genre. It is also a great way to explore various classification techniques, as well as the well-known word embeddings.

import io
import math

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import f1_score
%matplotlib inline
# Read the movie table from disk and preview its first rows.
data = pd.read_csv(filepath_or_buffer='movies.csv', quotechar='"')
data.head()
movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy

Preprocessing data

# drop non ascii titles
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# Flag every row whose title contains a non-ASCII character, then drop them.
non_ascii_rows = data['title'].apply(lambda title: not is_ascii(title))
data = data.drop(data[non_ascii_rows].index)

Processing title

We strip away numbers, parentheses, etc.

import re
def process_title(title):
    """Return ``title`` with years, numbering and sequel markers removed.

    The following are stripped, in order: parentheses, runs of digits, the
    standalone word "part"/"Part", and the standalone Roman numerals II, III
    and IV. Remaining whitespace is collapsed to single spaces and trimmed.

    Args:
        title: Raw movie title, e.g. ``'Rocky III (1982)'``.

    Returns:
        The cleaned title, e.g. ``'Rocky'``; an empty string when nothing
        but stripped material was present.
    """
    # strip away numbers and parenthesis
    title = title.replace('(', '').replace(')', '')
    title = re.sub(r'\d+', '', title)
    # strip away "part" word; \b keeps words such as "Party" or "Departed"
    # intact, which a bare substring match would mangle.
    title = re.sub(r'\b[Pp]art\b', '', title)
    # strip II and III and IV as whole words. Matching them in one pattern
    # avoids removing "II" first and leaving a stray "I" behind from "III".
    title = re.sub(r'\b(?:III|II|IV)\b', '', title)
    # The removals above leave gaps; collapse them so titles compare cleanly.
    return ' '.join(title.split())

# Clean every title, then discard the rows whose title is blank afterwards.
data['title'] = data['title'].map(process_title)
blank_title_rows = data['title'].str.strip() == ''
data = data.drop(data[blank_title_rows].index)

Converting to binary classification

This is a multilabel classification problem; we will convert it to a set of binary classification problems.

# Normalise genre labels to lower case. Note that no rows are dropped here:
# a movie whose only label is the '(no genres listed)' placeholder is kept
# and simply gets a 0 in every genre column built below.
data['genres'] = data['genres'].str.lower()

# Split each pipe-separated genre string once and reuse the result.
genre_lists = data['genres'].str.split('|')

# get all genres
genres = set()
for gs in genre_lists:
    genres.update(gs)
# discard() rather than remove(): the placeholder may be absent from the
# data, in which case remove() would raise KeyError.
genres.discard('(no genres listed)')

# One binary indicator column per genre (1 = the movie carries that genre).
for g in genres:
    data[g] = genre_lists.apply(lambda gs, g=g: 1 if g in gs else 0)
data.head()
movieId title genres sci-fi horror fantasy adventure western musical children ... romance film-noir crime drama animation action comedy documentary war imax
0 1 Toy Story adventure|animation|children|comedy|fantasy 0 0 1 1 0 0 1 ... 0 0 0 0 1 0 1 0 0 0
1 2 Jumanji adventure|children|fantasy 0 0 1 1 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
2 3 Grumpier Old Men comedy|romance 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 1 0 0 0
3 4 Waiting to Exhale comedy|drama|romance 0 0 0 0 0 0 0 ... 1 0 0 1 0 0 1 0 0 0
4 5 Father of the Bride comedy 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0

5 rows × 22 columns

Converting to lower case

# Lower-case every title through the vectorised string accessor.
data['title'] = data['title'].str.lower()

Checking class distribution

# Fraction of movies tagged with each genre. Only the genre indicator columns
# are averaged: mean() over the whole frame would also cover the text columns
# ('title', 'genres'), which raises TypeError on pandas >= 2.0.
d = data[list(genres)].mean().to_dict()
# Sort genres by frequency of occurrence, rarest first.
g_sorted_freq = sorted(d, key=d.get)

# dropping the 6 least common genres
for g in g_sorted_freq[:6]:
    data = data.drop(g, axis=1)
    genres.remove(g)
# checking classes distribution of the remaining genres
plt.ylim((0, 1.0))
plt.ylabel('portion of positive examples')
data[list(genres)].mean().plot(kind='bar')

class distribution

We can see that the data is very imbalanced, with fewer than 10% positive examples in about 6 classes. We will have to deal with this in order to evaluate our models correctly.

Creating a balanced dataset for each genre by means of undersampling

# Build one balanced dataset per genre by undersampling, so that positive and
# negative examples are equally represented.
balanced_data = {}
for g in genres:
    positive_examples = data[data[g] == 1]
    negative_examples = data[data[g] == 0]
    # Cap at the smaller class: asking sample() for more rows than exist
    # raises ValueError, which would happen for a genre held by most movies.
    n_per_class = min(len(positive_examples), len(negative_examples))
    if len(positive_examples) > n_per_class:
        positive_examples = positive_examples.sample(n=n_per_class)
    negative_examples = negative_examples.sample(n=n_per_class)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    balanced_data[g] = pd.concat([positive_examples, negative_examples])
    

Treating it as text classification using Naive Bayes

from sklearn.model_selection import train_test_split
# NOTE(review): this 60/40 split of the full (imbalanced) data appears to be
# unused; `train` and `test` are reassigned per genre from `balanced_data`
# before the Naive Bayes counts are learned.
train, test = train_test_split(data, train_size = 0.6)
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer
# Tokenizer built from the pattern \w+ (runs of word characters); shared by
# the Naive Bayes and embedding code below.
tokenizer = RegexpTokenizer(r'\w+')

def learn_counts(train, target):
    """Collect the counts used by the Naive Bayes title classifier.

    Args:
        train: DataFrame with a ``'title'`` column and a ``target`` column.
        target: Name of the label column to learn from.

    Returns:
        A tuple ``(classes, cnt_classes, cnt_word_given_class, V)`` where
        ``classes`` holds the distinct label values, ``cnt_classes`` maps
        each label to its number of training rows,
        ``cnt_word_given_class[word][label]`` counts occurrences of ``word``
        in titles carrying ``label``, and ``V`` is the number of distinct
        words seen.
    """
    labels = train[target]
    # Loop-invariant, so computed once up front; this also keeps ``classes``
    # defined when ``train`` has no rows.
    classes = labels.unique()

    cnt_word_given_class = defaultdict(lambda: defaultdict(float))
    for title, label in zip(train['title'], labels):
        for word in tokenizer.tokenize(title):
            cnt_word_given_class[word][label] += 1.0

    cnt_classes = {c: int((labels == c).sum()) for c in classes}
    V = len(cnt_word_given_class)
    return classes, cnt_classes, cnt_word_given_class, V

def get_class_prob_given_word(word, cnt_w_c, cnt_c, classes, K, V=None):
    """Return the add-K smoothed score of ``word`` for every class.

    Despite the name, the value computed per class ``c`` is the smoothed
    word-given-class term ``(K + cnt_w_c[word][c]) / (cnt_c[c] + K * V)``.

    Args:
        word: Token to score.
        cnt_w_c: Nested mapping ``cnt_w_c[word][c]`` of word counts per class.
        cnt_c: Mapping from class to the count used in the denominator.
        classes: Iterable of class labels to score.
        K: Smoothing constant.
        V: Vocabulary size. When omitted, ``len(cnt_w_c)`` is used instead of
            relying on a module-level ``V``.

    Returns:
        Dict mapping each class to its smoothed score.
    """
    if V is None:
        V = len(cnt_w_c)
    # .get() rather than indexing: indexing a defaultdict would insert an
    # entry for every unseen word and silently grow the learned counts.
    word_counts = cnt_w_c.get(word, {})
    return {c: (1.0 * K + word_counts.get(c, 0)) / (cnt_c[c] + K * V)
            for c in classes}


def get_text_class(text, cnt_w_c, cnt_c, V, classes=(0, 1), K=1):
    """Return the class with the highest summed log score for ``text``.

    Args:
        text: Title to classify; it is split with the module-level tokenizer.
        cnt_w_c: Nested mapping ``cnt_w_c[word][c]`` of word counts per class.
        cnt_c: Mapping from class to the count used in the denominator.
        V: Vocabulary size used for smoothing.
        classes: Candidate class labels (immutable default).
        K: Smoothing constant.

    Returns:
        The label from ``classes`` whose accumulated log score is largest;
        for a text without tokens all scores are 0 and the first label wins.
    """
    # NOTE(review): no class prior is added, and cnt_c as produced by
    # learn_counts holds per-class row counts rather than per-class word
    # totals - confirm this is intended.
    log_scores = {c: 0.0 for c in classes}
    for word in tokenizer.tokenize(text):
        word_probs = get_class_prob_given_word(word, cnt_w_c, cnt_c, classes, K, V)
        for c in log_scores:
            log_scores[c] += math.log(word_probs[c])

    return max(log_scores, key=log_scores.get)
        
# Fit and evaluate one Naive Bayes classifier per genre on its balanced data.
f1_scores = []
for g in genres:
    train, test = train_test_split(balanced_data[g], train_size=0.6)
    classes, cnt_c, cnt_w_c, V = learn_counts(train, g)
    y_pred = test['title'].apply(lambda t: get_text_class(t, cnt_w_c, cnt_c, V))
    # f1_score takes (y_true, y_pred), the order used by the other model loops.
    f1_scores.append(f1_score(test[g], y_pred))
    # A parenthesised single-argument print runs on Python 2 and Python 3.
    print('for genre %s , f1 score is %.2f' % (g, f1_scores[-1]))

print('average f1 score over all genres : %.2f' % (np.mean(f1_scores)))
for genre sci-fi , f1 score is 0.71
for genre horror , f1 score is 0.70
for genre fantasy , f1 score is 0.66
for genre adventure , f1 score is 0.67
for genre thriller , f1 score is 0.56
for genre mystery , f1 score is 0.64
for genre romance , f1 score is 0.61
for genre crime , f1 score is 0.62
for genre drama , f1 score is 0.50
for genre action , f1 score is 0.64
for genre comedy , f1 score is 0.58
for genre documentary , f1 score is 0.67
for genre war , f1 score is 0.65
average f1 score : 0.63

Classification using word embeddings

# glove word embeddings
import numpy as np

# Map each word in the GloVe file to its vector. Every line holds the word
# followed by its whitespace-separated components.
embeddings = {}
# io.open decodes as UTF-8 explicitly on both Python 2 and Python 3 instead
# of depending on the platform's default encoding.
with io.open('glove.6B/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines rather than failing on fields[0].
            continue
        # Convert from the list directly: on Python 3 map() is lazy, so
        # np.array(map(float, ...)) would not produce a numeric vector.
        embeddings[fields[0]] = np.array(fields[1:], dtype=float)
# transform text (a title) to an embedding by averaging word embeddings


def get_mean_embeddings(docs, embeddings):
    """Embed each document as the mean of its known word vectors.

    Tokens missing from ``embeddings`` are skipped, so they do not pull the
    mean towards zero. A document with no known token maps to a zero vector.
    This is the behaviour of the second of the two definitions this block
    previously contained; the first one was shadowed and never ran.

    Args:
        docs: Iterable of strings (titles), split with the module-level
            tokenizer.
        embeddings: Non-empty mapping from word to a 1-D vector; all vectors
            are assumed to share one length.

    Returns:
        ``np.array`` with one mean vector per document.
    """
    # next(iter(...)) instead of .values()[0]: dict views cannot be indexed
    # on Python 3.
    dim = len(next(iter(embeddings.values())))
    means = []
    for doc in docs:
        vectors = [embeddings[w] for w in tokenizer.tokenize(doc) if w in embeddings]
        means.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))
    return np.array(means)

Trying out different models (SVM , Logistic Regression, KNN, Random Forests)

import sklearn.svm as svm
from sklearn.metrics import f1_score
# Evaluate an RBF-kernel SVM per genre on mean title embeddings.
clf = svm.SVC(kernel='rbf')
f1_scores = []
for g in genres:
    genre_data = balanced_data[g]
    train, test = train_test_split(genre_data, train_size=0.6)
    train_feature_matrix = get_mean_embeddings(train['title'], embeddings)
    test_feature_matrix = get_mean_embeddings(test['title'], embeddings)
    clf.fit(train_feature_matrix, train[g])
    y_pred = clf.predict(test_feature_matrix)
    f1_scores.append(f1_score(test[g], y_pred))
    # A parenthesised single-argument print runs on Python 2 and Python 3.
    print('for "%s" , f1 score = %.2f' % (g, f1_scores[-1]))

print('average f1 score over all genres : %.2f ' % (np.mean(f1_scores)))
for "sci-fi" , f1 score = 0.70
for "horror" , f1 score = 0.68
for "fantasy" , f1 score = 0.62
for "adventure" , f1 score = 0.66
for "thriller" , f1 score = 0.63
for "mystery" , f1 score = 0.58
for "romance" , f1 score = 0.62
for "crime" , f1 score = 0.56
for "drama" , f1 score = 0.59
for "action" , f1 score = 0.67
for "comedy" , f1 score = 0.62
for "documentary" , f1 score = 0.64
for "war" , f1 score = 0.65
average f1 score over all genres : 0.63 
from sklearn.linear_model import LogisticRegression
# Evaluate logistic regression per genre on mean title embeddings.
clf = LogisticRegression()
# Start from an empty list: otherwise scores of the previously evaluated
# model stay in f1_scores and distort the average printed below.
f1_scores = []
for g in genres:
    genre_data = balanced_data[g]
    train, test = train_test_split(genre_data, train_size=0.6)
    train_feature_matrix = get_mean_embeddings(train['title'], embeddings)
    test_feature_matrix = get_mean_embeddings(test['title'], embeddings)
    clf.fit(train_feature_matrix, train[g])
    y_pred = clf.predict(test_feature_matrix)
    f1_scores.append(f1_score(test[g], y_pred))
    # A parenthesised single-argument print runs on Python 2 and Python 3.
    print('for "%s" , f1 score = %.2f' % (g, f1_scores[-1]))

print('average f1 score over all genres : %.2f ' % (np.mean(f1_scores)))
for "sci-fi" , f1 score = 0.66
for "horror" , f1 score = 0.68
for "fantasy" , f1 score = 0.62
for "adventure" , f1 score = 0.65
for "thriller" , f1 score = 0.62
for "mystery" , f1 score = 0.57
for "romance" , f1 score = 0.60
for "crime" , f1 score = 0.59
for "drama" , f1 score = 0.57
for "action" , f1 score = 0.66
for "comedy" , f1 score = 0.61
for "documentary" , f1 score = 0.62
for "war" , f1 score = 0.66
average f1 score over all genres : 0.60 
import sklearn.neighbors
# Evaluate a 10-nearest-neighbours classifier per genre on mean title embeddings.
clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=10)
# Start from an empty list: otherwise scores of the previously evaluated
# models stay in f1_scores and distort the average printed below.
f1_scores = []
for g in genres:
    genre_data = balanced_data[g]
    train, test = train_test_split(genre_data, train_size=0.6)
    train_feature_matrix = get_mean_embeddings(train['title'], embeddings)
    test_feature_matrix = get_mean_embeddings(test['title'], embeddings)
    clf.fit(train_feature_matrix, train[g])
    y_pred = clf.predict(test_feature_matrix)
    f1_scores.append(f1_score(test[g], y_pred))
    # A parenthesised single-argument print runs on Python 2 and Python 3.
    print('for "%s" , f1 score = %.2f' % (g, f1_scores[-1]))

print('average f1 score over all genres : %.2f ' % (np.mean(f1_scores)))
for "sci-fi" , f1 score = 0.65
for "horror" , f1 score = 0.68
for "fantasy" , f1 score = 0.63
for "adventure" , f1 score = 0.64
for "thriller" , f1 score = 0.52
for "mystery" , f1 score = 0.55
for "romance" , f1 score = 0.54
for "crime" , f1 score = 0.48
for "drama" , f1 score = 0.48
for "action" , f1 score = 0.59
for "comedy" , f1 score = 0.56
for "documentary" , f1 score = 0.63
for "war" , f1 score = 0.59
average f1 score over all genres : 0.61 
from sklearn.ensemble import RandomForestClassifier
# Evaluate a 20-tree random forest per genre on mean title embeddings.
clf = RandomForestClassifier(n_estimators=20)
# Start from an empty list: otherwise scores of the previously evaluated
# models stay in f1_scores and distort the average printed below.
f1_scores = []
for g in genres:
    genre_data = balanced_data[g]
    train, test = train_test_split(genre_data, train_size=0.6)
    train_feature_matrix = get_mean_embeddings(train['title'], embeddings)
    test_feature_matrix = get_mean_embeddings(test['title'], embeddings)
    clf.fit(train_feature_matrix, train[g])
    y_pred = clf.predict(test_feature_matrix)
    f1_scores.append(f1_score(test[g], y_pred))
    # A parenthesised single-argument print runs on Python 2 and Python 3.
    print('for "%s" , f1 score = %.2f' % (g, f1_scores[-1]))

print('average f1 score over all genres : %.2f ' % (np.mean(f1_scores)))
for "sci-fi" , f1 score = 0.63
for "horror" , f1 score = 0.64
for "fantasy" , f1 score = 0.61
for "adventure" , f1 score = 0.60
for "thriller" , f1 score = 0.58
for "mystery" , f1 score = 0.53
for "romance" , f1 score = 0.54
for "crime" , f1 score = 0.57
for "drama" , f1 score = 0.54
for "action" , f1 score = 0.64
for "comedy" , f1 score = 0.58
for "documentary" , f1 score = 0.59
for "war" , f1 score = 0.65
average f1 score over all genres : 0.59 

Conclusion

OK. This was an attempt to predict movie genres using titles only. At first we explored Naive Bayes. Then we introduced word embeddings: we used GloVe word embeddings to obtain an embedding for the whole title by averaging the embeddings of its constituent words. On these features we trained an SVM with an RBF kernel, a logistic regression model, KNN, and random forests.

We can see that the best models on average across all genres are Naive Bayes, SVM and logistic regression. However, the ranking of these models may vary on individual genres.

We also saw how to deal with imbalanced data by performing undersampling.