Source code for src.recommender.engines.content_based

import os
import pandas as pd
import numpy as np
import logging
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from scipy import sparse

from config import MAX_RECOMMENDATIONS, DATASETS_PATH
from src.recommender.engines.engine import (
    QueryBasedEngine, OfflineEngine
)
from src.data_interface import model
from src.utils.data import recommendations_from_similarity_matrix

# Turn off pandas' chained-assignment (SettingWithCopy) warning globally; the
# engines below assign new columns onto frames obtained by column selection.
pd.options.mode.chained_assignment = None  # default='warn'


class SameGenres(QueryBasedEngine):
    """Query-based engine that looks up movies by the current item's genres.

    The lookup is done at request time through ``model.Movie.query``; nothing
    is precomputed by this class.
    """

    def __init__(self):
        super(SameGenres, self).__init__()

    def compute_query(self, context):
        """Return movies whose ``genres`` contain the context item's genres.

        Args:
            context: Request context exposing ``item``, which must provide
                ``genres`` (an iterable of indexable entries) and ``id``.

        Returns:
            The result of ``.all()`` on the filtered query, limited to
            ``MAX_RECOMMENDATIONS`` rows (presumably ``model.Movie``
            instances -- confirm against the data interface).
        """
        seed = context.item
        candidates = model.Movie.query

        # Only the first element of every genre entry is used for matching.
        genre_keys = [entry[0] for entry in seed.genres]
        candidates = candidates.filter(model.Movie.genres.contains(genre_keys))

        # Never recommend the movie the user is currently looking at.
        candidates = candidates.filter(model.Movie.id != seed.id)

        return candidates.limit(MAX_RECOMMENDATIONS).all()
class OneHotMultiInput(OfflineEngine):
    """Offline item-to-item engine built from several metadata features.

    Every movie is described by token counts of its country, genre, language
    and director strings plus a one-hot encoded release decade. Movies are
    compared with cosine similarity over the concatenated feature matrix.
    """

    # Bin edges for the release-decade feature. ``range`` excludes its stop
    # value, so the stop is 2021 to keep the 2020 edge; without it, releases
    # after 2010 fall outside every bin and get no decade feature at all.
    _DECADE_EDGES = range(1920, 2021, 10)

    def __init__(self):
        super(OneHotMultiInput, self).__init__()
        self.input_id_kind = "item"

    def train(self):
        """Compute item-to-item recommendations and save them to CSV.

        Reads ``<DATASETS_PATH>/movielens/omdb.csv`` (parsed as JSON lines
        despite the extension), builds the feature matrix, derives a cosine
        similarity matrix and hands it to
        ``recommendations_from_similarity_matrix``. The result is persisted
        with ``self.save_recommendations_to_csv``.

        Raises:
            KeyError: If the dataset lacks one of the required columns.
        """
        logging.info("training {0}".format(self.type))

        raw = pd.read_json(
            os.path.join(DATASETS_PATH, "movielens", "omdb.csv"), lines=True
        )

        # Keep only the columns that feed the model, on an explicit copy so
        # the edits below never write into a view of ``raw``.
        movies = raw[
            ["id", "Country", "Director", "Genre", "Language", "Released"]
        ].copy()

        # The dataset marks missing values with the literal string "N/A".
        movies = movies.replace("N/A", np.nan)

        # The year is the last space-separated token of "Released"; missing
        # dates become year 0, which falls outside every decade bin.
        release_year = (
            movies["Released"]
            .fillna("")
            .str.split(" ").str[-1]
            .replace("", 0)
            .astype(int)
        )
        release_decade = pd.cut(release_year, list(self._DECADE_EDGES))

        # One independent vocabulary per textual field.
        country_vect = CountVectorizer()
        genre_vect = CountVectorizer()
        language_vect = CountVectorizer()
        director_vect = CountVectorizer()

        X = sparse.hstack([
            country_vect.fit_transform(movies["Country"].fillna("")),
            genre_vect.fit_transform(movies["Genre"].fillna("")),
            language_vect.fit_transform(movies["Language"].fillna("")),
            director_vect.fit_transform(movies["Director"].fillna("")),
            pd.get_dummies(release_decade).values,
        ])

        # scikit-learn's cosine *distance* is 1 - cosine similarity.
        cosine_sim = 1 - pairwise_distances(X, metric="cosine")

        recommendations = recommendations_from_similarity_matrix(
            movie_ids=movies["id"].tolist(),
            sim_matrix=cosine_sim,
            n_recommendations=MAX_RECOMMENDATIONS,
            input_kind=self.input_id_kind,
        )
        self.save_recommendations_to_csv(recommendations)
class TfidfGenres(OfflineEngine):
    """Offline item-to-item engine that compares movies by genre text only.

    Genre strings are TF-IDF vectorized and movies are compared with cosine
    similarity.
    """

    def __init__(self):
        super(TfidfGenres, self).__init__()
        self.input_id_kind = "item"

    def train(self):
        """Compute genre-based recommendations and save them to CSV.

        Reads ``<DATASETS_PATH>/movielens/omdb.csv`` (parsed as JSON lines
        despite the extension), vectorizes the ``Genre`` column, derives a
        cosine similarity matrix and passes it to
        ``recommendations_from_similarity_matrix``. The result is persisted
        with ``self.save_recommendations_to_csv``.
        """
        logging.info("training {0}".format(self.type))

        dataset_file = os.path.join(DATASETS_PATH, "movielens", "omdb.csv")
        movies = pd.read_json(dataset_file, lines=True)

        # Missing genres become empty documents instead of NaN.
        genre_text = movies["Genre"].fillna("")
        tfidf = TfidfVectorizer()
        features = sparse.hstack([tfidf.fit_transform(genre_text)])

        # scikit-learn's cosine *distance* is 1 - cosine similarity.
        similarity = 1 - pairwise_distances(features, metric="cosine")

        self.save_recommendations_to_csv(
            recommendations_from_similarity_matrix(
                movie_ids=movies["id"].tolist(),
                sim_matrix=similarity,
                n_recommendations=MAX_RECOMMENDATIONS,
                input_kind=self.input_id_kind
            )
        )