Source code for src.utils.data

import pandas as pd
from scipy.sparse import csr_matrix


def sparse_matrix_from_df(df, groupby, indicator):
    """Make a scipy sparse matrix from a pandas DataFrame.

    Args:
        df (pd.DataFrame): Dataframe with the desired matrix rows as index.
        groupby (str): Name of the column to set as matrix column.
        indicator (str): Name of the column that will serve as data.

    Returns:
        tuple: ``(sparse_matrix, rows_u, columns_u)`` where

        * ``sparse_matrix`` (scipy.sparse.csr_matrix) has shape
          ``(len(rows_u), len(columns_u))``,
        * ``rows_u`` (list) holds the unique index values in order of first
          appearance; position ``i`` labels matrix row ``i``,
        * ``columns_u`` (list) holds the unique ``groupby`` values in order
          of first appearance; position ``j`` labels matrix column ``j``.

    Note:
        If the same (index value, ``groupby`` value) pair occurs more than
        once, scipy sums the corresponding ``indicator`` values when it
        builds the CSR matrix.
    """
    rows_u = list(df.index.unique())
    columns_u = list(df[groupby].unique())
    data = df[indicator].tolist()

    # Map every label to its position in the unique list.  The former
    # ``.astype("category", categories=...)`` spelling was removed from
    # pandas; ``pd.Categorical`` with explicit categories yields the same
    # integer codes.
    row = pd.Categorical(df.index, categories=rows_u).codes
    col = pd.Categorical(df[groupby], categories=columns_u).codes

    sparse_matrix = csr_matrix(
        (data, (row, col)),
        shape=(len(rows_u), len(columns_u)),
    )
    return sparse_matrix, rows_u, columns_u
def matrix_from_df_with_vect(df, groupby_column, data_column, vectorizer):
    """Build one text document per group and run it through a vectorizer.

    The rows of ``df`` are grouped by ``groupby_column``; within each group
    the values of ``data_column`` are converted with ``str`` and joined with
    single spaces into one document.

    Args:
        df (pd.DataFrame): Input dataframe.
        groupby_column (str): Name of the column to group by.
        data_column (str): Name of the column whose values form the
            documents.
        vectorizer: Object exposing ``transform(list_of_str)``.
            NOTE(review): presumably an already fitted scikit-learn style
            vectorizer -- confirm against callers.

    Returns:
        tuple: ``(matrix, group_keys)`` where ``matrix`` is whatever
        ``vectorizer.transform`` returns for the documents and
        ``group_keys`` (list) holds the group label of each document, in
        the same order.
    """
    documents = df.groupby(groupby_column)[data_column].apply(
        lambda values: " ".join(map(str, values))
    )
    # Take the keys from the aggregated result itself so that every key is
    # guaranteed to line up with the document at the same position.
    group_keys = documents.index.tolist()
    return vectorizer.transform(documents.tolist()), group_keys
def recommendations_from_similarity_matrix(
    movie_ids, sim_matrix, n_recommendations, input_kind
):
    """Turn a similarity matrix into a flat list of recommendations.

    For every movie, the other movies are ranked by descending similarity
    score and the best ``n_recommendations`` of them are emitted. Ties keep
    the order in which the movies appear in ``movie_ids``.

    Args:
        movie_ids (list): Movie identifiers; position ``i`` labels row ``i``
            and entry ``i`` of every row of ``sim_matrix``.
        sim_matrix: Indexable by row position; each row iterates over the
            similarity scores of that movie against every movie.
        n_recommendations (int): Maximum number of recommendations to emit
            per movie.
        input_kind: Value copied unchanged into every recommendation record.

    Returns:
        list: One ``[movie_id, recommended_movie_id, input_kind, score]``
        list per recommendation, grouped by ``movie_id`` in input order.
    """
    recommendations = []
    for movie_index, movie_id in enumerate(movie_ids):
        # Drop the movie itself *before* truncating: it is normally its own
        # best match, so truncating first would leave one recommendation
        # fewer than requested.
        candidates = [
            (movie_ids[other_index], score)
            for other_index, score in enumerate(sim_matrix[movie_index])
            if movie_ids[other_index] != movie_id
        ]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        recommendations.extend(
            [movie_id, recommended_movie_id, input_kind, score]
            for recommended_movie_id, score in candidates[:n_recommendations]
        )
    return recommendations