# Source code for src.utils.data

import pandas as pd
from scipy.sparse import csr_matrix


def sparse_matrix_from_df(df, groupby, indicator):
    """Make a scipy sparse matrix from a pandas Dataframe.

    Each record of ``df`` contributes one entry to the matrix: its index
    label selects the matrix row, its ``groupby`` value selects the matrix
    column and its ``indicator`` value is the stored data. Records sharing
    the same (row, column) pair are summed together by scipy.

    Args:
        df (pd.DataFrame): Dataframe with the matrix desired rows as index
        groupby (str): Name of the column to set as matrix column
        indicator (str): Name of the column that will serve as data

    Returns:
        sparse matrix (scipy.sparse.csr_matrix)
        row values (list): unique index labels, in order of first appearance;
            position ``i`` labels matrix row ``i``
        column values (list): unique ``groupby`` values, in order of first
            appearance; position ``j`` labels matrix column ``j``

    """
    rows_u = list(df.index.unique())
    columns_u = list(df[groupby].unique())

    data = df[indicator].tolist()

    # Passing the categories explicitly pins code ``i`` to ``rows_u[i]`` (and
    # ``columns_u[i]``), so the returned label lists line up with the matrix
    # axes. ``Series.astype("category", categories=...)`` is no longer
    # accepted by pandas, hence the explicit ``pd.Categorical``.
    row = pd.Categorical(df.index, categories=rows_u).codes
    col = pd.Categorical(df[groupby], categories=columns_u).codes

    sparse_matrix = csr_matrix(
        (data, (row, col)),
        shape=(len(rows_u), len(columns_u)),
    )

    return sparse_matrix, rows_u, columns_u


def matrix_from_df_with_vect(df, groupby_column, data_column, vectorizer):
    """Vectorize one text document per group of a pandas Dataframe.

    The ``data_column`` values of every group are converted to strings and
    joined with single spaces to form one document per group; the documents
    are then handed to ``vectorizer.transform``.

    Args:
        df (pd.DataFrame): Dataframe holding the records to group
        groupby_column (str): Name of the column to group the records by
        data_column (str): Name of the column whose values make up the
            document of each group
        vectorizer: Object exposing a ``transform(documents)`` method; it is
            called once with the list of documents

    Returns:
        matrix: whatever ``vectorizer.transform`` returns for the documents
        group keys (list): key of each group, in the same order as the
            documents passed to the vectorizer

    """
    documents = df.groupby(groupby_column)[data_column] \
        .apply(lambda values: " ".join(map(str, values)))

    # Read the keys from the aggregated result itself so that key ``i`` is
    # guaranteed to describe document ``i``.
    group_keys = list(documents.index)

    return vectorizer.transform(documents.tolist()), group_keys


def recommendations_from_similarity_matrix(movie_ids,
                                           sim_matrix,
                                           n_recommendations,
                                           input_kind):
    """Build the top recommendations of every movie from a similarity matrix.

    Args:
        movie_ids (list): Movie ids; position ``i`` identifies both row ``i``
            and column ``i`` of ``sim_matrix``
        sim_matrix: Indexable by row position; iterating a row yields one
            similarity score per movie
        n_recommendations (int): Maximum number of recommendations to keep
            for each movie
        input_kind: Label copied as is into every recommendation

    Returns:
        recommendations (list): one
            ``[movie_id, recommended_movie_id, input_kind, score]`` entry per
            recommendation, grouped by movie in ``movie_ids`` order and
            sorted by decreasing score within each movie

    """
    recommendations = []

    for movie_index, movie_id in enumerate(movie_ids):
        ranked = sorted(
            enumerate(sim_matrix[movie_index]),
            key=lambda pair: pair[1], reverse=True
        )

        # Drop the movie itself *before* truncating: it usually holds the
        # best score of its own row, so truncating first would leave only
        # ``n_recommendations - 1`` entries.
        candidates = [
            (candidate_index, score)
            for candidate_index, score in ranked
            if movie_ids[candidate_index] != movie_id
        ][:n_recommendations]

        recommendations.extend(
            [movie_id, movie_ids[candidate_index], input_kind, score]
            for candidate_index, score in candidates
        )

    return recommendations