You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
118 lines
5.6 KiB
118 lines
5.6 KiB
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib as plt
|
|
import asyncio
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
import threading
|
|
|
|
# Module-level flag polled by time_tracker(): it stays False while the main
# script is running and is set to True once the script has finished.
script_done = False
|
|
|
|
def time_tracker(interval: float = 1.0) -> None:
    """Display the elapsed time until the module-level ``script_done`` flag is set.

    Intended to be used as a thread target: it loops while ``script_done`` is
    falsy, rewriting a single progress line, then prints the total run time
    on its own line once the flag becomes true.

    Args:
        interval: Seconds to sleep between two refreshes of the progress
            line. Defaults to 1.0, the previously hard-coded value.
    """
    start_time = time.perf_counter()

    while not script_done:
        elapsed = time.perf_counter() - start_time
        # "\r" returns to the start of the line so the next refresh overwrites
        # this one. Flush explicitly: the text contains no newline, so a
        # line-buffered stdout would otherwise keep it in its buffer.
        print(f"Temps écoulé: {elapsed:.2f} secondes", end="\r", flush=True)
        time.sleep(interval)

    # Final report; the leading newline moves off the progress line.
    elapsed = time.perf_counter() - start_time
    print(f"\nTemps total d'exécution: {elapsed:.2f} secondes")
|
|
|
|
def data_manipulation(DataIsLoaded=None):
    """Build the per-movie table of actor rating statistics.

    Steps:
      1. Obtain the actor/title/rating rows (``actorsRatings``), either by
         rebuilding them from ``baseData/`` or by loading the saved copy.
      2. For every actor (``nconst``), compute the mean of ``averageRating``
         over their rows and the number of rows used.
      3. Attach ``primaryName`` from ``baseData/name.basics.tsv``.
      4. For every title (``tconst``), collect the list of its actors' mean
         ratings and names, plus the title's own ``averageRating``, and write
         the result to ``processedData/actorsRatingsPerMovie.tsv``.

    Args:
        DataIsLoaded: Despite its name, any value other than ``None``
            (including ``False``) triggers a full rebuild from ``baseData/``
            and rewrites the intermediate files in ``processedData/``
            (``actors.tsv``, ``actorsRatings.tsv``,
            ``actorsRatingsGrouped.tsv``, ``actorsRatingsGroupedWithName.tsv``).
            With the default ``None`` the previously saved
            ``processedData/actorsRatings.tsv`` is loaded instead and those
            intermediate files are left untouched.

    Returns:
        None. The result is written to disk.

    Note:
        This function does not start an elapsed-time thread itself. A thread
        started here could never be joined by the caller and would keep the
        process alive whenever ``script_done`` is not set afterwards; callers
        that want the timer run ``time_tracker`` in a thread around this
        call, as the script entry point does.
    """
    rebuild = DataIsLoaded is not None

    names = pd.read_csv("baseData/name.basics.tsv", sep="\t")

    if rebuild:
        # Keep only the actor / actress rows of the principals table.
        principals = pd.read_csv("baseData/title.principals.tsv", sep="\t")
        actors = principals.loc[principals["category"].isin(["actor", "actress"])]
        actors.to_csv("processedData/actors.tsv", sep="\t", index=False)

        # The saved file is read back before merging, as the original
        # pipeline did, so the merge input is exactly what is on disk.
        actors = pd.read_csv("processedData/actors.tsv", sep="\t")
        ratings = pd.read_csv("baseData/title.ratings.tsv", sep="\t")

        # One row per (actor, title) that has a rating.
        actors_ratings = actors.merge(ratings, on="tconst", how="inner")
        actors_ratings.to_csv(
            "processedData/actorsRatings.tsv", sep="\t", index=False
        )
    else:
        actors_ratings = pd.read_csv("processedData/actorsRatings.tsv", sep="\t")

    # Per-actor mean rating and number of ratings behind that mean, with
    # 'nconst' restored as a regular column.
    per_actor = (
        actors_ratings.groupby("nconst")
        .agg(
            averageRatingMean=("averageRating", "mean"),
            averageRatingCount=("averageRating", "count"),
        )
        .reset_index()
    )

    if rebuild:
        per_actor.to_csv(
            "processedData/actorsRatingsGrouped.tsv", index=False, sep="\t"
        )

    # Attach the actors' names.
    per_actor_named = per_actor.merge(
        names[["nconst", "primaryName"]], on="nconst", how="inner"
    )

    if rebuild:
        # NOTE(review): unlike the other outputs this file is written with
        # index=True (an extra unnamed first column). Kept as is because
        # existing consumers may rely on it; confirm whether it is intended.
        per_actor_named.to_csv(
            "processedData/actorsRatingsGroupedWithName.tsv", index=True, sep="\t"
        )

    # Bring the per-actor statistics back onto every (title, actor) row.
    merged = pd.merge(
        actors_ratings[["tconst", "nconst", "averageRating"]],
        per_actor_named[
            ["nconst", "averageRatingMean", "averageRatingCount", "primaryName"]
        ],
        on="nconst",
    )

    # One row per title: lists of its actors' mean ratings and names, plus
    # the title's own rating ('first' is used on the assumption that the
    # value is identical on every row of a given 'tconst').
    per_movie = (
        merged.groupby("tconst")
        .agg(
            {
                "averageRatingMean": lambda values: list(values),
                "primaryName": lambda values: list(values),
                "averageRating": "first",
            }
        )
        .reset_index()
    )

    actors_ratings_per_movie = per_movie.rename(
        columns={
            "averageRatingMean": "ratings",
            "primaryName": "actorNames",
            "averageRating": "averageRatingMovie",
        }
    )[["tconst", "ratings", "actorNames", "averageRatingMovie"]]

    actors_ratings_per_movie.to_csv(
        "processedData/actorsRatingsPerMovie.tsv", index=False, sep="\t"
    )
|
|
|
|
|
|
|
|
# actorsRatingsPerMovie = pd.DataFrame(list(allActorRatingsMovies.items()), columns=['tconst', 'ratings'])
|
|
# actorsRatingsPerMovie.to_csv("actorsRatingsPerMovie.tsv", index=False, sep="\t")
|
|
|
|
# Script entry point: run the data pipeline while a background thread
# displays the elapsed time.
if __name__ == "__main__":
    # Background thread that reports the elapsed time.
    timer_thread = threading.Thread(target=time_tracker)
    timer_thread.start()

    try:
        data_manipulation()
    finally:
        # Always signal completion, even when the pipeline raises: the timer
        # thread is not a daemon, so without this it would keep looping and
        # the process would never exit.
        script_done = True

        # Wait for the timer thread to print its final report.
        timer_thread.join()
|