# sae2.04
"""Explore the Spotify songs dataset: PostgreSQL export and popularity charts.

Run as a script, this module:

1. loads and cleans ``spotify_songs.csv``;
2. optionally rebuilds the Artist / Album / Playlist / Track tables
   (see ``REBUILD_DATABASE``) and plots two charts from SQL queries;
3. plots the share of each playlist genre;
4. plots the most popular songs and the artists with the highest mean
   popularity, straight from the CSV.
"""

import getpass  # noqa: F401  (unused; kept from the original script)

import matplotlib.pyplot as plt
import numpy as np  # noqa: F401  (unused; kept from the original script)
import pandas as pd
from sqlalchemy import create_engine, exc, text

CSV_PATH = 'spotify_songs.csv'

# NOTE(review): the credentials are hard-coded in the URL ("achanger" looks
# like a placeholder). Left unchanged so the script stays non-interactive;
# consider reading the password with `getpass` or from the environment.
DB_URL = "postgresql://reneveu:achanger@londres/dbreneveu"

# The schema creation and data load are destructive (DROP TABLE ... CASCADE),
# so they only run when this flag is switched on.
REBUILD_DATABASE = False

# Columns of the CSV that are not needed for this study.
UNUSED_COLUMNS = ['key', 'mode', 'instrumentalness']

# Index label of a CSV row that could not be loaded into the database.
BAD_ROW_LABEL = 151

# Songs at or above this popularity appear in the "top songs" chart.
HIT_POPULARITY = 90

# Artists whose mean popularity is strictly above this value are charted.
ARTIST_MEAN_THRESHOLD = 80

# Columns of each table, in the order of the table definition. The first
# column of every table is its primary key. The dict order is also the
# creation / insertion order required by the foreign keys.
TABLE_COLUMNS = {
    "Artist": ['track_artist'],
    "Album": ['track_album_id', 'track_album_name',
              'track_album_release_date', 'track_artist'],
    "Playlist": ['playlist_id', 'playlist_name', 'playlist_genre',
                 'playlist_subgenre'],
    "Track": ['track_id', 'track_name', 'track_popularity', 'duration_m',
              'danceability', 'energy', 'loudness', 'speechiness',
              'acousticness', 'liveness', 'valence', 'tempo',
              'track_artist', 'track_album_id', 'playlist_id'],
}

TABLE_DDL = {
    "Artist": '''CREATE TABLE Artist(
        track_artist varchar(150),
        PRIMARY KEY (track_artist)
    );''',
    "Album": '''CREATE TABLE Album(
        track_album_id varchar(150),
        track_album_name varchar(500),
        track_album_release_date varchar(15),
        track_artist varchar(150) REFERENCES Artist,
        PRIMARY KEY (track_album_id)
    );''',
    "Playlist": '''CREATE TABLE Playlist(
        playlist_id varchar(150) PRIMARY KEY,
        playlist_name varchar(150),
        playlist_genre varchar(50),
        playlist_subgenre varchar(150)
    );''',
    "Track": '''CREATE TABLE Track(
        track_id varchar(150),
        track_name varchar(150),
        track_popularity numeric,
        duration_m time,
        danceability numeric,
        energy numeric,
        loudness numeric,
        speechiness numeric,
        acousticness numeric,
        liveness numeric,
        valence numeric,
        tempo numeric,
        track_artist varchar(150) REFERENCES Artist,
        track_album_id varchar(150) REFERENCES Album,
        playlist_id varchar(150) REFERENCES Playlist,
        PRIMARY KEY (track_id)
    );''',
}


def clean_songs(df: pd.DataFrame) -> pd.DataFrame:
    """Return the cleaned song table used for the database and the pie chart.

    Drops ``UNUSED_COLUMNS`` and every row containing a missing value, turns
    ``duration_ms`` into a timedelta column renamed ``duration_m``, and
    removes the row labelled ``BAD_ROW_LABEL`` when it is still present.
    The duration column is printed before and after the conversion.
    """
    df = df.drop(columns=UNUSED_COLUMNS).dropna()

    print(df['duration_ms'])
    # Despite its new name the column holds timedeltas, not minutes.
    df['duration_ms'] = pd.to_timedelta(df['duration_ms'], unit='ms')
    df = df.rename(columns={"duration_ms": "duration_m"})
    print(df['duration_m'])

    # errors="ignore": dropna() above may already have removed that row.
    return df.drop(BAD_ROW_LABEL, errors="ignore")


def load_songs(path: str = CSV_PATH) -> pd.DataFrame:
    """Read the CSV at *path*, print it with its columns, and clean it."""
    df = pd.read_csv(path, sep=',', encoding="latin-1")
    print(df)
    # Columns of the raw file:
    #   track_id, track_name, track_artist, track_popularity,
    #   track_album_id, track_album_name, track_album_release_date,
    #   playlist_name, playlist_id, playlist_genre, playlist_subgenre,
    #   danceability, energy, key, loudness, mode, speechiness,
    #   acousticness, instrumentalness, liveness, valence, tempo,
    #   duration_ms
    print(df.columns)
    return clean_songs(df)


def split_tables(df: pd.DataFrame) -> dict:
    """Split the cleaned songs into one DataFrame per database table.

    Returns a dict keyed by table name, in ``TABLE_COLUMNS`` order. Each
    frame keeps only the first row seen for every primary-key value.
    """
    return {
        name: df[columns].drop_duplicates(subset=columns[0])
        for name, columns in TABLE_COLUMNS.items()
    }


def rebuild_database(co, tables: dict) -> None:
    """Drop, recreate and fill the four tables through connection *co*.

    *tables* is the mapping returned by ``split_tables``. One commit is
    issued after each table has been filled.
    """
    for name, ddl in TABLE_DDL.items():
        co.execute(text(f"DROP TABLE IF EXISTS {name} CASCADE;"))
        co.execute(text(ddl))

    for name, frame in tables.items():
        records = frame.to_dict("records")
        if records:  # an empty parameter list would not bind anything
            placeholders = ", ".join(f":{column}" for column in frame.columns)
            # A list of dicts makes SQLAlchemy send the rows as one batch.
            co.execute(
                text(f"INSERT INTO {name} VALUES({placeholders});"),
                records,
            )
        co.commit()


def plot_popular_tracks_by_year(co) -> None:
    """Plot the popularity of tracks rated above 90 against release year."""
    pop = pd.read_sql(text('''SELECT substr(a.track_album_release_date, 1,4) date,
                                     t.track_name nom,
                                     t.track_popularity pop
                              FROM Track t
                              JOIN Album a ON a.track_album_id = t.track_album_id
                              WHERE t.track_popularity > 90
                              GROUP BY date,nom,pop ORDER BY date,nom,pop;'''), con=co)

    # Only the popularity is drawn: track names are text and cannot be a
    # line-plot series.
    ax = pop.plot(x='date', y='pop', style='o-')
    ax.set_title("Popularité de titre par année")
    ax.legend(['Popularité'])
    ax.set_xlabel("Année")
    ax.set_ylabel("Popularité")
    ax.set_ylim(0)
    plt.show()


def plot_first_track_year(co) -> None:
    """Draw a bar with the release year of the first (year, title) row."""
    first = pd.read_sql(text('''SELECT substr(a.track_album_release_date, 1,4) date, t.track_name nom FROM Track t
                                NATURAL JOIN Album a
                                GROUP BY date, nom ORDER BY date,nom LIMIT 1;'''), con=co)

    first["date"] = first["date"].astype('int')

    ax = first.plot(x='nom', y='date', legend=False, kind='bar')
    plt.xticks(rotation=0)
    ax.set_title("date des titres")
    # Titles are on the x axis and years on the y axis.
    ax.set_xlabel("Titre")
    ax.set_ylabel("Année")
    ax.set_ylim(0)
    plt.show()


def plot_genre_shares(df: pd.DataFrame) -> None:
    """Show a pie chart of the percentage of rows per playlist genre."""
    df['playlist_genre'].value_counts().plot.pie(ylabel='', autopct='%1.1f%%')
    plt.show()


def load_sorted_songs(path: str = CSV_PATH) -> pd.DataFrame:
    """Read the CSV again and return its complete rows sorted by artist.

    Prints the pandas version, the first five rows and the unique artists.
    """
    print(pd.__version__)

    df_pop = pd.read_csv(path)
    print(df_pop.head(5))

    sorted_df = df_pop.dropna().sort_values(by='track_artist')
    print(sorted_df["track_artist"].unique())
    return sorted_df


def select_hits(songs: pd.DataFrame,
                minimum: float = HIT_POPULARITY) -> pd.DataFrame:
    """Return name and popularity of the songs rated at least *minimum*."""
    is_hit = songs["track_popularity"] >= minimum
    return songs.loc[is_hit, ["track_name", "track_popularity"]]


def mean_popularity_by_artist(songs: pd.DataFrame) -> pd.Series:
    """Return the mean track popularity of every artist, indexed by artist.

    The grouping uses each row's own ``track_artist``, so two artists with a
    song of the same title are kept apart.
    """
    return songs.groupby("track_artist")["track_popularity"].mean()


def plot_hits(songs: pd.DataFrame) -> None:
    """Show a horizontal bar chart of the songs rated ``HIT_POPULARITY``+."""
    plt.figure(figsize=(10, 6))
    hits = select_hits(songs)
    plt.barh(hits["track_name"].tolist(), hits["track_popularity"].tolist(),
             color='skyblue')
    plt.xlabel('Indice de popularité')
    plt.ylabel('Chansons')
    plt.title('Analyse des indices de popularité des chansons')
    plt.grid(True)
    plt.show()


def plot_top_artists(songs: pd.DataFrame,
                     threshold: float = ARTIST_MEAN_THRESHOLD) -> None:
    """Show a bar chart of the artists whose mean popularity is > *threshold*."""
    means = mean_popularity_by_artist(songs)
    selected = means[means > threshold]

    plt.figure(figsize=(10, 6))
    plt.bar(selected.index.tolist(), selected.tolist(), color='skyblue')
    plt.xlabel('Artistes')
    plt.ylabel('Popularité moyenne')
    # The title is built from the threshold so the two cannot disagree.
    plt.title('Analyse de la popularité moyenne des artistes '
              f'(Popularité moyenne > {threshold})')
    plt.grid(True)
    plt.xticks(rotation=90)  # artist names are long: write them vertically
    plt.tight_layout()       # keep the rotated labels inside the figure
    plt.show()


def main() -> None:
    """Run the whole study: database charts, genre pie, popularity charts."""
    df = load_songs()

    engine = create_engine(DB_URL)
    try:
        # The connection is closed when the block exits, even on error.
        with engine.connect() as co:
            if REBUILD_DATABASE:
                rebuild_database(co, split_tables(df))
            plot_popular_tracks_by_year(co)
            plot_first_track_year(co)
    except exc.SQLAlchemyError as e:
        # Database problems are reported; the CSV-only charts still run.
        print(e)

    plot_genre_shares(df)

    # Adryen's part: popularity analysis straight from the CSV.
    songs = load_sorted_songs()
    plot_hits(songs)
    plot_top_artists(songs)


if __name__ == "__main__":
    main()