# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

# 94 lines
# 3.3 KiB

import pandas as pd
import psycopg2 as psy
import matplotlib.pyplot as plt
import numpy as np
from getpass import getpass
# Load the raw sales export; the path is relative to the current working directory.
data = pd.read_csv(r'vgsales.csv')
# read_csv already returns a DataFrame, so no re-wrapping is needed.
df = data

# CSV columns sent to the Formule table, in the table's column order.
FORMULE_COLUMNS = [
    'Name', 'Platform', 'Year', 'Genre', 'Publisher',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales',
]

co = None
try:
    # The password is prompted interactively so it never appears in the source.
    co = psy.connect(host='londres',
                     database='dbanperederi',
                     user='anperederi',
                     password=getpass())

    # The context manager closes the cursor even when a statement fails.
    with co.cursor() as cur:
        # Rebuild the table from scratch on every run.
        cur.execute('''DROP TABLE IF EXISTS Formule ;''')
        cur.execute('''CREATE TABLE Formule (
            Name varchar(150),
            Platform varchar,
            Year numeric ,
            Genre varchar,
            Publisher varchar,
            NA_Sales numeric,
            EU_Sales numeric,
            JP_Sales numeric,
            Other_Sales numeric,
            Global_Sales numeric,
            PRIMARY KEY (Name, Platform)
        );''')

        # 3. Remove duplicates. (Name, Platform) is the primary key, so only
        # one row per pair can be inserted; rows missing either key column
        # are dropped because a primary key cannot hold NULL.
        df = (
            data
            .dropna(subset=['Name', 'Platform'])
            .drop_duplicates(subset=['Name', 'Platform'])
        )

        # Missing CSV values are read as NaN; convert them to None so that
        # psycopg2 stores SQL NULL instead of a NaN number / text value.
        rows = [
            tuple(None if pd.isna(value) else value for value in record)
            for record in df[FORMULE_COLUMNS].itertuples(index=False, name=None)
        ]
        cur.executemany(
            '''INSERT INTO Formule VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);''',
            rows)

        # 5. Find inconsistent rows: the regional sales should add up to the
        # global sales. A tolerance is used because the values are decimals
        # that were rounded in the source data.
        cur.execute("""SELECT Name, Platform, ROUND((NA_Sales + EU_Sales + JP_Sales + Other_Sales)::numeric, 2), Global_Sales
            FROM Formule
            WHERE ABS(NA_Sales + EU_Sales + JP_Sales + Other_Sales - Global_Sales) > 0.1""")
        # Fetched before the UPDATE below overwrites these rows' totals;
        # kept for inspection (e.g. print each row while debugging).
        inconsistent_rows = cur.fetchall()

        # 6. Recompute the global sales for the inconsistent rows.
        cur.execute("""UPDATE Formule
            SET Global_Sales = ROUND((NA_Sales + EU_Sales + JP_Sales + Other_Sales)::numeric, 2)
            WHERE ABS(NA_Sales + EU_Sales + JP_Sales + Other_Sales - Global_Sales) > 0.1;""")

        # 7. Build the clean DataFrame from the corrected table contents.
        cur.execute("SELECT * FROM Formule;")
        clean_df = pd.DataFrame(
            cur.fetchall(),
            columns=[column[0] for column in cur.description])

        # NOTE: the plotting code below is disabled; it queries the
        # `plain_jeu` table, which this script does not create.
        # # 1. Average sales per release year for the Adventure genre
        # df = pd.read_sql("""SELECT annee_sortie, AVG(ventes_total) ventes_moyennes, SUM(ventes_total) ventes_totales FROM plain_jeu
        # WHERE genre = 'Adventure' AND annee_sortie IS NOT NULL
        # GROUP BY annee_sortie
        # ORDER BY annee_sortie;""", con=co)
        # # 2/3. Display as a line chart
        # fig = df.plot(x='annee_sortie', y=['ventes_moyennes', 'ventes_totales'])
        # fig.legend(['Ventes moyennes', 'Ventes totales'])
        # fig.set_xlabel('Année de sortie')
        # fig.set_ylabel('Ventes (en millions)')
        # # 4. Total sales for all games
        # df = pd.read_sql("""SELECT annee_sortie, genre, SUM(ventes_total) ventes_totales FROM plain_jeu
        # WHERE annee_sortie IS NOT NULL
        # GROUP BY annee_sortie, genre
        # ORDER BY annee_sortie;""", con=co)
        # fig, ax = plt.subplots()
        # for i, (label, group) in enumerate(df.groupby(['genre'])):
        # ax = group.plot(ax=ax, kind='line', x='annee_sortie',
        # y='ventes_totales', label=label, color=plt.cm.tab20.colors[i])
        # ax.set_xlabel('Année de sortie')
        # ax.set_ylabel('Ventes (en millions)')

    # Persist the table only once every step has succeeded.
    co.commit()
except Exception as error:
    # Best-effort script: report the problem instead of a traceback.
    # psycopg2 errors are Exception subclasses, so they are covered here.
    print(error)
finally:
    # Closing without a commit discards any uncommitted changes.
    if co is not None:
        co.close()