You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

94 lines
3.5 KiB

import pandas as pd
import psycopg2 as psy
import matplotlib.pyplot as plt
import numpy as np
from getpass import getpass
# Load video-game sales from vgsales.csv into the PostgreSQL table "Formule",
# then report and repair rows whose Global_Sales disagrees with the sum of the
# regional sales columns.

# read_csv already returns a DataFrame; `data` keeps the raw rows untouched.
data = pd.read_csv(r'vgsales.csv')

# 3. Remove duplicates (same Name on the same Platform).
# This runs BEFORE the load so the de-duplicated rows are the ones that end
# up in the table; doing it after the inserts had no effect on the database.
df = pd.DataFrame(data).drop_duplicates(subset=['Name', 'Platform'])

co = None
try:
    # The password is prompted interactively so it never appears in the file.
    co = psy.connect(host='londres',
                     database='dbanperederi',
                     user='anperederi',
                     password=getpass())

    # The cursor is a context manager: it is closed even if a statement fails.
    with co.cursor() as cur:
        # Start from an empty table on every run.
        cur.execute('''DROP TABLE IF EXISTS Formule ;''')
        # No trailing comma after the last column: PostgreSQL rejects it.
        cur.execute('''CREATE TABLE Formule (
            Name varchar(150) NOT NULL,
            Platform varchar NOT NULL,
            Year numeric NOT NULL,
            Genre varchar NOT NULL,
            Publisher varchar NOT NULL,
            NA_Sales numeric NOT NULL,
            EU_Sales numeric NOT NULL,
            JP_Sales numeric NOT NULL,
            Other_Sales numeric NOT NULL,
            Global_Sales numeric NOT NULL
        );''')

        # Parameterized insert: values are passed separately from the SQL text.
        insert_sql = '''INSERT INTO Formule VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
        for row in df.itertuples():
            cur.execute(insert_sql,
                        (row.Name, row.Platform, row.Year, row.Genre,
                         row.Publisher, row.NA_Sales, row.EU_Sales,
                         row.JP_Sales, row.Other_Sales, row.Global_Sales))

        # 5. Find inconsistent rows. A tolerance (0.1) is used instead of
        # strict equality because the sales figures are floating-point values.
        cur.execute("""SELECT Name, Platform, ROUND((NA_Sales + EU_Sales + JP_Sales + Other_Sales)::numeric, 2), Global_Sales
            FROM Formule
            WHERE ABS(NA_Sales + EU_Sales + JP_Sales + Other_Sales - Global_Sales) > 0.1""")
        for row in cur.fetchall():
            print(row)

        # 6. Recompute Global_Sales for the inconsistent rows.
        cur.execute("""UPDATE Formule
            SET Global_Sales = ROUND((NA_Sales + EU_Sales + JP_Sales + Other_Sales)::numeric, 2)
            WHERE ABS(NA_Sales + EU_Sales + JP_Sales + Other_Sales - Global_Sales) > 0.1;""")

        # Persist the table creation, the inserts and the correction.
        co.commit()

    # --- Disabled analysis/plotting code, kept for later work ---------------
    # 7. Build the clean DataFrame
    # dp = pd.read_sql("SELECT * FROM Formule;", con=co)  # TODO: review this line
    # print(dp)
    # # 1. Average sales per release year for the Adventure genre
    # df = pd.read_sql("""SELECT year, AVG(Global_Sales) sales_avg, SUM(Global_Sales) sales_total
    # FROM Formule
    # WHERE Genre = 'Adventure' AND year IS NOT NULL
    # GROUP BY year
    # ORDER BY year;""", con=co)
    # # 2/3. Display as a line chart
    # fig = df.plot(x='year', y=['sales_avg', 'sales_total'])
    # fig.legend(['Ventes moyennes', 'Ventes totales'])
    # fig.set_xlabel('Année de sortie')
    # fig.set_ylabel('Ventes (en millions)')
    # # 4. Total sales for all games
    # df = pd.read_sql("""SELECT year, Genre, SUM(Global_Sales) sales_total
    # FROM Formule
    # WHERE year IS NOT NULL
    # GROUP BY year, Genre
    # ORDER BY year;""", con=co)
    # fig, ax = plt.subplots()
    # for i, (label, group) in enumerate(df.groupby(['Genre'])):
    #     ax = group.plot(ax=ax, kind='line', x='year',
    #                     y='sales_total', label=label, color=plt.cm.tab20.colors[i])
    # ax.set_xlabel('Année de sortie')
    # ax.set_ylabel('Ventes (en millions)')
except Exception as error:
    # psy.DatabaseError is a subclass of Exception, so this catches the same
    # errors as before. The script reports the problem and ends; closing the
    # connection below discards any uncommitted work.
    print(error)
finally:
    # Always release the connection, whether or not the load succeeded.
    if co is not None:
        co.close()