You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
94 lines
3.5 KiB
94 lines
3.5 KiB
import pandas as pd
|
|
import psycopg2 as psy
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from getpass import getpass
|
|
|
|
# Load the raw video-game sales data from the CSV file in the working directory.
data = pd.read_csv(r'vgsales.csv')

# 3. Remove duplicates: keep only the first row for each (Name, Platform) pair.
# This must happen BEFORE the rows are inserted, otherwise the duplicates end
# up in the database table anyway.
df = data.drop_duplicates(subset=['Name', 'Platform'])

# Stays None if the connection attempt fails, so the `finally` clause knows
# whether there is anything to close.
co = None
try:
    co = psy.connect(host='londres',
                     database='dbanperederi',
                     user='anperederi',
                     password=getpass())

    # The context manager closes the cursor even when a statement fails.
    with co.cursor() as cur:
        # Rebuild the table from scratch on every run.
        cur.execute('''DROP TABLE IF EXISTS Formule ;''')
        # No trailing comma after the last column: PostgreSQL rejects
        # "..., );" with a syntax error.
        # NOTE(review): every column is NOT NULL; if the CSV has missing
        # Year/Publisher values, confirm how they should be stored.
        cur.execute('''CREATE TABLE Formule (
            Name varchar(150) NOT NULL,
            Platform varchar NOT NULL,
            Year numeric NOT NULL,
            Genre varchar NOT NULL,
            Publisher varchar NOT NULL,
            NA_Sales numeric NOT NULL,
            EU_Sales numeric NOT NULL,
            JP_Sales numeric NOT NULL,
            Other_Sales numeric NOT NULL,
            Global_Sales numeric NOT NULL
        );''')

        # Insert the de-duplicated rows with a parameterized statement.
        insert_sql = '''INSERT INTO Formule VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
        for row in df.itertuples():
            cur.execute(insert_sql,
                        (row.Name, row.Platform, row.Year, row.Genre, row.Publisher,
                         row.NA_Sales, row.EU_Sales, row.JP_Sales,
                         row.Other_Sales, row.Global_Sales))

        # 5. Find inconsistent rows: the regional sales should add up to the
        # global sales. Compared with a tolerance because the values are
        # decimal fractions.
        cur.execute("""SELECT Name, Platform, ROUND((NA_Sales + EU_Sales + JP_Sales + Other_Sales)::numeric, 2), Global_Sales
                       FROM Formule
                       WHERE ABS(NA_Sales + EU_Sales + JP_Sales + Other_Sales - Global_Sales) > 0.1""")

        for row in cur.fetchall():
            print(row)

        # 6. Recompute the global sales for the inconsistent rows.
        cur.execute("""UPDATE Formule
                       SET Global_Sales = ROUND((NA_Sales + EU_Sales + JP_Sales + Other_Sales)::numeric, 2)
                       WHERE ABS(NA_Sales + EU_Sales + JP_Sales + Other_Sales - Global_Sales) > 0.1;""")

    # Persist the table creation, the inserts and the corrections.
    co.commit()

    # 7. Build the clean DataFrame (disabled work in progress).
    # dp = pd.read_sql("SELECT * FROM Formule;", con=co)  #! Check this line !!!
    # print(dp)

    # # 1. Average sales per release year for the Adventure genre
    # df = pd.read_sql("""SELECT year, AVG(Global_Sales) sales_avg, SUM(Global_Sales) sales_total
    #                     FROM Formule
    #                     WHERE Genre = 'Adventure' AND year IS NOT NULL
    #                     GROUP BY year
    #                     ORDER BY year;""", con=co)

    # # 2/3. Display as a line chart
    # fig = df.plot(x='year', y=['sales_avg', 'sales_total'])
    # fig.legend(['Ventes moyennes', 'Ventes totales'])
    # fig.set_xlabel('Année de sortie')
    # fig.set_ylabel('Ventes (en millions)')

    # # 4. Total sales for all games
    # df = pd.read_sql("""SELECT year, Genre, SUM(Global_Sales) sales_total
    #                     FROM Formule
    #                     WHERE year IS NOT NULL
    #                     GROUP BY year, Genre
    #                     ORDER BY year;""", con=co)
    # fig, ax = plt.subplots()
    # for i, (label, group) in enumerate(df.groupby(['Genre'])):
    #     ax = group.plot(ax=ax, kind='line', x='year',
    #                     y='sales_total', label=label, color=plt.cm.tab20.colors[i])
    # ax.set_xlabel('Année de sortie')
    # ax.set_ylabel('Ventes (en millions)')

except Exception as error:
    # Top-level boundary of the script: report the failure (database errors
    # included, since psycopg2's DatabaseError derives from Exception) and
    # fall through to the cleanup below.
    print(error)
finally:
    # Closing the connection discards any transaction that was not committed.
    if co is not None:
        co.close()
|