streamlit #21

Merged
remi.arnal merged 10 commits from streamlit into master 10 months ago

@@ -0,0 +1,31 @@
kind: pipeline
type: docker
name: Pow

trigger:
  event:
  - push

steps:
- name: build-pow
  image: plugins/docker
  settings:
    dockerfile: ./src/Dockerfile
    context: ./src
    registry: hub.codefirst.iut.uca.fr
    repo: hub.codefirst.iut.uca.fr/dorian.hodin/pow
    username:
      from_secret: SECRET_USERNAME
    password:
      from_secret: SECRET_PASSWD

- name: deploy-pow
  image: hub.codefirst.iut.uca.fr/thomas.bellembois/codefirst-dockerproxy-clientdrone:latest
  environment:
    IMAGENAME: hub.codefirst.iut.uca.fr/dorian.hodin/pow:latest
    CONTAINERNAME: pow
    COMMAND: create
    OVERWRITE: true
    ADMINS: dorianhodin,aurianjault,remiarnal
  depends_on: [ build-pow ]

.gitignore

@@ -159,4 +159,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
#.idea/

@@ -0,0 +1,2 @@
[client]
showSidebarNavigation = false

@@ -0,0 +1,11 @@
FROM python:3.9
WORKDIR /app
COPY . .
RUN pip install --upgrade pip
RUN pip install streamlit matplotlib pandas scikit-learn
EXPOSE 8501
ENTRYPOINT ["streamlit", "run", "home.py", "--server.port=8501", "--server.address=0.0.0.0"]

@@ -12,7 +12,7 @@ def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()
    return plt.gcf()

def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
    fig = plt.figure(figsize=(10, 7))
@@ -56,7 +56,7 @@ def calculate_cluster_statistics_dbscan(X, labels):
        })
    return stats

def launch_cluster_knn(df,array_columns,n):
def launch_cluster_knn(df, array_columns, n=3):
    X = df[array_columns].values
    kmeans = KMeans(n_clusters=n, random_state=42)
@@ -67,12 +67,11 @@ def launch_cluster_knn(df,array_columns,n):
    stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
    if len(array_columns) == 3:
        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
        return visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
    else:
        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
    return stats_kmeans
        return visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")

def launch_cluster_DBSCAN(df, array_columns):
def launch_cluster_dbscan(df, array_columns):
    X = df[array_columns].values
    dbscan = DBSCAN(eps=0.2, min_samples=5)
    labels_dbscan = dbscan.fit_predict(X)
@@ -80,12 +79,12 @@ def launch_cluster_DBSCAN(df, array_columns):
    # for stat in stats_dbscan:
    #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
    if len(array_columns) == 3:
        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
        return visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
    else:
        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
        return visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
    return stats_dbscan

def launch_cluster(df,array_columns):
def launch_cluster(df, array_columns):
    X = df[array_columns].values
    kmeans = KMeans(n_clusters=4, random_state=42)
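
For context, a minimal sketch of how the new return-a-figure pattern is consumed from a Streamlit page; the toy dataframe and the two-cluster call are illustrative assumptions, not part of this PR:

import pandas as pd
import streamlit as st
import clustering_csv as cc

df = pd.DataFrame({"x": [0, 1, 2, 10, 11, 12],
                   "y": [0, 1, 0, 10, 11, 10]})
# launch_cluster_knn now returns the Matplotlib figure instead of calling
# plt.show(), so the page can hand it straight to st.pyplot.
st.pyplot(cc.launch_cluster_knn(df, ["x", "y"], n=2))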

@@ -2,6 +2,7 @@ import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def return_csv(path):
    df = pd.read_csv(path)
    return df
@@ -13,7 +14,6 @@ def csv_value(df):
    print(df.isna().sum())

# Useless values
def csv_check(df):
    for col in df:
        print("-"*12)
@@ -21,31 +21,47 @@ def csv_check(df):
        print("-"*12)
        print(df[col].unique())

def do_for_columns(df):
    for col_name in df:
        df[col_name] = function(df[col_name])

def csv_norm_min_max(df,col):
    maValue = df[col].max
    miValue = df[col].min
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df

def csv_standardisation_Z(df,col):
def csv_norm_min_max(df, col):
    max_value = df[col].max()
    min_value = df[col].min()
    df[col] = (df[col] - min_value) / (max_value - min_value)
    return df[col]

def csv_standardisation_Z(df, col):
    mean_col1 = df[col].mean()
    std_col1 = df[col].std()
    df[col] = (df[col] - mean_col1) / std_col1
    return df[col]

def csv_robust_normalize(df, column):
def csv_robust_normalize(df, col):
    # Compute the median and the IQR
    median = df[column].median()
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    median = df[col].median()
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    # Apply the robust normalization
    normalized_column = (df[column] - median) / iqr
    df[column] = normalized_column
    print (normalized_column)
    normalized_column = (df[col] - median) / iqr
    df[col] = normalized_column
    return normalized_column

def handle_normalization(df, norm_method):
    if norm_method == "min-max":
        for col_name in df:
            df[col_name] = csv_norm_min_max(df, col_name)
        return df
    elif norm_method == "z-score":
        for col_name in df:
            df[col_name] = csv_standardisation_Z(df, col_name)
        return df
    elif norm_method == "robust":
        for col_name in df:
            df[col_name] = csv_robust_normalize(df, col_name)
        return df
    else:
        raise ValueError("Unknown method")
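
A worked example of the new handle_normalization dispatcher; the three-value column is an illustrative assumption:

import pandas as pd
import load_csv as lc

df = pd.DataFrame({"a": [0.0, 5.0, 10.0]})
lc.handle_normalization(df.copy(), "min-max")  # a -> 0.0, 0.5, 1.0
lc.handle_normalization(df.copy(), "z-score")  # a -> -1.0, 0.0, 1.0 (sample std is 5)
lc.handle_normalization(df.copy(), "robust")   # a -> -1.0, 0.0, 1.0 (IQR is 5)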

@@ -60,8 +60,6 @@ def impute_with_regression(data):
- n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn')
"""
def handle_missing_values(data, method, n_neighbors=5):
    data = drop_high_null_percentage(data)
    data = convert_categorical_to_numeric(data)
    if method == 'mean':
        return replace_with_mean(data)
@@ -74,4 +72,4 @@ def handle_missing_values(data, method, n_neighbors=5):
    elif method == 'regression':
        return impute_with_regression(data)
    else:
        raise ValueError("Unknown method")
        raise ValueError("Unknown method")
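
A minimal call into the dispatcher after this change; note that drop_high_null_percentage is no longer applied inside it, since the caller (pages/clean.py) now invokes it explicitly. The tiny frame is an illustrative assumption:

import numpy as np
import pandas as pd
import managing_missing_values as mmv

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, 6.0]})
mmv.handle_missing_values(df, "mean")                # assuming replace_with_mean fills column means -> 2.0
mmv.handle_missing_values(df, "knn", n_neighbors=2)  # KNN-based variant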

@@ -2,18 +2,16 @@ from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
    predictors = df[columns]
    target = df[columnGoal]
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)

    if algoOfPrediction == "Régression Linéaire":
    if algoOfPrediction == "Linear Regression":
        model = LinearRegression()
    elif algoOfPrediction == "Forêt Aléatoire":
        model = RandomForestRegressor(n_estimators=100)
    elif algoOfPrediction == "Random Forest":
        model = RandomForestRegressor(n_estimators=100)
    else:
        raise NameError("No method name : \"" + algoOfPrediction + "\"")

    model.fit(X_train, y_train)
    return model.predict(X_test)
    model.fit(predictors, target)
    return model.predict(predictors)
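
After this change the model is fit on every row and asked to predict those same rows (the train/test split above is computed but no longer used), so the returned values are in-sample predictions rather than a held-out evaluation. A minimal sketch with an assumed toy frame:

import pandas as pd
import prediction as p

df = pd.DataFrame({"f1": [1, 2, 3, 4],
                   "f2": [2, 4, 6, 8],
                   "goal": [3.0, 6.0, 9.0, 12.0]})
preds = p.getColumnsForPredictionAndPredict(df, ["f1", "f2"], "goal", "Linear Regression")
# len(preds) == len(df): predictions are made on the training rows themselves.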

@@ -2,15 +2,15 @@ import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def histo_col(df,colonne):
def histo_col(df, col):
    plt.figure()
    plt.hist(df[colonne], bins=int(df[colonne].nunique()/4), alpha=0.7, color='blue', edgecolor='black')
    plt.title(f"Histogram of column '{colonne}'")
    plt.xlabel(colonne)
    plt.hist(df[col], bins=4, alpha=0.7, color='blue', edgecolor='black')
    plt.title(f"Histogram of column '{col}'")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()
    return plt.gcf()

def plotBoxWhisker(df):
    df.plot(kind='box', subplots=True, sharex=False, sharey=False)
    plt.show()
def plotBoxWhisker(df, col):
    df[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
    return plt.gcf()

@@ -0,0 +1,53 @@
import streamlit as st
from io import StringIO
from ydata_profiling import ProfileReport
import pandas as pd

def statistics(df):
    nan_counts = df.isnull().sum(axis=1).sum()
    st.write("*Number of columns*:", len(df.columns))
    st.write("*Number of rows*:", len(df.index))
    st.write("*Nan Counts*: ", nan_counts)
    st.write(df.isna().sum())

def display_df_first_and_lasts_lines(df):
    fl = df.head(10)
    ll = df.tail(10)
    concat = pd.concat([fl, ll])
    st.dataframe(concat)

def nav_bar():
    st.page_link("./home.py", label="Import", icon="⬆️", help=None)
    st.page_link("pages/clean.py", label="Clean", icon="🧼", help=None)
    st.page_link("pages/visualize.py", label="Visualize", icon="👁️", help=None)
    st.page_link("pages/prediction.py", label="Predict", icon="🔮", help=None)
    st.page_link("pages/evaluate.py", label="Evaluate", icon=None, help=None)

def clean_dataframe(line):
    # Call to function to clean data
    line.empty()
    line.write("Dataframe has been cleaned")

def main():
    nav_bar()
    st.write("# Pow: Your data analyser")
    uploaded_file = st.file_uploader("Choose a file")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        st.session_state.original_df = df
        st.write("## Dataframe (10 first/last lines)")
        display_df_first_and_lasts_lines(df)
        st.write("## Statistics")
        statistics(df)
        profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
        profile.to_widgets()
        if st.button("Next"):
            st.switch_page("pages/clean.py")

main()
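
One caveat worth flagging: ProfileReport.to_widgets() targets Jupyter and produces no visible output in a Streamlit run. A common workaround, sketched here as an assumption rather than what the PR ships, is to embed the report's HTML:

import pandas as pd
import streamlit.components.v1 as components
from ydata_profiling import ProfileReport

df = pd.DataFrame({"a": [1, 2, 3]})
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
# Render the self-contained HTML report inside the Streamlit page.
components.html(profile.to_html(), height=600, scrolling=True)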

@@ -0,0 +1,44 @@
import streamlit as st
import sys
sys.path.append('./back/')
import managing_missing_values as mmv
import load_csv as lc

if 'original_df' in st.session_state:
    df = st.session_state.original_df

    st.write("# 🧼 Data cleaning")
    st.write("## Missing data")
    rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)

    st.write("#### Replace missing values")
    replace_methods = ["mean","median","mode","knn","regression"]
    replace_method = st.radio('Choose an option:', replace_methods)

    st.write("## Normalize data")
    normalize_methods = ["min-max","z-score","robust"]
    normalize_method = st.radio('Choose an option:', normalize_methods)

    is_cleaned = st.button("Clean dataset")
    if is_cleaned:
        if rm_empty_rows_or_cols:
            st.write("- Removing high null percentage values")
            df = mmv.drop_high_null_percentage(df)
            st.dataframe(df)

        st.write("- Handle missing values with method:", replace_method)
        df = mmv.handle_missing_values(df, replace_method)
        st.session_state.df = df
        st.dataframe(df)

        st.write("- Normalize with method:", normalize_method)
        df = lc.handle_normalization(df, normalize_method)
        st.session_state.df = df
        st.dataframe(df)

        st.switch_page("pages/visualize.py")
else:
    st.write("Please upload your dataset.")

@@ -0,0 +1,41 @@
import streamlit as st
import pandas as pd
import sys
sys.path.append('./back/')
import clustering_csv as cc
import prediction as p

if 'df' in st.session_state:
    df = st.session_state.df
    df_cols = df.columns.tolist()

    st.write("# 🔮 Prediction")

    if st.button("K-means"):
        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))

    if st.button("DBSCAN"):
        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))

    original_col, predicted_col = None, None

    if st.button("Linear Regression"):
        col = "Route Type"
        df_cols.remove(col)
        original_col = df[col]
        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")

    if st.button("Random Forest"):
        col = "Route Type"
        df_cols.remove(col)
        original_col = df[col]
        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Random Forest")

    # Only render the comparison once one of the models has produced predictions.
    if predicted_col is not None:
        ndf = pd.DataFrame()
        ndf['Original'] = original_col
        ndf['Predicted'] = predicted_col
        st.dataframe(ndf)
else:
    st.write("Please clean your dataset.")

@@ -0,0 +1,32 @@
import streamlit as st
import matplotlib.pyplot as plt
import sys
sys.path.append('./back/')
import show_csv as sc

if 'df' in st.session_state:
    df = st.session_state.df
    df_columns = df.columns.tolist()

    st.write("# 📊 Visualization")

    st.write("## Histograms")
    hist_tabs = st.tabs(df_columns)
    for idx, tab in enumerate(hist_tabs):
        tab.write("##### " + df_columns[idx])
        tab.pyplot(sc.histo_col(df, df_columns[idx]))

    st.write("## Box & Whisker")
    baw_tabs = st.tabs(df_columns)
    for idx, tab in enumerate(baw_tabs):
        tab.write("##### " + df_columns[idx])
        fig, ax = plt.subplots()
        df[df_columns[idx]].plot(kind='box', ax=ax)
        tab.pyplot(fig)
else:
    st.write('Please clean your dataset.')