diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..028ee22
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,31 @@
+kind: pipeline
+type: docker
+name: Pow
+
+trigger:
+  event:
+    - push
+
+steps:
+
+  - name: build-pow
+    image: plugins/docker
+    settings:
+      dockerfile: ./src/Dockerfile
+      context: ./src
+      registry: hub.codefirst.iut.uca.fr
+      repo: hub.codefirst.iut.uca.fr/dorian.hodin/pow
+      username:
+        from_secret: SECRET_USERNAME
+      password:
+        from_secret: SECRET_PASSWD
+
+  - name: deploy-pow
+    image: hub.codefirst.iut.uca.fr/thomas.bellembois/codefirst-dockerproxy-clientdrone:latest
+    environment:
+      IMAGENAME: hub.codefirst.iut.uca.fr/dorian.hodin/pow:latest
+      CONTAINERNAME: pow
+      COMMAND: create
+      OVERWRITE: true
+      ADMINS: dorianhodin,aurianjault,remiarnal
+    depends_on: [ build-pow ]
diff --git a/.gitignore b/.gitignore
index 4df8670..2672e85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,4 +159,4 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+#.idea/
\ No newline at end of file
diff --git a/.streamlit/config.toml b/.streamlit/config.toml
new file mode 100644
index 0000000..7c595d0
--- /dev/null
+++ b/.streamlit/config.toml
@@ -0,0 +1,2 @@
+[client]
+showSidebarNavigation = false
diff --git a/src/Dockerfile b/src/Dockerfile
new file mode 100644
index 0000000..7107b12
--- /dev/null
+++ b/src/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.9
+
+WORKDIR /app
+COPY . .
+
+RUN pip install --upgrade pip
+RUN pip install streamlit matplotlib pandas scikit-learn ydata-profiling
+
+EXPOSE 8501
+
+ENTRYPOINT ["streamlit", "run", "home.py", "--server.port=8501", "--server.address=0.0.0.0"]
diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
index dcb170d..fb93b4e 100644
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -12,7 +12,7 @@ def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
     plt.title(title)
     plt.xlabel("Feature 1")
     plt.ylabel("Feature 2")
-    plt.show()
+    return plt.gcf()
 
 def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
     fig = plt.figure(figsize=(10, 7))
@@ -56,7 +56,7 @@ def calculate_cluster_statistics_dbscan(X, labels):
         })
     return stats
 
-def launch_cluster_knn(df,array_columns,n):
+def launch_cluster_knn(df, array_columns, n=3):
     X = df[array_columns].values
 
     kmeans = KMeans(n_clusters=n, random_state=42)
@@ -67,12 +67,11 @@ def launch_cluster_knn(df, array_columns, n=3):
     stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
 
     if len(array_columns) == 3:
-        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
+        return visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
     else:
-        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
-    return stats_kmeans
+        return visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
 
-def launch_cluster_DBSCAN(df, array_columns):
+def launch_cluster_dbscan(df, array_columns):
     X = df[array_columns].values
     dbscan = DBSCAN(eps=0.2, min_samples=5)
     labels_dbscan = dbscan.fit_predict(X)
@@ -80,12 +79,11 @@ def launch_cluster_dbscan(df, array_columns):
     # for stat in stats_dbscan:
     #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
     if len(array_columns) == 3:
-        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
+        return visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
     else:
-        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
-    return stats_dbscan
+        return visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
 
-def launch_cluster(df,array_columns):
+def launch_cluster(df, array_columns):
     X = df[array_columns].values
 
     kmeans = KMeans(n_clusters=4, random_state=42)
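Note on the clustering helpers: they now return a Matplotlib figure (via `plt.gcf()`) instead of calling `plt.show()`, so a Streamlit page can pass the result straight to `st.pyplot`. A minimal sketch of how the two entry points might be exercised, assuming `src/back` is on the import path; the synthetic data and the `f1`/`f2` column names are illustrative, not from the app:

```python
# Sketch only: exercising launch_cluster_knn / launch_cluster_dbscan
# on throwaway two-feature data ("f1"/"f2" are made-up column names).
import pandas as pd
from sklearn.datasets import make_blobs

from clustering_csv import launch_cluster_knn, launch_cluster_dbscan

X, _ = make_blobs(n_samples=200, centers=3, n_features=2, random_state=42)
df = pd.DataFrame(X, columns=["f1", "f2"])

fig_kmeans = launch_cluster_knn(df, ["f1", "f2"], n=3)  # 2 columns -> visualize_clusters_2d
fig_dbscan = launch_cluster_dbscan(df, ["f1", "f2"])    # eps/min_samples are hard-coded in the helper
fig_kmeans.savefig("kmeans.png")
fig_dbscan.savefig("dbscan.png")
```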
diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 25d5ad9..83f111f 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -13,7 +13,6 @@ def csv_value(df):
     print(df.isna().sum())
 
 # Useless values
-
 def csv_check(df):
     for col in df:
         print("-"*12)
@@ -21,31 +20,47 @@ def csv_check(df):
     print("-"*12)
     print(df[col].unique())
 
+def do_for_columns(df, func):
+    for col_name in df:
+        df[col_name] = func(df[col_name])
 
-def csv_norm_min_max(df,col):
-    maValue = df[col].max
-    miValue = df[col].min
-    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
-    return df
-def csv_standardisation_Z(df,col):
+def csv_norm_min_max(df, col):
+    max_value = df[col].max()
+    min_value = df[col].min()
+    df[col] = (df[col] - min_value) / (max_value - min_value)
+    return df[col]
+
+def csv_standardisation_Z(df, col):
     mean_col1 = df[col].mean()
     std_col1 = df[col].std()
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
 
-def csv_robust_normalize(df, column):
+def csv_robust_normalize(df, col):
     # Compute the median and the IQR
-    median = df[column].median()
-    q1 = df[column].quantile(0.25)
-    q3 = df[column].quantile(0.75)
+    median = df[col].median()
+    q1 = df[col].quantile(0.25)
+    q3 = df[col].quantile(0.75)
     iqr = q3 - q1
 
     # Apply the robust normalization
-    normalized_column = (df[column] - median) / iqr
-    df[column] = normalized_column
-    print (normalized_column)
+    normalized_column = (df[col] - median) / iqr
+    df[col] = normalized_column
     return normalized_column
-
-
+
+def handle_normalization(df, norm_method):
+    if norm_method == "min-max":
+        for col_name in df:
+            df[col_name] = csv_norm_min_max(df, col_name)
+        return df
+    elif norm_method == "z-score":
+        for col_name in df:
+            df[col_name] = csv_standardisation_Z(df, col_name)
+        return df
+    elif norm_method == "robust":
+        for col_name in df:
+            df[col_name] = csv_robust_normalize(df, col_name)
+        return df
+    else:
+        raise ValueError("Unknown method")
diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py
index 7ee920f..24ac773 100644
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -60,8 +60,6 @@ def impute_with_regression(data):
 - n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn')
 """
 def handle_missing_values(data, method, n_neighbors=5):
-
-    data = drop_high_null_percentage(data)
     data = convert_categorical_to_numeric(data)
     if method == 'mean':
         return replace_with_mean(data)
@@ -74,4 +72,4 @@ def handle_missing_values(data, method, n_neighbors=5):
     elif method == 'regression':
         return impute_with_regression(data)
     else:
-        raise ValueError("Unknown method")
\ No newline at end of file
+        raise ValueError("Unknown method")
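For context, `handle_normalization` dispatches to one of the three per-column normalizers added above. A quick sketch with throwaway data (all-numeric columns, which is what the cleaning page produces after `convert_categorical_to_numeric`):

```python
# Illustrative only: the three normalization modes on a tiny numeric frame.
import pandas as pd
from load_csv import handle_normalization  # assumes src/back is on the path

df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                   "b": [10.0, 20.0, 30.0, 40.0]})

print(handle_normalization(df.copy(), "min-max"))  # each column rescaled to [0, 1]
print(handle_normalization(df.copy(), "z-score"))  # (x - mean) / std per column
print(handle_normalization(df.copy(), "robust"))   # (x - median) / IQR per column
```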
diff --git a/src/back/prediction.py b/src/back/prediction.py
index 09c7556..1700d72 100644
--- a/src/back/prediction.py
+++ b/src/back/prediction.py
@@ -2,18 +2,16 @@
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
-
 def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
     predictors = df[columns]
     target = df[columnGoal]
-    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)
 
-    if algoOfPrediction == "Régression Linéaire":
+    if algoOfPrediction == "Linear Regression":
         model = LinearRegression()
-    elif algoOfPrediction == "Forêt Aléatoire":
-        model = RandomForestRegressor(n_estimators=100)
+    elif algoOfPrediction == "Random Forest":
+        model = RandomForestRegressor(n_estimators=100)
     else:
         raise NameError("No method name : \"" + algoOfPrediction + "\"")
-
-    model.fit(X_train, y_train)
-    return model.predict(X_test)
\ No newline at end of file
+    model.fit(predictors, target)
+    return model.predict(predictors)
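Worth flagging: with the `train_test_split` call dropped, the model is fitted and evaluated on the same rows. That suits the "Original vs Predicted" table on the prediction page, but it measures in-sample fit only. A hedged sketch of a held-out variant (`predict_holdout` is a hypothetical helper, not part of this patch):

```python
# Hypothetical helper, not in the patch: keeps a held-out test split so the
# returned score reflects generalization rather than memorization.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

def predict_holdout(df, columns, column_goal):
    X_train, X_test, y_train, y_test = train_test_split(
        df[columns], df[column_goal], test_size=0.2, random_state=42
    )
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred, r2_score(y_test, y_pred)
```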
diff --git a/src/back/show_csv.py b/src/back/show_csv.py
index 93d9973..cd8a293 100644
--- a/src/back/show_csv.py
+++ b/src/back/show_csv.py
@@ -2,15 +2,15 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 
-def histo_col(df,colonne):
+def histo_col(df, col):
     plt.figure()
-    plt.hist(df[colonne], bins=int(df[colonne].nunique()/4), alpha=0.7, color='blue', edgecolor='black')
-    plt.title(f"Histogramme de la colonne '{colonne}'")
-    plt.xlabel(colonne)
+    plt.hist(df[col], bins=4, alpha=0.7, color='blue', edgecolor='black')
+    plt.title(f"Histogram of column '{col}'")
+    plt.xlabel(col)
     plt.ylabel("Frequency")
     plt.grid(True)
-    plt.show()
+    return plt.gcf()
 
-def plotBoxWhisker(df):
-    df.plot(kind='box', subplots=True, sharex=False, sharey=False)
-    plt.show()
+def plotBoxWhisker(df, col):
+    df[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
+    return plt.gcf()
diff --git a/src/home.py b/src/home.py
new file mode 100644
index 0000000..754dc65
--- /dev/null
+++ b/src/home.py
@@ -0,0 +1,53 @@
+import streamlit as st
+import streamlit.components.v1 as components
+from ydata_profiling import ProfileReport
+import pandas as pd
+
+def statistics(df):
+    nan_counts = df.isnull().sum(axis=1).sum()
+
+    st.write("*Number of columns*:", len(df.columns))
+    st.write("*Number of rows*:", len(df.index))
+
+    st.write("*NaN count*: ", nan_counts)
+    st.write(df.isna().sum())
+
+def display_df_first_and_lasts_lines(df):
+    fl = df.head(10)
+    ll = df.tail(10)
+    concat = pd.concat([fl, ll])
+    st.dataframe(concat)
+
+def nav_bar():
+    st.page_link("./home.py", label="Import", icon="⬆️", help=None)
+    st.page_link("pages/clean.py", label="Clean", icon="🧼", help=None)
+    st.page_link("pages/visualize.py", label="Visualize", icon="👁️", help=None)
+    st.page_link("pages/prediction.py", label="Predict", icon="🔮", help=None)
+    st.page_link("pages/evaluate.py", label="Evaluate", icon=None, help=None)
+
+def clean_dataframe(line):
+    # Call to function to clean data
+    line.empty()
+    line.write("Dataframe has been cleaned")
+
+def main():
+    nav_bar()
+    st.write("# Pow: Your data analyser")
+
+    uploaded_file = st.file_uploader("Choose a file")
+    if uploaded_file is not None:
+        df = pd.read_csv(uploaded_file)
+        st.session_state.original_df = df
+        st.write("## Dataframe (10 first/last lines)")
+        display_df_first_and_lasts_lines(df)
+
+        st.write("## Statistics")
+        statistics(df)
+
+        profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
+        components.html(profile.to_html(), height=600, scrolling=True)
+
+        if st.button("Next"):
+            st.switch_page("pages/clean.py")
+
+main()
diff --git a/src/pages/clean.py b/src/pages/clean.py
new file mode 100644
index 0000000..f64bd49
--- /dev/null
+++ b/src/pages/clean.py
@@ -0,0 +1,43 @@
+import streamlit as st
+import sys
+sys.path.append('./back/')
+
+import managing_missing_values as mmv
+import load_csv as lc
+
+if 'original_df' in st.session_state:
+    df = st.session_state.original_df
+
+    st.write("# 🧼 Data cleaning")
+
+    st.write("## Missing data")
+    rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)
+
+    st.write("#### Replace missing values")
+    replace_methods = ["mean", "median", "mode", "knn", "regression"]
+    replace_method = st.radio('Choose an option:', replace_methods)
+
+    st.write("## Normalize data")
+    normalize_methods = ["min-max", "z-score", "robust"]
+    normalize_method = st.radio('Choose an option:', normalize_methods)
+
+    is_cleaned = st.button("Clean dataset")
+    if is_cleaned:
+        if rm_empty_rows_or_cols:
+            st.write("- Removing high null percentage values")
+            df = mmv.drop_high_null_percentage(df)
+            st.dataframe(df)
+
+        st.write("- Handle missing values with method:", replace_method)
+        df = mmv.handle_missing_values(df, replace_method)
+        st.session_state.df = df
+        st.dataframe(df)
+
+        st.write("- Normalize with method:", normalize_method)
+        df = lc.handle_normalization(df, normalize_method)
+        st.session_state.df = df
+        st.dataframe(df)
+
+        st.switch_page("pages/visualize.py")
+else:
+    st.write("Please upload your dataset.")
diff --git a/src/pages/evaluate.py b/src/pages/evaluate.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pages/prediction.py b/src/pages/prediction.py
new file mode 100644
index 0000000..6892c69
--- /dev/null
+++ b/src/pages/prediction.py
@@ -0,0 +1,39 @@
+import streamlit as st
+import pandas as pd
+import sys
+sys.path.append('./back/')
+
+import clustering_csv as cc
+import prediction as p
+
+if 'df' in st.session_state:
+
+    df = st.session_state.df
+    df_cols = df.columns.tolist()
+
+    st.write("# 🔮 Prediction")
+
+    if st.button("K-means"):
+        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))
+
+    if st.button("DBSCAN"):
+        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))
+
+    if st.button("Linear Regression"):
+        col = "Route Type"
+        df_cols.remove(col)
+        original_col = df[col]
+        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, col, "Linear Regression")
+        ndf = pd.DataFrame({'Original': original_col, 'Predicted': predicted_col})
+        st.dataframe(ndf)
+
+    if st.button("Random Forest"):
+        col = "Route Type"
+        df_cols.remove(col)
+        original_col = df[col]
+        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, col, "Random Forest")
+        ndf = pd.DataFrame({'Original': original_col, 'Predicted': predicted_col})
+        st.dataframe(ndf)
+
+else:
+    st.write("Please clean your dataset.")
diff --git a/src/pages/visualize.py b/src/pages/visualize.py
new file mode 100644
index 0000000..d15ff23
--- /dev/null
+++ b/src/pages/visualize.py
@@ -0,0 +1,32 @@
+import streamlit as st
+import matplotlib.pyplot as plt
+
+import sys
+sys.path.append('./back/')
+
+import show_csv as sc
+
+if 'df' in st.session_state:
+
+    df = st.session_state.df
+    df_columns = df.columns.tolist()
+
+    st.write("# 📊 Visualization")
+
+    st.write("## Histograms")
+    hist_tabs = st.tabs(df_columns)
+
+    for idx, tab in enumerate(hist_tabs):
+        tab.write("##### " + df_columns[idx])
+        tab.pyplot(sc.histo_col(df, df_columns[idx]))
+
+    st.write("## Box & Whisker")
+    baw_tabs = st.tabs(df_columns)
+
+    for idx, tab in enumerate(baw_tabs):
+        tab.write("##### " + df_columns[idx])
+        fig, ax = plt.subplots()
+        df[df_columns[idx]].plot(kind='box', ax=ax)
+        tab.pyplot(fig)
+else:
+    st.write('Please clean your dataset.')
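`src/pages/evaluate.py` is added empty above, even though the nav bar already links to it. A sketch of what it might host, mirroring the session-state guard the other pages use; `st.session_state.evaluation` is an assumed key that nothing in this patch sets yet:

```python
# Hypothetical content for src/pages/evaluate.py -- not part of this patch.
import streamlit as st
from sklearn.metrics import mean_absolute_error, mean_squared_error

if 'evaluation' in st.session_state:
    # Assumed to be an (original, predicted) pair stored by the prediction page.
    original, predicted = st.session_state.evaluation
    st.write("# Evaluate")
    st.write("*MAE*:", mean_absolute_error(original, predicted))
    st.write("*MSE*:", mean_squared_error(original, predicted))
else:
    st.write("Please run a prediction first.")
```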