Compare commits

..

19 Commits

Author SHA1 Message Date
gorky1234 f464f6166a corection bug figsize
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 3038bd9841 Merge pull request 'Use cluster strategies and support PCA' (#15) from clustering-strategy into main
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 7cb0d55969 Allow using PCA to reduce dataset dimensions
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 01ef19a2f8 Merge files using strategies
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 86bd285193 Merge pull request 'stat_prediction' (#14) from stat_prediction into main
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 9bc9e21e45 add r2 score
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER da1e97f07f add r2 score
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 27e69b2af8 add confusion_matrix
continuous-integration/drone/push Build is passing Details
10 months ago
bastien 4054395641 update
continuous-integration/drone/push Build is failing Details
10 months ago
bastien 01168f3588 add visu to prediction regression
continuous-integration/drone/push Build is failing Details
10 months ago
Bastien OLLIER 9da6e2d594 Add cluster stats (#13)
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 4d82767c68 Add SkLearn to requirements.txt
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 9cb0d90eb1 Add CI/CD (#9)
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 3eac3f6b8d Merge pull request 'Support multiple column delimiters' (#10) from csv-delimiters into main
10 months ago
Clément FRÉVILLE c87308cc21 Support multiple column delimiters
10 months ago
Clément FRÉVILLE d4aeb87f75 Limit the number of neighbors based on the dataframe
10 months ago
Hugo PRADIER 3c5f6849f8 Merge pull request 'Support kNN as an imputation method' (#8) from knn into main
10 months ago
Clément FRÉVILLE cd0c85ea44 Support kNN as an imputation method
10 months ago
Hugo PRADIER 96d390c749 Merge pull request 'Ajout de la prédiction avec deux algos (un de prédiction et un de classification)' (#7) from prediction into main
10 months ago

@ -0,0 +1,44 @@
# Drone CI pipeline: lint the sources, build/push the Docker image, then deploy it.
kind: pipeline
name: default
type: docker
# Run on every push event.
trigger:
event:
- push
steps:
# Step 1: install the project requirements (which include ruff) and lint the repository.
- name: lint
image: python:3.12
commands:
- pip install --root-user-action=ignore -r requirements.txt
- ruff check .
# Step 2: build the image from the Dockerfile and push it to the registry below.
# Registry credentials are read from the REGISTRY_USER / REGISTRY_PASSWORD secrets.
- name: docker-image
image: plugins/docker
settings:
dockerfile: Dockerfile
registry: hub.codefirst.iut.uca.fr
repo: hub.codefirst.iut.uca.fr/bastien.ollier/miner
username:
from_secret: REGISTRY_USER
password:
from_secret: REGISTRY_PASSWORD
# Reuse layers from the previously pushed image when possible.
cache_from:
- hub.codefirst.iut.uca.fr/bastien.ollier/miner:latest
depends_on: [ lint ]
# Step 3: ask the docker proxy client to (re)create the "miner" container from the latest image.
- name: deploy-miner
image: hub.codefirst.iut.uca.fr/clement.freville2/codefirst-dockerproxy-clientdrone:latest
settings:
image: hub.codefirst.iut.uca.fr/bastien.ollier/miner:latest
container: miner
command: create
overwrite: true
admins: bastienollier,clementfreville2,hugopradier2
environment:
DRONE_REPO_OWNER: bastien.ollier
depends_on: [ docker-image ]
# NOTE(review): branch filter (main and ci/*); presumably scoped to the deploy step only —
# confirm against the original indentation.
when:
branch:
- main
- ci/*

@ -0,0 +1,9 @@
# Image serving the Streamlit front-end on port 80.
FROM python:3.12-slim
WORKDIR /app
# Install the dependencies before copying the sources so that this layer is
# reused from the build cache when only application code changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 80
ENTRYPOINT ["streamlit", "run", "frontend/exploration.py", "--server.port=80", "--server.address=0.0.0.0", "--server.baseUrlPath=/containers/bastienollier-miner"]

@ -0,0 +1,83 @@
from sklearn.cluster import DBSCAN, KMeans
import numpy as np
from dataclasses import dataclass
from abc import ABC, abstractmethod
from typing import Any, Optional
@dataclass
class ClusterResult:
    """Outcome of one clustering run.

    Attributes:
        labels: Label assigned to each input row by the algorithm.
        centers: Cluster centers, or ``None`` when the algorithm provides none.
        statistics: One dictionary of summary values per reported cluster.
    """

    # ``np.ndarray`` is the array type; ``np.array`` is only a factory function.
    labels: np.ndarray
    centers: Optional[np.ndarray]
    statistics: list[dict[str, Any]]
class Cluster(ABC):
    """Common interface implemented by every clustering strategy."""

    @abstractmethod
    def run(self, data: np.ndarray) -> ClusterResult:
        """Cluster ``data`` and return the labels, optional centers and statistics."""
class DBSCANCluster(Cluster):
    """DBSCAN clustering strategy reporting the size and density of each cluster."""

    def __init__(self, eps: float = 0.5, min_samples: int = 5):
        self.eps = eps
        self.min_samples = min_samples

    #@typing.override
    def run(self, data: np.ndarray) -> ClusterResult:
        """Fit DBSCAN on ``data``; the result carries no centers."""
        dbscan = DBSCAN(eps=self.eps, min_samples=self.min_samples)
        labels = dbscan.fit_predict(data)
        return ClusterResult(labels, None, self.get_statistics(data, labels))

    def get_statistics(self, data: np.ndarray, labels: np.ndarray) -> list[dict[str, Any]]:
        """Return the point count and density of every cluster.

        The label ``-1`` (DBSCAN's noise label) is skipped. Density is the
        number of points divided by the volume of the cluster's axis-aligned
        bounding box.
        """
        stats = []
        for label in np.unique(labels):
            if label == -1:
                continue
            cluster_points = data[labels == label]
            num_points = len(cluster_points)
            extent = np.max(cluster_points, axis=0) - np.min(cluster_points, axis=0)
            volume = extent.prod()
            # A cluster that is flat along any axis has a zero-volume bounding
            # box: report an infinite density explicitly instead of relying on
            # a division by zero (which also emits a RuntimeWarning).
            density = num_points / volume if volume > 0 else float("inf")
            stats.append({
                "cluster": label,
                "num_points": num_points,
                "density": density,
            })
        return stats

    def __str__(self) -> str:
        return "DBScan"
class KMeansCluster(Cluster):
    """K-means clustering strategy reporting the size and center of each cluster."""

    def __init__(self, n_clusters: int = 8, n_init: int = 1, max_iter: int = 300):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter

    #@typing.override
    def run(self, data: np.ndarray) -> ClusterResult:
        """Fit K-means on ``data`` with random initialization and a fixed seed."""
        kmeans = KMeans(
            n_clusters=self.n_clusters,
            init="random",
            n_init=self.n_init,
            max_iter=self.max_iter,
            random_state=111,
        )
        labels = kmeans.fit_predict(data)
        centers = kmeans.cluster_centers_
        return ClusterResult(labels, centers, self.get_statistics(data, labels, centers))

    def get_statistics(self, data: np.ndarray, labels: np.ndarray, centers: np.ndarray) -> list[dict[str, Any]]:
        """Return the point count and center of every label present in ``labels``."""
        return [
            {
                "cluster": label,
                "num_points": len(data[labels == label]),
                "center": centers[label],
            }
            for label in np.unique(labels)
        ]

    def __str__(self) -> str:
        return "KMeans"
# Strategies offered by the clustering page. These are module-level instances whose
# parameters (eps, n_clusters, ...) are overwritten in place by the page's form widgets.
CLUSTERING_STRATEGIES = [DBSCANCluster(), KMeansCluster()]

@ -1,5 +1,6 @@
import pandas as pd import pandas as pd
import streamlit as st import streamlit as st
import codecs
st.set_page_config( st.set_page_config(
page_title="Project Miner", page_title="Project Miner",
@ -9,10 +10,13 @@ st.set_page_config(
st.title("Home")

### Exploration
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv", "tsv"])
separator = st.selectbox("Separator", [",", ";", "\\t"])
# The select box shows the escaped form ("\\t"); decode it to the real character.
separator = codecs.getdecoder("unicode_escape")(separator)[0]
has_header = st.checkbox("Has header", value=True)

if uploaded_file is not None:
    # header=None tells pandas the file has no header row; header=1 would
    # instead use the second row as column names and drop the first row.
    st.session_state.data = pd.read_csv(uploaded_file, sep=separator, header=0 if has_header else None)
    st.session_state.original_data = st.session_state.data
    st.success("File loaded successfully!")

@ -1,6 +1,7 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pandas import DataFrame, Series from pandas import DataFrame, Series
from pandas.api.types import is_numeric_dtype from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import KNeighborsClassifier
from typing import Any, Union from typing import Any, Union
class DataFrameFunction(ABC): class DataFrameFunction(ABC):
@ -18,11 +19,14 @@ class MVStrategy(DataFrameFunction):
"""A way to handle missing values in a dataframe.""" """A way to handle missing values in a dataframe."""
@staticmethod @staticmethod
def list_available(df: DataFrame, series: Series) -> list['MVStrategy']: def list_available(df: DataFrame, label: str, series: Series) -> list['MVStrategy']:
"""Get all the strategies that can be used.""" """Get all the strategies that can be used."""
choices = [DropStrategy(), ModeStrategy()] choices = [DropStrategy(), ModeStrategy()]
if is_numeric_dtype(series): if is_numeric_dtype(series):
choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy())) choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy()))
other_columns = df.select_dtypes(include="number").drop(label, axis=1).columns.to_list()
if len(other_columns):
choices.append(KNNStrategy(other_columns))
return choices return choices
@ -97,6 +101,43 @@ class LinearRegressionStrategy(MVStrategy):
return "Use linear regression" return "Use linear regression"
class KNNStrategy(MVStrategy):
    """Impute missing values of a column with a k-nearest-neighbors model.

    The model is trained on the rows where both the selected training
    features and the target column are present, and predicts the target for
    the rows where only the target is missing.
    """

    def __init__(self, training_features: list[str]):
        self.available_features = training_features
        self.training_features = training_features
        self.n_neighbors = 3

    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Fill the missing values of ``df[label]`` in place and return ``df``.

        Rows with a missing value in any training feature are neither used
        for training nor imputed. ``df`` is returned unchanged when there is
        nothing to train on or nothing to impute.
        """
        features = list(self.training_features)
        if not features:
            return df

        # Only rows whose training features are all present can be used.
        usable = df[features].notna().all(axis=1)
        train_mask = usable & df[label].notna()
        test_mask = usable & df[label].isna()
        if not train_mask.any() or not test_mask.any():
            return df

        # kNN cannot use more neighbors than there are training rows.
        n_neighbors = min(self.n_neighbors, int(train_mask.sum()))
        # NOTE(review): a classifier is used on a numeric target; scikit-learn
        # classifiers reject continuous targets, so a regressor is presumably
        # more appropriate here — confirm the intended behavior.
        model = KNeighborsClassifier(n_neighbors).fit(
            df.loc[train_mask, features], df.loc[train_mask, label]
        )
        # Assign through boolean masks so predictions land on the exact rows
        # they were computed for, whatever the index of the dataframe is.
        df.loc[test_mask, label] = model.predict(df.loc[test_mask, features])
        return df

    def count_max(self, df: DataFrame, label: str) -> int:
        """Return the number of rows usable for training (upper bound for k)."""
        usable_data = df.dropna(subset=self.training_features)
        return int(usable_data[label].count())

    def __str__(self) -> str:
        return "kNN"
class KeepStrategy(ScalingStrategy): class KeepStrategy(ScalingStrategy):
#@typing.override #@typing.override
def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:

@ -0,0 +1,86 @@
import streamlit as st
import matplotlib.pyplot as plt
from clusters import DBSCANCluster, KMeansCluster, CLUSTERING_STRATEGIES
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import numpy as np
st.header("Clustering")

if "data" in st.session_state:
    data = st.session_state.data

    # General settings: algorithm, input columns and target dimensionality.
    general_row = st.columns([1, 1, 1])
    clustering = general_row[0].selectbox("Clustering method", CLUSTERING_STRATEGIES)
    data_name = general_row[1].multiselect("Columns", data.select_dtypes(include="number").columns)
    n_components = general_row[2].number_input("Reduce dimensions to (PCA)", min_value=1, max_value=3, value=2)

    # Algorithm-specific parameters, written back onto the selected strategy.
    with st.form("cluster_form"):
        if isinstance(clustering, KMeansCluster):
            row1 = st.columns([1, 1, 1])
            clustering.n_clusters = row1[0].number_input("Number of clusters", min_value=1, max_value=data.shape[0], value=clustering.n_clusters)
            clustering.n_init = row1[1].number_input("n_init", min_value=1, value=clustering.n_init)
            clustering.max_iter = row1[2].number_input("max_iter", min_value=1, value=clustering.max_iter)
        elif isinstance(clustering, DBSCANCluster):
            row1 = st.columns([1, 1])
            clustering.eps = row1[0].slider("eps", min_value=0.0001, max_value=1.0, step=0.05, value=clustering.eps)
            clustering.min_samples = row1[1].number_input("min_samples", min_value=1, value=clustering.min_samples)
        st.form_submit_button("Launch")

    if len(data_name) > 0:
        x = data[data_name].to_numpy()
        # Never ask for more dimensions than there are selected columns.
        n_components = min(n_components, len(data_name))
        if len(data_name) > n_components:
            pca = PCA(n_components)
            x = pca.fit_transform(x)
            if n_components == 2:
                # Correlation circle: one arrow per original column.
                (fig, ax) = plt.subplots(figsize=(8, 8))
                for i in range(0, pca.components_.shape[1]):
                    ax.arrow(
                        0,
                        0,
                        pca.components_[0, i],
                        pca.components_[1, i],
                        head_width=0.1,
                        head_length=0.1
                    )
                    plt.text(
                        pca.components_[0, i] + 0.05,
                        pca.components_[1, i] + 0.05,
                        data_name[i]
                    )
                circle = plt.Circle((0, 0), radius=1, edgecolor='b', facecolor='None')
                ax.add_patch(circle)
                plt.axis("equal")
                ax.set_title("PCA result - Correlation circle")
                st.pyplot(fig)

        result = clustering.run(x)
        st.write("## Cluster stats")
        st.table(result.statistics)

        st.write("## Graphical representation")
        fig = plt.figure()
        if n_components == 1:
            # Color the points by cluster, like the 2-D and 3-D plots do.
            plt.scatter(x[:, 0], np.zeros(len(x)), c=result.labels, s=50, cmap="viridis")
        elif n_components == 2:
            ax = fig.add_subplot(projection='rectilinear')
            plt.scatter(x[:, 0], x[:, 1], c=result.labels, s=50, cmap="viridis")
            if result.centers is not None:
                plt.scatter(result.centers[:, 0], result.centers[:, 1], c="black", s=200, marker="X")
        else:
            ax = fig.add_subplot(projection='3d')
            ax.scatter(x[:, 0], x[:, 1],x[:, 2], c=result.labels, s=50, cmap="viridis")
            if result.centers is not None:
                ax.scatter(result.centers[:, 0], result.centers[:, 1], result.centers[:, 2], c="black", s=200, marker="X")
        st.pyplot(fig)

        # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
        # e.g. DBSCAN labelling every point as noise (-1) yields a single label.
        n_labels = len(np.unique(result.labels))
        if 1 < n_labels < len(x):
            st.write("Silhouette score:", silhouette_score(x, result.labels))
    else:
        st.error("Select at least one column")
else:
    st.error("file not loaded")

@ -1,35 +0,0 @@
import streamlit as st
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
st.header("Clustering: dbscan")

if "data" in st.session_state:
    data = st.session_state.data

    with st.form("my_form"):
        data_name = st.multiselect("Data Name", data.select_dtypes(include="number").columns, max_selections=3)
        # DBSCAN requires eps > 0, so the slider must not allow 0.0.
        eps = st.slider("eps", min_value=0.01, max_value=1.0, value=0.5, step=0.01)
        min_samples = st.number_input("min_samples", step=1, min_value=1, value=5)
        st.form_submit_button("launch")

    # Only 2-D and 3-D scatter plots are supported.
    if 2 <= len(data_name) <= 3:
        x = data[data_name].to_numpy()

        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        y_dbscan = dbscan.fit_predict(x)

        fig = plt.figure()
        if len(data_name) == 2:
            ax = fig.add_subplot(projection='rectilinear')
            plt.scatter(x[:, 0], x[:, 1], c=y_dbscan, s=50, cmap="viridis")
        else:
            ax = fig.add_subplot(projection='3d')
            ax.scatter(x[:, 0], x[:, 1],x[:, 2], c=y_dbscan, s=50, cmap="viridis")
        st.pyplot(fig)
else:
    st.error("file not loaded")

@ -1,44 +0,0 @@
import streamlit as st
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
st.header("Clustering: kmeans")

if "data" in st.session_state:
    data = st.session_state.data

    with st.form("my_form"):
        row1 = st.columns([1,1,1])
        n_clusters = row1[0].selectbox("Number of clusters", range(1,data.shape[0]))
        data_name = row1[1].multiselect("Data Name",data.select_dtypes(include="number").columns, max_selections=3)
        n_init = row1[2].number_input("n_init",step=1,min_value=1)

        row2 = st.columns([1,1])
        # Place max_iter in the second row; it previously reused row1[0] and
        # stacked under the cluster selector while row2 stayed empty.
        max_iter = row2[0].number_input("max_iter",step=1,min_value=1)

        st.form_submit_button("launch")

    # Only 2-D and 3-D scatter plots are supported.
    if 2 <= len(data_name) <= 3:
        x = data[data_name].to_numpy()

        kmeans = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, max_iter=max_iter, random_state=111)
        y_kmeans = kmeans.fit_predict(x)
        centers = kmeans.cluster_centers_

        fig = plt.figure()
        if len(data_name) == 2:
            ax = fig.add_subplot(projection='rectilinear')
            plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap="viridis")
            plt.scatter(centers[:, 0], centers[:, 1], c="black", s=200, marker="X")
        else:
            ax = fig.add_subplot(projection='3d')
            ax.scatter(x[:, 0], x[:, 1],x[:, 2], c=y_kmeans, s=50, cmap="viridis")
            ax.scatter(centers[:, 0], centers[:, 1],centers[:, 2], c="black", s=200, marker="X")
        st.pyplot(fig)
else:
    st.error("file not loaded")

@ -1,5 +1,5 @@
import streamlit as st import streamlit as st
from normstrategy import MVStrategy, ScalingStrategy from normstrategy import MVStrategy, ScalingStrategy, KNNStrategy
if "data" in st.session_state: if "data" in st.session_state:
data = st.session_state.original_data data = st.session_state.original_data
@ -8,13 +8,16 @@ if "data" in st.session_state:
for column, series in data.items(): for column, series in data.items():
col1, col2 = st.columns(2) col1, col2 = st.columns(2)
missing_count = series.isna().sum() missing_count = series.isna().sum()
choices = MVStrategy.list_available(data, series) choices = MVStrategy.list_available(data, column, series)
option = col1.selectbox( option = col1.selectbox(
f"Missing values of {column} ({missing_count})", f"Missing values of {column} ({missing_count})",
choices, choices,
index=1, index=1,
key=f"mv-{column}", key=f"mv-{column}",
) )
if isinstance(option, KNNStrategy):
option.training_features = st.multiselect("Training columns", option.training_features, default=option.available_features, key=f"cols-{column}")
option.n_neighbors = st.number_input("Number of neighbors", min_value=1, max_value=option.count_max(data, column), value=option.n_neighbors, key=f"neighbors-{column}")
# Always re-get the series to avoid reusing an invalidated series pointer # Always re-get the series to avoid reusing an invalidated series pointer
data = option.apply(data, column, data[column]) data = option.apply(data, column, data[column])

@ -1,9 +1,11 @@
import streamlit as st import streamlit as st
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
st.header("Prediction: Classification") st.header("Prediction: Classification")
@ -60,5 +62,18 @@ if "data" in st.session_state:
prediction = label_encoders[target_name].inverse_transform(prediction) prediction = label_encoders[target_name].inverse_transform(prediction)
st.write("Prediction:", prediction[0]) st.write("Prediction:", prediction[0])
if len(data_name) == 1:
fig = plt.figure()
y_pred = [model.predict(pd.DataFrame([pred_value[0]], columns=data_name)) for pred_value in X.values.tolist()]
cm = confusion_matrix(y, y_pred)
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel('Predicted')
plt.ylabel('True')
st.pyplot(fig)
else: else:
st.error("File not loaded") st.error("File not loaded")

@ -1,6 +1,8 @@
import streamlit as st import streamlit as st
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt
st.header("Prediction: Regression") st.header("Prediction: Regression")
@ -25,5 +27,37 @@ if "data" in st.session_state:
prediction = model.predict(pd.DataFrame([pred_values], columns=data_name)) prediction = model.predict(pd.DataFrame([pred_values], columns=data_name))
st.write("Prediction:", prediction[0]) st.write("Prediction:", prediction[0])
fig = plt.figure()
dataframe_sorted = pd.concat([X, y], axis=1).sort_values(by=data_name)
if len(data_name) == 1:
y_pred = [model.predict(pd.DataFrame([pred_value[0]], columns=data_name)) for pred_value in X.values.tolist()]
r2 = r2_score(y, y_pred)
st.write('R-squared score:', r2)
X = dataframe_sorted[data_name[0]]
y = dataframe_sorted[target_name]
prediction_array_y = [
model.predict(pd.DataFrame([[dataframe_sorted[data_name[0]].iloc[i]]], columns=data_name))[0]
for i in range(dataframe_sorted.shape[0])
]
plt.scatter(dataframe_sorted[data_name[0]], dataframe_sorted[target_name], color='b')
plt.plot(dataframe_sorted[data_name[0]], prediction_array_y, color='r')
elif len(data_name) == 2:
ax = fig.add_subplot(111, projection='3d')
prediction_array_y = [
model.predict(pd.DataFrame([[dataframe_sorted[data_name[0]].iloc[i], dataframe_sorted[data_name[1]].iloc[i]]], columns=data_name))[0]
for i in range(dataframe_sorted.shape[0])
]
ax.scatter(dataframe_sorted[data_name[0]], dataframe_sorted[data_name[1]], dataframe_sorted[target_name], color='b')
ax.plot(dataframe_sorted[data_name[0]], dataframe_sorted[data_name[1]], prediction_array_y, color='r')
st.pyplot(fig)
else: else:
st.error("File not loaded") st.error("File not loaded")

@ -0,0 +1,6 @@
matplotlib>=3.5.0
pandas>=1.5.0
seaborn>=0.12.0
scikit-learn>=0.23.0
streamlit>=1.35.0
ruff>=0.4.8
Loading…
Cancel
Save