Compare commits

..

38 Commits

Author SHA1 Message Date
gorky1234 f464f6166a corection bug figsize
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 3038bd9841 Merge pull request 'Use cluster strategies and support PCA' (#15) from clustering-strategy into main
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 7cb0d55969 Allow using PCA to reduce dataset dimensions
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 01ef19a2f8 Merge files using strategies
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 86bd285193 Merge pull request 'stat_prediction' (#14) from stat_prediction into main
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 9bc9e21e45 add r2 score
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER da1e97f07f add r2 score
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 27e69b2af8 add confusion_matrix
continuous-integration/drone/push Build is passing Details
10 months ago
bastien 4054395641 update
continuous-integration/drone/push Build is failing Details
10 months ago
bastien 01168f3588 add visu to prediction regression
continuous-integration/drone/push Build is failing Details
10 months ago
Bastien OLLIER 9da6e2d594 Add cluster stats (#13)
continuous-integration/drone/push Build is passing Details
10 months ago
Clément FRÉVILLE 4d82767c68 Add SkLearn to requirements.txt
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 9cb0d90eb1 Add CI/CD (#9)
continuous-integration/drone/push Build is passing Details
10 months ago
Bastien OLLIER 3eac3f6b8d Merge pull request 'Support multiple column delimiters' (#10) from csv-delimiters into main
10 months ago
Clément FRÉVILLE c87308cc21 Support multiple column delimiters
10 months ago
Clément FRÉVILLE d4aeb87f75 Limit the number of neighbors based on the dataframe
10 months ago
Hugo PRADIER 3c5f6849f8 Merge pull request 'Support kNN as an imputation method' (#8) from knn into main
10 months ago
Clément FRÉVILLE cd0c85ea44 Support kNN as an imputation method
10 months ago
Hugo PRADIER 96d390c749 Merge pull request 'Ajout de la prédiction avec deux algos (un de prédiction et un de classification)' (#7) from prediction into main
10 months ago
Hugo PRADIER 089cc66042 correctifs
10 months ago
Hugo PRADIER 2d1c867bed ajout prediction classification
10 months ago
Hugo PRADIER a914c3f8f9 prediction de regression terminee
10 months ago
Hugo PRADIER 70641ebca4 debut prediction
10 months ago
Bastien OLLIER e5f05a2c8a Mise à jour de 'frontend/pages/clustering_kmeans.py'
10 months ago
Bastien OLLIER 972fde561f Mise à jour de 'frontend/pages/clustering_dbscan.py'
10 months ago
Bastien OLLIER 694ecd0eef Merge pull request 'Visualize clusters in 3d' (#6) from cluster3d into main
10 months ago
Bastien OLLIER e255c67972 Merge pull request 'Implement base missing values strategies' (#3) from feature/missing-values into main
10 months ago
Bastien OLLIER e48c3bfa50 add 3d plot to bdscan
10 months ago
Bastien OLLIER 52cb140746 add 3d to kmeans
10 months ago
Bastien OLLIER c1f5e55a0b Merge pull request 'clustering' (#5) from clustering into main
10 months ago
Bastien OLLIER 34f70b4d79 delete np
10 months ago
Bastien OLLIER 64cf65a417 max nb cluster to nb line
10 months ago
Bastien OLLIER d4e33e7367 dbscan
10 months ago
Bastien OLLIER 72dcc8ff1c add dbscan
10 months ago
Bastien OLLIER 9fc6d7d2d1 add dbscan
10 months ago
Bastien OLLIER 197939555c debut dbscan
10 months ago
Bastien OLLIER 5bf5f507a5 end clustering
11 months ago
Bastien OLLIER 4ae8512dcb add form
11 months ago

@ -0,0 +1,44 @@
# Drone CI pipeline run on the Docker runner.
# NOTE(review): the nesting of the keys below is not visible in this flattened
# view; the comments describe each key as written - confirm the structure
# against the real file.
kind: pipeline
name: default
type: docker
# Trigger section: the only event listed is `push`.
trigger:
event:
- push
steps:
# Step "lint": installs requirements.txt in a python:3.12 image, then runs ruff.
- name: lint
image: python:3.12
commands:
- pip install --root-user-action=ignore -r requirements.txt
- ruff check .
# Step "docker-image": uses the plugins/docker image with the Dockerfile and the
# registry/repo given in its settings.
- name: docker-image
image: plugins/docker
settings:
dockerfile: Dockerfile
registry: hub.codefirst.iut.uca.fr
repo: hub.codefirst.iut.uca.fr/bastien.ollier/miner
# Registry credentials are read from the REGISTRY_USER / REGISTRY_PASSWORD secrets.
username:
from_secret: REGISTRY_USER
password:
from_secret: REGISTRY_PASSWORD
# The previously pushed `latest` image is listed as a cache source.
cache_from:
- hub.codefirst.iut.uca.fr/bastien.ollier/miner:latest
depends_on: [ lint ]
# Step "deploy-miner": runs the dockerproxy client image with command `create`
# for container `miner`, using the `latest` image, once "docker-image" is done.
- name: deploy-miner
image: hub.codefirst.iut.uca.fr/clement.freville2/codefirst-dockerproxy-clientdrone:latest
settings:
image: hub.codefirst.iut.uca.fr/bastien.ollier/miner:latest
container: miner
command: create
overwrite: true
admins: bastienollier,clementfreville2,hugopradier2
environment:
DRONE_REPO_OWNER: bastien.ollier
depends_on: [ docker-image ]
# Branch filter listing `main` and `ci/*`.
# NOTE(review): presumably attached to the deploy-miner step only - confirm.
when:
branch:
- main
- ci/*

1
.gitignore vendored

@ -1 +1,2 @@
__pycache__
.venv

@ -0,0 +1,9 @@
# Image serving the Streamlit front end.
FROM python:3.12-slim
WORKDIR /app
# Copy only the dependency manifest first: the (slow) install layer below is
# then reused from the build cache until requirements.txt itself changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt
# Application sources change often, so they are copied after the install layer.
COPY . .
EXPOSE 80
# Start Streamlit on port 80, listening on all interfaces, under the base URL
# path given by --server.baseUrlPath.
ENTRYPOINT ["streamlit", "run", "frontend/exploration.py", "--server.port=80", "--server.address=0.0.0.0", "--server.baseUrlPath=/containers/bastienollier-miner"]

@ -0,0 +1,83 @@
from sklearn.cluster import DBSCAN, KMeans
import numpy as np
from dataclasses import dataclass
from abc import ABC, abstractmethod
from typing import Any, Optional
@dataclass
class ClusterResult:
    """Outcome of one clustering run."""

    # Cluster label assigned to each input sample.
    labels: np.ndarray
    # Cluster centers, or None when the algorithm does not produce any.
    centers: Optional[np.ndarray]
    # One dictionary of statistics per cluster.
    statistics: list[dict[str, Any]]
class Cluster(ABC):
    """Common interface of the clustering strategies."""

    @abstractmethod
    def run(self, data: np.ndarray) -> "ClusterResult":
        """Cluster ``data`` and return the labels, centers and statistics.

        Args:
            data: Samples to cluster, passed as a NumPy array.

        Returns:
            A ``ClusterResult`` describing the clustering.
        """
class DBSCANCluster(Cluster):
    """Clustering strategy backed by scikit-learn's ``DBSCAN``."""

    def __init__(self, eps: float = 0.5, min_samples: int = 5):
        """Store the ``eps`` and ``min_samples`` values forwarded to ``DBSCAN``."""
        self.eps = eps
        self.min_samples = min_samples

    # @typing.override
    def run(self, data: np.ndarray) -> ClusterResult:
        """Fit DBSCAN on ``data``; the result carries no centers."""
        dbscan = DBSCAN(eps=self.eps, min_samples=self.min_samples)
        labels = dbscan.fit_predict(data)
        return ClusterResult(labels, None, self.get_statistics(data, labels))

    def get_statistics(self, data: np.ndarray, labels: np.ndarray) -> list[dict[str, Any]]:
        """Return the size and density of every cluster, skipping label -1.

        The density is the number of points divided by the volume of the
        axis-aligned bounding box of the cluster.

        Args:
            data: Samples that were clustered.
            labels: Label of each sample, aligned with ``data``.

        Returns:
            One dictionary per cluster with the keys ``cluster``,
            ``num_points`` and ``density``.
        """
        stats = []
        for label in np.unique(labels):
            # scikit-learn's DBSCAN marks noisy samples with the label -1.
            if label == -1:
                continue
            cluster_points = data[labels == label]
            num_points = len(cluster_points)
            volume = (np.max(cluster_points, axis=0) - np.min(cluster_points, axis=0)).prod()
            # A cluster that is flat along any axis (e.g. a single point) has
            # a zero volume: report an infinite density explicitly instead of
            # triggering a NumPy divide-by-zero warning.
            density = float("inf") if volume == 0 else num_points / volume
            stats.append({
                "cluster": label,
                "num_points": num_points,
                "density": density,
            })
        return stats

    def __str__(self) -> str:
        return "DBScan"
class KMeansCluster(Cluster):
    """Clustering strategy backed by scikit-learn's ``KMeans``."""

    def __init__(self, n_clusters: int = 8, n_init: int = 1, max_iter: int = 300):
        """Store the parameters forwarded to ``KMeans``."""
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter

    # @typing.override
    def run(self, data: np.ndarray) -> ClusterResult:
        """Fit KMeans on ``data`` and return labels, centers and statistics.

        The estimator uses ``init="random"`` with a fixed ``random_state``.
        """
        kmeans = KMeans(
            n_clusters=self.n_clusters,
            init="random",
            n_init=self.n_init,
            max_iter=self.max_iter,
            random_state=111,
        )
        labels = kmeans.fit_predict(data)
        centers = kmeans.cluster_centers_
        return ClusterResult(labels, centers, self.get_statistics(data, labels, centers))

    def get_statistics(self, data: np.ndarray, labels: np.ndarray, centers: np.ndarray) -> list[dict[str, Any]]:
        """Return the size and center of every cluster present in ``labels``.

        Args:
            data: Samples that were clustered (kept for interface
                compatibility; the counts are derived from ``labels``).
            labels: Label of each sample.
            centers: Cluster centers, indexed by label.

        Returns:
            One dictionary per cluster with the keys ``cluster``,
            ``num_points`` and ``center``.
        """
        # Count the members of each cluster in a single pass instead of
        # slicing the data once per label.
        unique_labels, counts = np.unique(labels, return_counts=True)
        return [
            {
                "cluster": label,
                "num_points": int(count),
                "center": centers[label],
            }
            for label, count in zip(unique_labels, counts)
        ]

    def __str__(self) -> str:
        return "KMeans"
# Clustering strategies offered to the UI, each built with its default parameters.
# NOTE(review): these are module-level instances; code that assigns to their
# attributes (eps, n_clusters, ...) mutates these shared objects - confirm this
# is intended.
CLUSTERING_STRATEGIES = [DBSCANCluster(), KMeansCluster()]

@ -1,5 +1,6 @@
import pandas as pd
import streamlit as st
import codecs
st.set_page_config(
page_title="Project Miner",
@ -9,10 +10,13 @@ st.set_page_config(
st.title("Home")

### Exploration
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv", "tsv"])
# The select box shows the escaped form "\t"; decode it to the real delimiter
# character before handing it to pandas.
separator = st.selectbox("Separator", [",", ";", "\\t"])
separator = codecs.getdecoder("unicode_escape")(separator)[0]
has_header = st.checkbox("Has header", value=True)

if uploaded_file is not None:
    # header=None tells pandas that the file has no header row. (header=1
    # would silently use the second row as column names and drop the first.)
    loaded = pd.read_csv(uploaded_file, sep=separator, header=0 if has_header else None)
    if not has_header:
        # Without a header pandas numbers the columns 0..n-1; use string names
        # so that column 0 is not a falsy value in later `if column:` checks.
        loaded.columns = loaded.columns.astype(str)
    st.session_state.data = loaded
    st.session_state.original_data = st.session_state.data
    st.success("File loaded successfully!")

@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from pandas import DataFrame, Series
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import KNeighborsClassifier
from typing import Any, Union
class DataFrameFunction(ABC):
@ -18,11 +19,14 @@ class MVStrategy(DataFrameFunction):
"""A way to handle missing values in a dataframe."""
@staticmethod
def list_available(df: DataFrame, label: str, series: Series) -> list['MVStrategy']:
    """Get all the strategies that can be used.

    Args:
        df: Dataframe that contains the column.
        label: Name of the column whose missing values must be handled.
        series: Content of that column.

    Returns:
        Drop and mode strategies for every column; for numeric columns also
        mean, median and linear regression, plus kNN when at least one other
        numeric column exists.
    """
    choices = [DropStrategy(), ModeStrategy()]
    # Everything below only applies to numeric columns. In particular the
    # drop(label) call would raise a KeyError for a non-numeric label, which
    # is absent from the numeric selection.
    if not is_numeric_dtype(series):
        return choices
    choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy()))
    other_columns = df.select_dtypes(include="number").drop(label, axis=1).columns.to_list()
    if other_columns:
        choices.append(KNNStrategy(other_columns))
    return choices
@ -97,6 +101,43 @@ class LinearRegressionStrategy(MVStrategy):
return "Use linear regression"
class KNNStrategy(MVStrategy):
    """Impute the missing values of a column with a k-nearest-neighbors model."""

    def __init__(self, training_features: list[str]):
        """Remember the candidate feature columns; all are selected by default."""
        self.available_features = training_features
        self.training_features = training_features
        self.n_neighbors = 3

    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Fill the missing values of ``label`` from the nearest neighbors.

        Only rows where every training feature is present take part: those
        with a known ``label`` train the model, those with a missing ``label``
        are predicted. Rows lacking a training feature are left untouched.

        Args:
            df: Dataframe to impute; it is not modified.
            label: Name of the column to impute.
            series: Content of the column (unused, kept for the interface).

        Returns:
            ``df`` itself when nothing can be imputed, otherwise a copy with
            the predicted values filled in.
        """
        if not self.training_features:
            return df
        # Remove any row where a training column has a missing value
        usable_data = df.dropna(subset=self.training_features)
        train_data = usable_data.dropna(subset=[label])
        test_data = usable_data[usable_data[label].isna()]
        if train_data.empty or test_data.empty:
            return df

        # Train on the selected features only; the remaining columns may be
        # non-numeric or contain missing values.
        x_train = train_data[self.training_features]
        y_train = train_data[label]
        # The model cannot use more neighbors than there are training rows.
        n_neighbors = min(self.n_neighbors, len(train_data))
        # NOTE(review): a classifier treats the target values as discrete
        # classes; for continuous targets a regressor may be more appropriate.
        model = KNeighborsClassifier(n_neighbors).fit(x_train, y_train)
        predicted = model.predict(test_data[self.training_features])

        # Write the predictions at the index of the rows they were computed
        # for, on a copy so the caller's dataframe is left intact.
        result = df.copy()
        result.loc[test_data.index, label] = predicted
        return result

    def count_max(self, df: DataFrame, label: str) -> int:
        """Return how many rows have both ``label`` and all training features."""
        usable_data = df.dropna(subset=self.training_features)
        return int(usable_data[label].count())

    def __str__(self) -> str:
        return "kNN"
class KeepStrategy(ScalingStrategy):
#@typing.override
def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:

@ -0,0 +1,86 @@
import streamlit as st
import matplotlib.pyplot as plt
from clusters import DBSCANCluster, KMeansCluster, CLUSTERING_STRATEGIES
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import numpy as np
st.header("Clustering")

if "data" in st.session_state:
    data = st.session_state.data

    # General settings: algorithm, numeric columns to use, target dimension.
    general_row = st.columns([1, 1, 1])
    clustering = general_row[0].selectbox("Clustering method", CLUSTERING_STRATEGIES)
    data_name = general_row[1].multiselect("Columns", data.select_dtypes(include="number").columns)
    n_components = general_row[2].number_input("Reduce dimensions to (PCA)", min_value=1, max_value=3, value=2)

    # Algorithm-specific parameters, written back onto the selected strategy.
    with st.form("cluster_form"):
        if isinstance(clustering, KMeansCluster):
            row1 = st.columns([1, 1, 1])
            clustering.n_clusters = row1[0].number_input("Number of clusters", min_value=1, max_value=data.shape[0], value=clustering.n_clusters)
            clustering.n_init = row1[1].number_input("n_init", min_value=1, value=clustering.n_init)
            clustering.max_iter = row1[2].number_input("max_iter", min_value=1, value=clustering.max_iter)
        elif isinstance(clustering, DBSCANCluster):
            row1 = st.columns([1, 1])
            clustering.eps = row1[0].slider("eps", min_value=0.0001, max_value=1.0, step=0.05, value=clustering.eps)
            clustering.min_samples = row1[1].number_input("min_samples", min_value=1, value=clustering.min_samples)
        st.form_submit_button("Launch")

    if len(data_name) > 0:
        x = data[data_name].to_numpy()
        # Never ask for more dimensions than there are selected columns.
        n_components = min(n_components, len(data_name))
        if len(data_name) > n_components:
            pca = PCA(n_components)
            x = pca.fit_transform(x)
            if n_components == 2:
                # Correlation circle: one arrow per original column.
                (fig, ax) = plt.subplots(figsize=(8, 8))
                for i in range(pca.components_.shape[1]):
                    ax.arrow(
                        0,
                        0,
                        pca.components_[0, i],
                        pca.components_[1, i],
                        head_width=0.1,
                        head_length=0.1
                    )
                    plt.text(
                        pca.components_[0, i] + 0.05,
                        pca.components_[1, i] + 0.05,
                        data_name[i]
                    )
                circle = plt.Circle((0, 0), radius=1, edgecolor='b', facecolor='None')
                ax.add_patch(circle)
                plt.axis("equal")
                ax.set_title("PCA result - Correlation circle")
                st.pyplot(fig)

        result = clustering.run(x)
        st.write("## Cluster stats")
        st.table(result.statistics)

        st.write("## Graphical representation")
        fig = plt.figure()
        if n_components == 1:
            plt.scatter(x, np.zeros_like(x))
        elif n_components == 2:
            ax = fig.add_subplot(projection='rectilinear')
            plt.scatter(x[:, 0], x[:, 1], c=result.labels, s=50, cmap="viridis")
            if result.centers is not None:
                plt.scatter(result.centers[:, 0], result.centers[:, 1], c="black", s=200, marker="X")
        else:
            ax = fig.add_subplot(projection='3d')
            ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=result.labels, s=50, cmap="viridis")
            if result.centers is not None:
                ax.scatter(result.centers[:, 0], result.centers[:, 1], result.centers[:, 2], c="black", s=200, marker="X")
        st.pyplot(fig)

        # silhouette_score needs between 2 and n_samples - 1 distinct labels
        # and raises a ValueError otherwise (e.g. a single cluster, or DBSCAN
        # labelling every sample as noise).
        n_labels = len(np.unique(result.labels))
        if 2 <= n_labels <= len(x) - 1:
            st.write("Silhouette score:", silhouette_score(x, result.labels))
    else:
        st.error("Select at least one column")
else:
    st.error("file not loaded")

@ -1,5 +1,5 @@
import streamlit as st
from normstrategy import MVStrategy, ScalingStrategy
from normstrategy import MVStrategy, ScalingStrategy, KNNStrategy
if "data" in st.session_state:
data = st.session_state.original_data
@ -8,13 +8,16 @@ if "data" in st.session_state:
for column, series in data.items():
col1, col2 = st.columns(2)
missing_count = series.isna().sum()
choices = MVStrategy.list_available(data, series)
choices = MVStrategy.list_available(data, column, series)
option = col1.selectbox(
f"Missing values of {column} ({missing_count})",
choices,
index=1,
key=f"mv-{column}",
)
if isinstance(option, KNNStrategy):
option.training_features = st.multiselect("Training columns", option.training_features, default=option.available_features, key=f"cols-{column}")
option.n_neighbors = st.number_input("Number of neighbors", min_value=1, max_value=option.count_max(data, column), value=option.n_neighbors, key=f"neighbors-{column}")
# Always re-get the series to avoid reusing an invalidated series pointer
data = option.apply(data, column, data[column])

@ -0,0 +1,79 @@
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
st.header("Prediction: Classification")

if "data" in st.session_state:
    data = st.session_state.data

    with st.form("classification_form"):
        st.subheader("Classification Parameters")
        data_name = st.multiselect("Features", data.columns)
        target_name = st.selectbox("Target", data.columns)
        test_size = st.slider("Test Size", min_value=0.1, max_value=0.5, value=0.2, step=0.1)
        st.form_submit_button('Train and Predict')

    # `is not None` rather than truthiness: a column name may be falsy (0, "").
    if data_name and target_name is not None:
        # Copy the features: the label encoding below writes into X and must
        # not touch (or warn about) the dataframe kept in the session state.
        X = data[data_name].copy()
        y = data[target_name]

        # Encode every text column as integers, keeping the encoders to
        # translate user input and predictions later on.
        label_encoders = {}
        for column in X.select_dtypes(include=['object']).columns:
            le = LabelEncoder()
            X[column] = le.fit_transform(X[column])
            label_encoders[column] = le

        if y.dtype == 'object':
            le = LabelEncoder()
            y = le.fit_transform(y)
            label_encoders[target_name] = le

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        model = LogisticRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        st.subheader("Model Accuracy")
        st.write(f"Accuracy on test data: {accuracy:.2f}")

        st.subheader("Enter values for prediction")
        pred_values = []
        for feature in data_name:
            if feature in label_encoders:
                values = list(label_encoders[feature].classes_)
                value = st.selectbox(f"Value for {feature}", values)
                value_encoded = label_encoders[feature].transform([value])[0]
                pred_values.append(value_encoded)
            else:
                value = st.number_input(f"Value for {feature}", value=0.0)
                pred_values.append(value)

        prediction = model.predict(pd.DataFrame([pred_values], columns=data_name))
        if target_name in label_encoders:
            prediction = label_encoders[target_name].inverse_transform(prediction)
        st.write("Prediction:", prediction[0])

        if len(data_name) == 1:
            # Confusion matrix over the whole dataset, predicted in one call
            # instead of one model call per row.
            fig = plt.figure()
            y_pred = model.predict(X)
            cm = confusion_matrix(y, y_pred)
            sns.heatmap(cm, annot=True, fmt="d")
            plt.xlabel('Predicted')
            plt.ylabel('True')
            st.pyplot(fig)
else:
    st.error("File not loaded")

@ -0,0 +1,63 @@
import streamlit as st
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pandas as pd
import matplotlib.pyplot as plt
st.header("Prediction: Regression")

if "data" in st.session_state:
    data = st.session_state.data

    with st.form("regression_form"):
        st.subheader("Linear Regression Parameters")
        data_name = st.multiselect("Features", data.select_dtypes(include="number").columns)
        target_name = st.selectbox("Target", data.select_dtypes(include="number").columns)
        st.form_submit_button('Train and Predict')

    # `is not None` rather than truthiness: a column name may be falsy (0, "").
    if data_name and target_name is not None:
        X = data[data_name]
        y = data[target_name]

        model = LinearRegression()
        model.fit(X, y)

        st.subheader("Enter values for prediction")
        pred_values = [st.number_input(f"Value for {feature}", value=0.0) for feature in data_name]
        prediction = model.predict(pd.DataFrame([pred_values], columns=data_name))
        st.write("Prediction:", prediction[0])

        # A plot is only drawn for one or two features; with more there is
        # nothing to show, so no (blank) figure is rendered.
        if len(data_name) in (1, 2):
            fig = plt.figure()
            # Sort the rows by the features so the fitted curve is drawn in
            # order. Sorting `data` directly avoids duplicated columns when
            # the target is also selected as a feature.
            dataframe_sorted = data.sort_values(by=data_name)
            # Predict every row in a single call rather than once per row.
            prediction_array_y = model.predict(dataframe_sorted[data_name])

            if len(data_name) == 1:
                r2 = r2_score(y, model.predict(X))
                st.write('R-squared score:', r2)
                plt.scatter(dataframe_sorted[data_name[0]], dataframe_sorted[target_name], color='b')
                plt.plot(dataframe_sorted[data_name[0]], prediction_array_y, color='r')
            else:
                ax = fig.add_subplot(111, projection='3d')
                ax.scatter(dataframe_sorted[data_name[0]], dataframe_sorted[data_name[1]], dataframe_sorted[target_name], color='b')
                ax.plot(dataframe_sorted[data_name[0]], dataframe_sorted[data_name[1]], prediction_array_y, color='r')

            st.pyplot(fig)
else:
    st.error("File not loaded")

@ -0,0 +1,6 @@
matplotlib>=3.5.0
pandas>=1.5.0
seaborn>=0.12.0
scikit-learn>=0.23.0
streamlit>=1.35.0
ruff>=0.4.8
Loading…
Cancel
Save