sweet_spot_finder.py
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

# Function to load data and prepare vectors for clustering
def load_and_prepare_data(file_path):
    # Load data from a CSV file
    df = pd.read_csv(file_path)
    # Preprocess: concatenate the vector parts into a single numpy array per row.
    # (In this use case each embedding is split across 6 columns to fit Excel's
    # cell-size limits; adjust the number of parts to match your data.)
    df['vector_array'] = df.apply(
        lambda row: np.array([float(item)
                              for part in range(1, 7)  # assuming 6 vector parts
                              for item in str(row[str(part)]).split(',')
                              if item.strip() != '']),
        axis=1)
    # Initialize all rows with a default cluster ID of -1 (indicating no cluster)
    df['cluster'] = -1
    # Keep only rows whose vector length matches the embedding model's output dimension
    vector_dimension = 3072  # e.g. OpenAI text-embedding-3-large embeddings
    valid_vectors_df = df[df['vector_array'].apply(len) == vector_dimension].copy()
    return df, valid_vectors_df
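
# Illustrative only: load_and_prepare_data assumes vector columns named '1'
# through '6', each holding a comma-separated slice of one embedding. A
# hypothetical input CSV might look like:
#
#   text,1,2,3,4,5,6
#   "some document","0.0123,-0.0456,...","0.0789,...","...","...","...","..."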

# Function to perform DBSCAN clustering on preprocessed data
def perform_clustering_dbscan(valid_vectors_df, similarity_threshold, min_samples):
    # Cosine distance = 1 - cosine similarity, so the similarity threshold
    # maps directly onto DBSCAN's eps (epsilon) parameter
    eps_value = 1 - similarity_threshold
    # Initialize DBSCAN with the specified parameters
    db = DBSCAN(eps=eps_value, min_samples=min_samples, metric='cosine')
    # Fit the DBSCAN model and assign cluster labels
    valid_vectors_df['cluster'] = db.fit_predict(np.stack(valid_vectors_df['vector_array'].values))
    # Count the clusters, excluding the noise label (-1); note that `in` on a
    # pandas Series checks the index, so test membership against .values
    labels = valid_vectors_df['cluster']
    n_clusters = len(set(labels)) - (1 if -1 in labels.values else 0)
    # Output the results of clustering
    print(f"Similarity Threshold: {similarity_threshold:.3f}, Min Samples: {min_samples} "
          f"-> Number of clusters: {n_clusters}")
    return valid_vectors_df
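
# Minimal sketch (synthetic 2-D data, not from the real pipeline) of the
# threshold-to-eps mapping: the two nearly parallel vectors land in one
# cluster, the orthogonal one is labelled noise (-1). Uncomment to try:
#
# demo = pd.DataFrame({'vector_array': [np.array([1.0, 0.0]),
#                                       np.array([0.999, 0.04]),
#                                       np.array([0.0, 1.0])]})
# perform_clustering_dbscan(demo, similarity_threshold=0.95, min_samples=2)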

# Main execution: adjust parameters and file path as necessary
if __name__ == "__main__":
    input_csv_path = 'path/to/your/input.csv'  # Replace with your actual file path
    min_samples = 2  # Minimum neighborhood size for a point to count as a core point
    # Load and prepare the data once; only the clustering parameters vary per run
    df, valid_vectors_df = load_and_prepare_data(input_csv_path)
    # Sweep a range of similarity thresholds to find the optimal clustering configuration
    for threshold in np.arange(0.995, 0.800, -0.005):  # Adjust the range as needed
        perform_clustering_dbscan(valid_vectors_df, threshold, min_samples)
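
# Each sweep iteration prints one line of the form
#   Similarity Threshold: 0.990, Min Samples: 2 -> Number of clusters: N
# One reasonable reading: the "sweet spot" is the threshold region where the
# cluster count stops fluctuating; rerun at that threshold and keep the
# returned valid_vectors_df, whose 'cluster' column holds the labels.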