SCT_ML_2.py
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Step 1: Load the Data
data = pd.read_csv('Mall_Customers.csv')
# Inspect the first few rows
print(data.head())
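# Optional sanity check before clustering (a minimal sketch, assuming the
# standard Kaggle Mall_Customers.csv layout): confirm column dtypes and
# that there are no missing values to drop or impute.
data.info()
print(data.isnull().sum())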
# Step 2: Data Preprocessing
# Select relevant features
features = data[['Annual Income (k$)', 'Spending Score (1-100)']]
# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
# Step 3: Determine the Optimal Number of Clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(features_scaled)
    wcss.append(kmeans.inertia_)
# Plot the elbow graph
plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid()
plt.show()
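# Optionally, corroborate the elbow choice with silhouette scores (a small
# sketch reusing the same scaled features); higher values indicate
# better-separated clusters, and silhouette needs at least 2 clusters.
from sklearn.metrics import silhouette_score
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(features_scaled)
    print(f'k={k}: silhouette = {silhouette_score(features_scaled, labels):.3f}')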
# Step 4: Fit K-means Clustering
optimal_k = 5 # Choose the optimal number based on the elbow method
kmeans = KMeans(n_clusters=optimal_k, random_state=0)
clusters = kmeans.fit_predict(features_scaled)
# Add cluster labels to the original dataset
data['Cluster'] = clusters
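# Quick look at cluster sizes: a very small or very large cluster can signal
# a poor choice of k.
print(data['Cluster'].value_counts().sort_index())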
# Step 5: Analyze the Clusters
# numeric_only=True skips non-numeric columns (e.g. Gender), which would
# otherwise raise an error in recent pandas versions
cluster_summary = data.groupby('Cluster').mean(numeric_only=True)
print(cluster_summary)
# Optionally, visualize the clusters
plt.figure(figsize=(10, 6))
plt.scatter(features_scaled[:, 0], features_scaled[:, 1], c=clusters, cmap='viridis', alpha=0.6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red', marker='X', label='Centroids')
plt.title('Customer Clusters')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.legend()
plt.grid()
plt.show()
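# Optionally, map the centroids back to the original units (income in k$ and
# spending score on the 1-100 scale) for easier interpretation; this sketch
# simply inverts the StandardScaler fitted above.
centroids_original = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=['Annual Income (k$)', 'Spending Score (1-100)']
)
print(centroids_original)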