Enhance code review for ML/DL/AI project #6

Open · wants to merge 1 commit into main
95 changes: 95 additions & 0 deletions info.md
@@ -0,0 +1,95 @@
tree -I "__pycache__"
.
├── README.md
├── __init__.py
├── config
│ ├── config.yaml
│ └── visualization_config.yaml
├── data
│ ├── input
│ ├── output
│ └── processed
├── logs
│ └── pipeline.log
├── outputs
│ ├── checkpoints
│ └── figures
├── plan.md
├── requirements.txt
├── run.py
├── scripts
│ └── validate_dependencies.py
├── setup.py
├── src
│ ├── __init__.py
│ ├── cli.py
│ ├── cluster_manager.py
│ ├── clustering
│ │ ├── __init__.py
│ │ ├── attention_clustering.py
│ │ ├── cluster_explainer.py
│ │ ├── cluster_manager.py
│ │ ├── clustering_utils.py
│ │ ├── dynamic_cluster_manager.py
│ │ ├── dynamic_clusterer.py
│ │ ├── graph_clusterer.py
│ │ ├── hybrid_cluster_manager.py
│ │ └── streaming_manager.py
│ ├── dashboard
│ │ └── app.py
│ ├── data_exploration.py
│ ├── data_loader
│ │ └── flexible_loader.py
│ ├── data_loader.py
│ ├── data_preparation.py
│ ├── data_validator.py
│ ├── embedding_generator.py
│ ├── evaluation
│ │ ├── __init__.py
│ │ ├── cluster_evaluator.py
│ │ ├── eval_pipeline.py
│ │ ├── metrics.py
│ │ └── pipeline_evaluator.py
│ ├── main.py
│ ├── main_with_training.py
│ ├── personalization
│ │ └── user_config.py
│ ├── preprocessing
│ │ └── domain_agnostic_preprocessor.py
│ ├── preprocessor.py
│ ├── summarization
│ │ ├── __init__.py
│ │ ├── adaptive_style.py
│ │ ├── adaptive_summarizer.py
│ │ ├── enhanced_summarizer.py
│ │ ├── hybrid_summarizer.py
│ │ ├── model_trainer.py
│ │ └── summarizer.py
│ ├── synsearch.code-workspace
│ ├── utils
│ │ ├── __init__.py
│ │ ├── checkpoint_manager.py
│ │ ├── cluster_selector.py
│ │ ├── logging_config.py
│ │ ├── logging_utils.py
│ │ ├── metrics_calculator.py
│ │ ├── metrics_utils.py
│ │ ├── model_utils.py
│ │ └── style_selector.py
│ └── visualization
│ ├── __init__.py
│ ├── cluster_visualizer.py
│ └── embedding_visualizer.py
└── tests
├── __init__.py
├── test_data_loader.py
├── test_data_pipeline.py
├── test_data_validator.py
├── test_embedding_generator.py
├── test_embedding_visualizer.py
├── test_enhanced_pipeline.py
├── test_evaluation_metrics.py
├── test_integration.py
└── test_summarizer.py

21 directories, 70 files
44 changes: 43 additions & 1 deletion src/clustering/attention_clustering.py
@@ -2,6 +2,7 @@
import torch.nn as nn
from typing import List, Dict, Optional
import numpy as np
from torch.utils.data import DataLoader, Dataset

class AttentionRefiner(nn.Module):
"""Refines embeddings using self-attention before clustering."""
@@ -29,9 +30,50 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor:

        return attn_output.squeeze(0)

class EmbeddingDataset(Dataset):
    """Custom Dataset for embeddings."""

    def __init__(self, embeddings: np.ndarray):
        self.embeddings = embeddings

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx]
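Review note: `DataLoader`'s default collate converts the numpy rows returned by `__getitem__` into one stacked `torch.Tensor` per batch, which is what `refine_embeddings` below relies on. A quick standalone check of that behavior (illustrative shapes only, using the `EmbeddingDataset` class from this diff):

import numpy as np
from torch.utils.data import DataLoader

ds = EmbeddingDataset(np.zeros((5, 4), dtype=np.float32))
batch = next(iter(DataLoader(ds, batch_size=2)))
print(type(batch), batch.shape)  # <class 'torch.Tensor'> torch.Size([2, 4])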

class HybridClusteringModule:
    """Combines attention-refined embeddings with dynamic clustering."""

    def __init__(self, embedding_dim: int, device: Optional[str] = None):
        """
        Initialize the HybridClusteringModule with embedding dimension and device.

        Args:
            embedding_dim (int): Dimension of the embeddings.
            device (Optional[str], optional): Device to use for computation. Defaults to None.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.attention_refiner = AttentionRefiner(embedding_dim).to(self.device)

    def refine_embeddings(self, embeddings: np.ndarray, batch_size: int = 32) -> np.ndarray:
        """
        Refine embeddings using self-attention in batches.

        Args:
            embeddings (np.ndarray): Array of embeddings.
            batch_size (int, optional): Batch size for processing. Defaults to 32.

        Returns:
            np.ndarray: Refined embeddings.
        """
        dataset = EmbeddingDataset(embeddings)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        refined_embeddings = []
        with torch.no_grad():  # inference only; avoids building an autograd graph per batch
            for batch in dataloader:
                batch = batch.to(self.device)
                refined_batch = self.attention_refiner(batch)
                refined_embeddings.append(refined_batch.cpu().numpy())

        return np.concatenate(refined_embeddings, axis=0)
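For reviewers, a minimal usage sketch of the new module (not part of this diff; the 384-dim shape and random data are illustrative assumptions):

import numpy as np

# float32 matches the default parameter dtype of the nn.Module weights.
embeddings = np.random.rand(100, 384).astype(np.float32)
module = HybridClusteringModule(embedding_dim=384)
refined = module.refine_embeddings(embeddings, batch_size=32)
print(refined.shape)  # (100, 384), assuming AttentionRefiner preserves the embedding dim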
110 changes: 90 additions & 20 deletions src/clustering/cluster_explainer.py
@@ -4,11 +4,18 @@
import spacy
from collections import Counter
import logging
from multiprocessing import Pool, cpu_count

class ClusterExplainer:
    """Explains cluster characteristics and key features."""

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the ClusterExplainer with configuration settings.

        Args:
            config (Dict[str, Any]): Configuration dictionary.
        """
        self.config = config
        self.logger = logging.getLogger(__name__)
        self.nlp = spacy.load('en_core_web_sm')
@@ -22,7 +29,16 @@ def explain_clusters(
        texts: List[str],
        labels: np.ndarray
    ) -> Dict[str, Dict[str, Any]]:
        """
        Generate explanations for each cluster.

        Args:
            texts (List[str]): List of texts.
            labels (np.ndarray): Array of cluster labels.

        Returns:
            Dict[str, Dict[str, Any]]: Explanations for each cluster.
        """
        try:
            explanations = {}
            unique_labels = np.unique(labels)
@@ -31,36 +47,74 @@
            tfidf_matrix = self.vectorizer.fit_transform(texts)
            feature_names = self.vectorizer.get_feature_names_out()

            # Parallelize per-cluster work; the noise cluster (label == -1) is skipped.
            with Pool(processes=cpu_count()) as pool:
                results = pool.starmap(
                    self._process_cluster,
                    [(label, texts, labels, tfidf_matrix, feature_names)
                     for label in unique_labels if label != -1]
                )

            for label, explanation in results:
                explanations[str(label)] = explanation

            return explanations

        except Exception as e:
            self.logger.error(f"Error generating explanations: {e}")
            raise

    def _process_cluster(
        self,
        label: int,
        texts: List[str],
        labels: np.ndarray,
        tfidf_matrix: np.ndarray,
        feature_names: np.ndarray
    ) -> Tuple[int, Dict[str, Any]]:  # assumes Tuple is included in the file's typing import
        """
        Process a single cluster to generate explanations.

        Args:
            label (int): Cluster label.
            texts (List[str]): List of texts.
            labels (np.ndarray): Array of cluster labels.
            tfidf_matrix: TF-IDF matrix (scipy sparse) for all texts.
            feature_names (np.ndarray): Feature names from TF-IDF vectorizer.

        Returns:
            Tuple[int, Dict[str, Any]]: Cluster label and its explanation.
        """
        cluster_texts = [text for text, l in zip(texts, labels) if l == label]
        cluster_indices = np.where(labels == label)[0]

        explanation = {
            'size': len(cluster_texts),
            'key_terms': self._get_key_terms(
                tfidf_matrix[cluster_indices],
                feature_names
            ),
            'entities': self._extract_entities(cluster_texts),
            'summary_stats': self._calculate_summary_stats(cluster_texts)
        }

        return label, explanation
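Review note: `pool.starmap` on the bound method pickles the whole `ClusterExplainer` instance for each worker, spaCy pipeline included, plus the sparse `tfidf_matrix` per task, so for small corpora the serialization cost can outweigh the parallel speedup. On spawn-based platforms (Windows, macOS default) any caller must also sit under an `if __name__ == '__main__':` guard; benchmarking against the previous single-process loop on a realistic corpus would be worthwhile.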

    def _get_key_terms(
        self,
        cluster_tfidf: np.ndarray,
        feature_names: np.ndarray,
        top_n: int = 5
    ) -> List[Dict[str, float]]:
        """
        Extract key terms using TF-IDF scores.

        Args:
            cluster_tfidf (np.ndarray): TF-IDF matrix for the cluster.
            feature_names (np.ndarray): Feature names from TF-IDF vectorizer.
            top_n (int, optional): Number of top terms to extract. Defaults to 5.

        Returns:
            List[Dict[str, float]]: List of key terms and their scores.
        """
        avg_tfidf = np.asarray(cluster_tfidf.mean(axis=0)).ravel()
        top_indices = avg_tfidf.argsort()[-top_n:][::-1]

@@ -70,7 +124,15 @@ def _get_key_terms(
        ]
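The top-term selection above hinges on the `argsort()[-top_n:][::-1]` idiom; a standalone check of that pattern with toy scores (illustrative values only):

import numpy as np

avg_tfidf = np.array([0.1, 0.5, 0.3, 0.9, 0.2])
top_indices = avg_tfidf.argsort()[-3:][::-1]  # indices of the 3 largest scores, descending
print(top_indices)  # [3 1 2]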

    def _extract_entities(self, texts: List[str]) -> Dict[str, List[str]]:
        """
        Extract named entities from cluster texts.

        Args:
            texts (List[str]): List of texts in the cluster.

        Returns:
            Dict[str, List[str]]: Most frequent named entities in the cluster.
        """
        entities = {'ORG': [], 'PERSON': [], 'GPE': [], 'TOPIC': []}

        for text in texts:
@@ -86,7 +148,15 @@ def _extract_entities(self, texts: List[str]) -> Dict[str, List[str]]:
        }

    def _calculate_summary_stats(self, texts: List[str]) -> Dict[str, float]:
        """
        Calculate summary statistics for cluster texts.

        Args:
            texts (List[str]): List of texts in the cluster.

        Returns:
            Dict[str, float]: Summary statistics for the cluster texts.
        """
        lengths = [len(text.split()) for text in texts]
        return {
            'avg_length': float(np.mean(lengths)),
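To close the review, a minimal end-to-end sketch of the new parallel path. This is hypothetical: the empty config assumes `ClusterExplainer` falls back to vectorizer defaults (the real settings live in config.yaml), the texts and labels are toy values, and `en_core_web_sm` must be installed:

import numpy as np

if __name__ == '__main__':  # guard required because explain_clusters spawns worker processes
    explainer = ClusterExplainer(config={})  # hypothetical minimal config
    texts = ['apple banana smoothie', 'car engine repair',
             'fruit salad recipe', 'highway traffic report']
    labels = np.array([0, 1, 0, 1])
    print(explainer.explain_clusters(texts, labels))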