Enhance code review for ML/DL/AI project #6

Open · wants to merge 1 commit into main
95 changes: 95 additions & 0 deletions info.md
@@ -0,0 +1,95 @@
tree -I "__pycache__"
.
├── README.md
├── __init__.py
├── config
│ ├── config.yaml
│ └── visualization_config.yaml
├── data
│ ├── input
│ ├── output
│ └── processed
├── logs
│ └── pipeline.log
├── outputs
│ ├── checkpoints
│ └── figures
├── plan.md
├── requirements.txt
├── run.py
├── scripts
│ └── validate_dependencies.py
├── setup.py
├── src
│ ├── __init__.py
│ ├── cli.py
│ ├── cluster_manager.py
│ ├── clustering
│ │ ├── __init__.py
│ │ ├── attention_clustering.py
│ │ ├── cluster_explainer.py
│ │ ├── cluster_manager.py
│ │ ├── clustering_utils.py
│ │ ├── dynamic_cluster_manager.py
│ │ ├── dynamic_clusterer.py
│ │ ├── graph_clusterer.py
│ │ ├── hybrid_cluster_manager.py
│ │ └── streaming_manager.py
│ ├── dashboard
│ │ └── app.py
│ ├── data_exploration.py
│ ├── data_loader
│ │ └── flexible_loader.py
│ ├── data_loader.py
│ ├── data_preparation.py
│ ├── data_validator.py
│ ├── embedding_generator.py
│ ├── evaluation
│ │ ├── __init__.py
│ │ ├── cluster_evaluator.py
│ │ ├── eval_pipeline.py
│ │ ├── metrics.py
│ │ └── pipeline_evaluator.py
│ ├── main.py
│ ├── main_with_training.py
│ ├── personalization
│ │ └── user_config.py
│ ├── preprocessing
│ │ └── domain_agnostic_preprocessor.py
│ ├── preprocessor.py
│ ├── summarization
│ │ ├── __init__.py
│ │ ├── adaptive_style.py
│ │ ├── adaptive_summarizer.py
│ │ ├── enhanced_summarizer.py
│ │ ├── hybrid_summarizer.py
│ │ ├── model_trainer.py
│ │ └── summarizer.py
│ ├── synsearch.code-workspace
│ ├── utils
│ │ ├── __init__.py
│ │ ├── checkpoint_manager.py
│ │ ├── cluster_selector.py
│ │ ├── logging_config.py
│ │ ├── logging_utils.py
│ │ ├── metrics_calculator.py
│ │ ├── metrics_utils.py
│ │ ├── model_utils.py
│ │ └── style_selector.py
│ └── visualization
│ ├── __init__.py
│ ├── cluster_visualizer.py
│ └── embedding_visualizer.py
└── tests
├── __init__.py
├── test_data_loader.py
├── test_data_pipeline.py
├── test_data_validator.py
├── test_embedding_generator.py
├── test_embedding_visualizer.py
├── test_enhanced_pipeline.py
├── test_evaluation_metrics.py
├── test_integration.py
└── test_summarizer.py

21 directories, 70 files
44 changes: 43 additions & 1 deletion src/clustering/attention_clustering.py
@@ -2,6 +2,7 @@
import torch.nn as nn
from typing import List, Dict, Optional
import numpy as np
from torch.utils.data import DataLoader, Dataset

class AttentionRefiner(nn.Module):
"""Refines embeddings using self-attention before clustering."""
@@ -29,9 +30,50 @@ def forward(self, embeddings: torch.Tensor) -> torch.Tensor:

        return attn_output.squeeze(0)

class EmbeddingDataset(Dataset):
    """Custom Dataset for embeddings."""

    def __init__(self, embeddings: np.ndarray):
        self.embeddings = embeddings

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx]
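Review note: `DataLoader`'s default collate converts the numpy rows returned by `__getitem__` into one stacked `torch.Tensor` per batch, which is what `refine_embeddings` below relies on. A quick standalone check of that behavior (illustrative shapes only, using the `EmbeddingDataset` class from this diff):

import numpy as np
from torch.utils.data import DataLoader

ds = EmbeddingDataset(np.zeros((5, 4), dtype=np.float32))
batch = next(iter(DataLoader(ds, batch_size=2)))
print(type(batch), batch.shape)  # <class 'torch.Tensor'> torch.Size([2, 4])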

class HybridClusteringModule:
    """Combines attention-refined embeddings with dynamic clustering."""

    def __init__(self, embedding_dim: int, device: Optional[str] = None):
        """
        Initialize the HybridClusteringModule with embedding dimension and device.

        Args:
            embedding_dim (int): Dimension of the embeddings.
            device (Optional[str], optional): Device to use for computation. Defaults to None.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.attention_refiner = AttentionRefiner(embedding_dim).to(self.device)

    def refine_embeddings(self, embeddings: np.ndarray, batch_size: int = 32) -> np.ndarray:
        """
        Refine embeddings using self-attention in batches.

        Args:
            embeddings (np.ndarray): Array of embeddings.
            batch_size (int, optional): Batch size for processing. Defaults to 32.

        Returns:
            np.ndarray: Refined embeddings.
        """
        dataset = EmbeddingDataset(embeddings)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        refined_embeddings = []
        with torch.no_grad():  # inference only; avoids building an autograd graph per batch
            for batch in dataloader:
                batch = batch.to(self.device)
                refined_batch = self.attention_refiner(batch)
                refined_embeddings.append(refined_batch.cpu().numpy())

        return np.concatenate(refined_embeddings, axis=0)
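For reviewers, a minimal usage sketch of the new module (not part of this diff; the 384-dim shape and random data are illustrative assumptions):

import numpy as np

# float32 matches the default parameter dtype of the nn.Module weights.
embeddings = np.random.rand(100, 384).astype(np.float32)
module = HybridClusteringModule(embedding_dim=384)
refined = module.refine_embeddings(embeddings, batch_size=32)
print(refined.shape)  # (100, 384), assuming AttentionRefiner preserves the embedding dim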
110 changes: 90 additions & 20 deletions src/clustering/cluster_explainer.py
@@ -4,11 +4,18 @@
import spacy
from collections import Counter
import logging
from multiprocessing import Pool, cpu_count

class ClusterExplainer:
    """Explains cluster characteristics and key features."""

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the ClusterExplainer with configuration settings.

        Args:
            config (Dict[str, Any]): Configuration dictionary.
        """
        self.config = config
        self.logger = logging.getLogger(__name__)
        self.nlp = spacy.load('en_core_web_sm')
@@ -22,7 +29,16 @@ def explain_clusters(
        texts: List[str],
        labels: np.ndarray
    ) -> Dict[str, Dict[str, Any]]:
        """
        Generate explanations for each cluster.

        Args:
            texts (List[str]): List of texts.
            labels (np.ndarray): Array of cluster labels.

        Returns:
            Dict[str, Dict[str, Any]]: Explanations for each cluster.
        """
        try:
            explanations = {}
            unique_labels = np.unique(labels)
@@ -31,36 +47,74 @@
            tfidf_matrix = self.vectorizer.fit_transform(texts)
            feature_names = self.vectorizer.get_feature_names_out()

            # Parallelize per-cluster work; the noise cluster (label == -1) is skipped.
            with Pool(processes=cpu_count()) as pool:
                results = pool.starmap(
                    self._process_cluster,
                    [(label, texts, labels, tfidf_matrix, feature_names)
                     for label in unique_labels if label != -1]
                )

            for label, explanation in results:
                explanations[str(label)] = explanation

            return explanations

        except Exception as e:
            self.logger.error(f"Error generating explanations: {e}")
            raise

    def _process_cluster(
        self,
        label: int,
        texts: List[str],
        labels: np.ndarray,
        tfidf_matrix: np.ndarray,
        feature_names: np.ndarray
    ) -> Tuple[int, Dict[str, Any]]:  # assumes Tuple is included in the file's typing import
        """
        Process a single cluster to generate explanations.

        Args:
            label (int): Cluster label.
            texts (List[str]): List of texts.
            labels (np.ndarray): Array of cluster labels.
            tfidf_matrix: TF-IDF matrix (scipy sparse) for all texts.
            feature_names (np.ndarray): Feature names from TF-IDF vectorizer.

        Returns:
            Tuple[int, Dict[str, Any]]: Cluster label and its explanation.
        """
        cluster_texts = [text for text, l in zip(texts, labels) if l == label]
        cluster_indices = np.where(labels == label)[0]

        explanation = {
            'size': len(cluster_texts),
            'key_terms': self._get_key_terms(
                tfidf_matrix[cluster_indices],
                feature_names
            ),
            'entities': self._extract_entities(cluster_texts),
            'summary_stats': self._calculate_summary_stats(cluster_texts)
        }

        return label, explanation
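Review note: `pool.starmap` on the bound method pickles the whole `ClusterExplainer` instance for each worker, spaCy pipeline included, plus the sparse `tfidf_matrix` per task, so for small corpora the serialization cost can outweigh the parallel speedup. On spawn-based platforms (Windows, macOS default) any caller must also sit under an `if __name__ == '__main__':` guard; benchmarking against the previous single-process loop on a realistic corpus would be worthwhile.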

    def _get_key_terms(
        self,
        cluster_tfidf: np.ndarray,
        feature_names: np.ndarray,
        top_n: int = 5
    ) -> List[Dict[str, float]]:
        """
        Extract key terms using TF-IDF scores.

        Args:
            cluster_tfidf (np.ndarray): TF-IDF matrix for the cluster.
            feature_names (np.ndarray): Feature names from TF-IDF vectorizer.
            top_n (int, optional): Number of top terms to extract. Defaults to 5.

        Returns:
            List[Dict[str, float]]: List of key terms and their scores.
        """
        avg_tfidf = np.asarray(cluster_tfidf.mean(axis=0)).ravel()
        top_indices = avg_tfidf.argsort()[-top_n:][::-1]

@@ -70,7 +124,15 @@ def _get_key_terms(
        ]
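The top-term selection above hinges on the `argsort()[-top_n:][::-1]` idiom; a standalone check of that pattern with toy scores (illustrative values only):

import numpy as np

avg_tfidf = np.array([0.1, 0.5, 0.3, 0.9, 0.2])
top_indices = avg_tfidf.argsort()[-3:][::-1]  # indices of the 3 largest scores, descending
print(top_indices)  # [3 1 2]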

    def _extract_entities(self, texts: List[str]) -> Dict[str, List[str]]:
        """
        Extract named entities from cluster texts.

        Args:
            texts (List[str]): List of texts in the cluster.

        Returns:
            Dict[str, List[str]]: Most frequent named entities in the cluster.
        """
        entities = {'ORG': [], 'PERSON': [], 'GPE': [], 'TOPIC': []}

        for text in texts:
@@ -86,7 +148,15 @@ def _extract_entities(self, texts: List[str]) -> Dict[str, List[str]]:
        }

    def _calculate_summary_stats(self, texts: List[str]) -> Dict[str, float]:
        """
        Calculate summary statistics for cluster texts.

        Args:
            texts (List[str]): List of texts in the cluster.

        Returns:
            Dict[str, float]: Summary statistics for the cluster texts.
        """
        lengths = [len(text.split()) for text in texts]
        return {
            'avg_length': float(np.mean(lengths)),
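To close the review, a minimal end-to-end sketch of the new parallel path. This is hypothetical: the empty config assumes `ClusterExplainer` falls back to vectorizer defaults (the real settings live in config.yaml), the texts and labels are toy values, and `en_core_web_sm` must be installed:

import numpy as np

if __name__ == '__main__':  # guard required because explain_clusters spawns worker processes
    explainer = ClusterExplainer(config={})  # hypothetical minimal config
    texts = ['apple banana smoothie', 'car engine repair',
             'fruit salad recipe', 'highway traffic report']
    labels = np.array([0, 1, 0, 1])
    print(explainer.explain_clusters(texts, labels))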