Improve cohesion and consistency in the codebase #4

Open · wants to merge 3 commits into base: main
4 changes: 0 additions & 4 deletions requirements.txt
@@ -72,7 +72,3 @@ language-tool-python>=2.7.1 # For grammar checking

 # Domain-agnostic processing
 textacy>=0.12.0
-
-# Add these lines to requirements.txt
-beautifulsoup4>=4.9.3
-lxml>=4.9.0  # Optional but recommended parser for bs4
4 changes: 4 additions & 0 deletions src/cluster_manager.py
@@ -3,9 +3,13 @@
 from sklearn.metrics import silhouette_score
 
 class ClusterManager:
+    """
+    Manages dynamic clustering operations with adaptive algorithm selection.
+    """
     def __init__(self, config):
         self.config = config
         self.clusterer = None
+        self.method = self.config['clustering']['method']
 
     def perform_clustering(self, embeddings):
         """Perform clustering on embeddings and return labels."""
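Aside: the `__init__` above reads a nested config key. A minimal sketch of the shape it expects follows; only the `['clustering']['method']` lookup is confirmed by this diff, the value and import path are illustrative.

```python
# Minimal config sketch satisfying ClusterManager.__init__ as shown above.
# Only the ['clustering']['method'] lookup is confirmed by the diff.
from src.cluster_manager import ClusterManager  # assumed import path

config = {
    "clustering": {
        "method": "kmeans",  # illustrative value; the repo may support others
    }
}

manager = ClusterManager(config)
assert manager.method == "kmeans"
```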
34 changes: 30 additions & 4 deletions src/data_loader.py
@@ -8,6 +8,14 @@
 from sentence_transformers import SentenceTransformer
 
 class DataLoader:
+    """
+    DataLoader class to handle loading and processing of datasets.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for data loading.
+        logger (logging.Logger): Logger for logging information and errors.
+        batch_size (int): Batch size for data loading.
+    """
     def __init__(self, config: Dict[str, Any]):
         """Initialize DataLoader with configuration"""
         self.config = config
@@ -20,7 +28,12 @@ def __init__(self, config: Dict[str, Any]):
             self.config['data']['scisummnet_path'] = str(project_root / self.config['data']['scisummnet_path'])
 
     def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
issue (code-quality): Low code quality found in DataLoader.load_all_datasets - 25% (low-code-quality)

Explanation: The quality score for this function is below the quality threshold of 25%.
This score is a combination of method length, cognitive complexity, and working memory.

How can you solve this?

It might be worth refactoring this function to make it shorter and more readable (see the sketch after this list).

  • Reduce the function length by extracting pieces of functionality into
    their own functions. This is the most important thing you can do - ideally a
    function should be less than 10 lines.
  • Reduce nesting, perhaps by introducing guard clauses to return early.
  • Ensure that variables are tightly scoped, so that code using related concepts
    sits together within the function rather than being scattered.
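A minimal sketch of that refactor, assuming hypothetical per-dataset helpers — the `_load_xlsum` name and the `enabled` config keys are illustrative, not taken from this repository:

```python
from typing import Any, Dict
import pandas as pd

def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
    """Sketch: one short guard-clause branch per dataset source."""
    datasets: Dict[str, pd.DataFrame] = {}
    if self.config['data'].get('xlsum', {}).get('enabled'):
        datasets['xlsum'] = self._load_xlsum()  # hypothetical helper
    if self.config['data'].get('scisummnet', {}).get('enabled'):
        df = self.load_scisummnet(self.config['data']['scisummnet_path'])
        if df is not None:  # guard clause: skip failed loads early
            datasets['scisummnet'] = df
    return datasets
```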

"""Load all configured datasets."""
"""
Load all configured datasets.

Returns:
Dict[str, pd.DataFrame]: Dictionary of loaded datasets.
"""
datasets = {}

# Load XL-Sum dataset if enabled
@@ -70,7 +83,15 @@ def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
         return datasets
 
     def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]:
issue (complexity): Consider consolidating the two ScisummNet loading methods into a single parameterized function.

The duplicate implementations of ScisummNet loading can be consolidated into a single method while maintaining all functionality:

def load_scisummnet(self, path: Optional[str] = None, log_level: str = 'error') -> Optional[pd.DataFrame]:
    """
    Load ScisummNet dataset from specified path or config.

    Args:
        path (Optional[str]): Override path to dataset. Uses config path if None.
        log_level (str): Logging level to use ('error' or 'warning')

    Returns:
        Optional[pd.DataFrame]: DataFrame containing the loaded data.
    """
    try:
        dataset_path = path or self.config['data']['scisummnet_path']
        self.logger.info(f"Loading ScisummNet dataset from {dataset_path}...")

        top1000_dir = Path(dataset_path) / 'top1000_complete'
        if not top1000_dir.exists():
            log_fn = self.logger.error if log_level == 'error' else self.logger.warning
            log_fn(f"Directory not found: {top1000_dir}")
            return None

        # Rest of implementation...

This consolidation:

  1. Eliminates code duplication
  2. Maintains different logging behaviors via the log_level parameter
  3. Preserves all existing functionality
  4. Makes the relationship between the two use cases explicit
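If adopted, the two current call sites would collapse to something like the following — `loader` and `some_path` are hypothetical names for illustration:

```python
# Explicit-path call, erroring on a missing directory (old load_scisummnet)
df = loader.load_scisummnet(path=some_path, log_level='error')

# Config-driven call, warning instead (old load_scisummnet_dataset)
df = loader.load_scisummnet(log_level='warning')
```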

"""Load ScisummNet dataset from local directory"""
"""
Load ScisummNet dataset from local directory.

Args:
path (str): Path to the ScisummNet dataset.

Returns:
Optional[pd.DataFrame]: DataFrame containing the loaded data.
"""
try:
self.logger.info(f"Loading ScisummNet dataset from {path}...")
data = []
@@ -130,7 +151,12 @@ def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]:
             return None
 
     def load_scisummnet_dataset(self) -> pd.DataFrame:
issue: Return type annotation should be Optional[pd.DataFrame] since the function can return None on error
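Concretely, the fix would just widen the annotation to match the `except` branch:

```python
from typing import Optional
import pandas as pd

def load_scisummnet_dataset(self) -> Optional[pd.DataFrame]:
    """May return None when loading fails (see the except branch below)."""
    ...
```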

"""Load ScisummNet dataset."""
"""
Load ScisummNet dataset.

Returns:
pd.DataFrame: DataFrame containing the loaded data.
"""
try:
self.logger.info(f"Loading ScisummNet dataset from {self.config['data']['scisummnet_path']}...")

@@ -180,4 +206,4 @@ def load_scisummnet_dataset(self) -> pd.DataFrame:

         except Exception as e:
             self.logger.error(f"Error loading ScisummNet dataset: {e}")
-            return None
\ No newline at end of file
+            return None
4 changes: 3 additions & 1 deletion src/main.py
@@ -40,7 +40,6 @@

 import torch
 import multiprocessing
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from utils.style_selector import determine_cluster_style, get_style_parameters
 from utils.metrics_utils import calculate_cluster_variance, calculate_lexical_diversity, calculate_cluster_metrics
 from datasets import load_dataset
@@ -203,6 +202,9 @@ def get_optimal_batch_size():
     return 16  # Default for CPU
 
 def main():
+    """
+    Main function to run the entire pipeline.
+    """
     # Initialize logger first
     logger = logging.getLogger(__name__)
 
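The truncated hunk above ends with `return 16  # Default for CPU`, which implies hardware-dependent sizing. A sketch consistent with that tail — the GPU branch and the thresholds are assumptions, not taken from this diff:

```python
import torch

def get_optimal_batch_size() -> int:
    """Pick a batch size from available hardware (illustrative sketch)."""
    if torch.cuda.is_available():
        # Assumption: scale batch size with total GPU memory; numbers illustrative
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        return 64 if total_gb >= 16 else 32
    return 16  # Default for CPU (matches the diff above)
```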
2 changes: 1 addition & 1 deletion src/main_with_training.py
@@ -180,4 +180,4 @@ def generate_summaries(cluster_texts: Dict[str, List[str]], config: Dict) -> Lis
     return summaries
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
20 changes: 19 additions & 1 deletion src/preprocessor.py
@@ -11,6 +11,16 @@
 from transformers import AutoTokenizer
 
 class TextPreprocessor:
+    """
+    TextPreprocessor class to handle text preprocessing tasks.
+
+    Attributes:
+        language (str): Language for stopwords and lemmatization.
+        logger (logging.Logger): Logger for logging information and errors.
+        nlp (spacy.lang): SpaCy language model.
+        stopwords (set): Set of stopwords for the specified language.
+        lemmatizer (nltk.WordNetLemmatizer): Lemmatizer for word normalization.
+    """
     def __init__(self, language: str = 'english'):
         """Initialize the preprocessor with specified language."""
         self.logger = logging.getLogger(__name__)
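The hunk truncates after the logger line; an `__init__` consistent with the documented attributes might look like this — a sketch only, where the spaCy model name and the NLTK calls are assumptions:

```python
import logging
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    def __init__(self, language: str = 'english'):
        """Initialize the preprocessor with the specified language."""
        self.logger = logging.getLogger(__name__)
        self.language = language
        self.nlp = spacy.load('en_core_web_sm')          # assumed model name
        self.stopwords = set(stopwords.words(language))  # NLTK stopword list
        self.lemmatizer = WordNetLemmatizer()            # word normalization
```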
@@ -177,6 +187,14 @@ def process_document(self, doc: Dict[str, str]) -> Dict[str, Any]:
         return processed
 
 class DomainAgnosticPreprocessor:
+    """
+    DomainAgnosticPreprocessor class to handle domain-agnostic text preprocessing tasks.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for preprocessing.
+        logger (logging.Logger): Logger for logging information and errors.
+        tokenizer (transformers.AutoTokenizer): Tokenizer for text tokenization.
+    """
     def __init__(self, config: Dict[str, Any]):
         """Initialize with configuration."""
         self.config = config
@@ -234,4 +252,4 @@ def preprocess_text(self, text: str) -> str:
print("\nScisummNet Processing Complete:")
print(f"Total documents: {len(processed_sci)}")
print("Sample processed text:")
print(processed_sci['processed_text'].iloc[0][:200])
print(processed_sci['processed_text'].iloc[0][:200])