diff --git a/requirements.txt b/requirements.txt
index 8af3d88..6959654 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -72,7 +72,3 @@
 language-tool-python>=2.7.1  # For grammar checking
 # Domain-agnostic processing
 textacy>=0.12.0
-
-# Add these lines to requirements.txt
-beautifulsoup4>=4.9.3
-lxml>=4.9.0  # Optional but recommended parser for bs4
diff --git a/src/cluster_manager.py b/src/cluster_manager.py
index 6b4e411..c03cedc 100644
--- a/src/cluster_manager.py
+++ b/src/cluster_manager.py
@@ -3,9 +3,13 @@
 from sklearn.metrics import silhouette_score
 
 class ClusterManager:
+    """
+    Manages dynamic clustering operations with adaptive algorithm selection.
+    """
     def __init__(self, config):
         self.config = config
         self.clusterer = None
+        self.method = self.config['clustering']['method']
 
     def perform_clustering(self, embeddings):
         """Perform clustering on embeddings and return labels."""
diff --git a/src/data_loader.py b/src/data_loader.py
index a018073..a67c6d7 100644
--- a/src/data_loader.py
+++ b/src/data_loader.py
@@ -8,6 +8,14 @@
 from sentence_transformers import SentenceTransformer
 
 class DataLoader:
+    """
+    DataLoader class to handle loading and processing of datasets.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for data loading.
+        logger (logging.Logger): Logger for logging information and errors.
+        batch_size (int): Batch size for data loading.
+    """
     def __init__(self, config: Dict[str, Any]):
         """Initialize DataLoader with configuration"""
         self.config = config
@@ -20,7 +28,12 @@ def __init__(self, config: Dict[str, Any]):
         self.config['data']['scisummnet_path'] = str(project_root / self.config['data']['scisummnet_path'])
 
     def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
-        """Load all configured datasets."""
+        """
+        Load all configured datasets.
+
+        Returns:
+            Dict[str, pd.DataFrame]: Dictionary of loaded datasets.
+        """
         datasets = {}
 
         # Load XL-Sum dataset if enabled
@@ -70,7 +83,15 @@ def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
         return datasets
 
     def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]:
-        """Load ScisummNet dataset from local directory"""
+        """
+        Load ScisummNet dataset from local directory.
+
+        Args:
+            path (str): Path to the ScisummNet dataset.
+
+        Returns:
+            Optional[pd.DataFrame]: DataFrame containing the loaded data, or None if loading fails.
+        """
         try:
             self.logger.info(f"Loading ScisummNet dataset from {path}...")
             data = []
@@ -130,7 +151,12 @@ def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]:
         return None
 
     def load_scisummnet_dataset(self) -> pd.DataFrame:
-        """Load ScisummNet dataset."""
+        """
+        Load ScisummNet dataset.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the loaded data, or None if loading fails.
+ """ try: self.logger.info(f"Loading ScisummNet dataset from {self.config['data']['scisummnet_path']}...") @@ -180,4 +206,4 @@ def load_scisummnet_dataset(self) -> pd.DataFrame: except Exception as e: self.logger.error(f"Error loading ScisummNet dataset: {e}") - return None \ No newline at end of file + return None diff --git a/src/main.py b/src/main.py index 3e80d75..0a36097 100644 --- a/src/main.py +++ b/src/main.py @@ -40,7 +40,6 @@ import torch import multiprocessing -from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor from utils.style_selector import determine_cluster_style, get_style_parameters from utils.metrics_utils import calculate_cluster_variance, calculate_lexical_diversity, calculate_cluster_metrics from datasets import load_dataset @@ -203,6 +202,9 @@ def get_optimal_batch_size(): return 16 # Default for CPU def main(): + """ + Main function to run the entire pipeline. + """ # Initialize logger first logger = logging.getLogger(__name__) diff --git a/src/main_with_training.py b/src/main_with_training.py index 7ac700f..30e3edb 100644 --- a/src/main_with_training.py +++ b/src/main_with_training.py @@ -180,4 +180,4 @@ def generate_summaries(cluster_texts: Dict[str, List[str]], config: Dict) -> Lis return summaries if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/preprocessor.py b/src/preprocessor.py index 4170085..2a3329a 100644 --- a/src/preprocessor.py +++ b/src/preprocessor.py @@ -11,6 +11,16 @@ from transformers import AutoTokenizer class TextPreprocessor: + """ + TextPreprocessor class to handle text preprocessing tasks. + + Attributes: + language (str): Language for stopwords and lemmatization. + logger (logging.Logger): Logger for logging information and errors. + nlp (spacy.lang): SpaCy language model. + stopwords (set): Set of stopwords for the specified language. + lemmatizer (nltk.WordNetLemmatizer): Lemmatizer for word normalization. + """ def __init__(self, language: str = 'english'): """Initialize the preprocessor with specified language.""" self.logger = logging.getLogger(__name__) @@ -177,6 +187,14 @@ def process_document(self, doc: Dict[str, str]) -> Dict[str, Any]: return processed class DomainAgnosticPreprocessor: + """ + DomainAgnosticPreprocessor class to handle domain-agnostic text preprocessing tasks. + + Attributes: + config (Dict[str, Any]): Configuration dictionary for preprocessing. + logger (logging.Logger): Logger for logging information and errors. + tokenizer (transformers.AutoTokenizer): Tokenizer for text tokenization. + """ def __init__(self, config: Dict[str, Any]): """Initialize with configuration.""" self.config = config @@ -234,4 +252,4 @@ def preprocess_text(self, text: str) -> str: print("\nScisummNet Processing Complete:") print(f"Total documents: {len(processed_sci)}") print("Sample processed text:") - print(processed_sci['processed_text'].iloc[0][:200]) \ No newline at end of file + print(processed_sci['processed_text'].iloc[0][:200])