Improve cohesion and consistency in the codebase #4

Open · wants to merge 3 commits into base: main
4 changes: 0 additions & 4 deletions requirements.txt
@@ -72,7 +72,3 @@ language-tool-python>=2.7.1 # For grammar checking

 # Domain-agnostic processing
 textacy>=0.12.0
-
-# Add these lines to requirements.txt
-beautifulsoup4>=4.9.3
-lxml>=4.9.0  # Optional but recommended parser for bs4
4 changes: 4 additions & 0 deletions src/cluster_manager.py
@@ -3,9 +3,13 @@
 from sklearn.metrics import silhouette_score
 
 class ClusterManager:
+    """
+    Manages dynamic clustering operations with adaptive algorithm selection.
+    """
     def __init__(self, config):
         self.config = config
         self.clusterer = None
+        self.method = self.config['clustering']['method']
 
     def perform_clustering(self, embeddings):
         """Perform clustering on embeddings and return labels."""
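Aside: the `__init__` above reads a nested config key. A minimal sketch of the shape it expects follows; only the `['clustering']['method']` lookup is confirmed by this diff, the value and import path are illustrative.

```python
# Minimal config sketch satisfying ClusterManager.__init__ as shown above.
# Only the ['clustering']['method'] lookup is confirmed by the diff.
from src.cluster_manager import ClusterManager  # assumed import path

config = {
    "clustering": {
        "method": "kmeans",  # illustrative value; the repo may support others
    }
}

manager = ClusterManager(config)
assert manager.method == "kmeans"
```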
34 changes: 30 additions & 4 deletions src/data_loader.py
@@ -8,6 +8,14 @@
 from sentence_transformers import SentenceTransformer
 
 class DataLoader:
+    """
+    DataLoader class to handle loading and processing of datasets.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for data loading.
+        logger (logging.Logger): Logger for logging information and errors.
+        batch_size (int): Batch size for data loading.
+    """
     def __init__(self, config: Dict[str, Any]):
         """Initialize DataLoader with configuration"""
         self.config = config
@@ -20,7 +28,12 @@ def __init__(self, config: Dict[str, Any]):
             self.config['data']['scisummnet_path'] = str(project_root / self.config['data']['scisummnet_path'])
 
     def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
issue (code-quality): Low code quality found in DataLoader.load_all_datasets - 25% (low-code-quality)

Explanation: The quality score for this function is below the quality threshold of 25%.
This score is a combination of method length, cognitive complexity, and working memory.

How can you solve this?

It might be worth refactoring this function to make it shorter and more readable (see the sketch after this list).

  • Reduce the function length by extracting pieces of functionality into
    their own functions. This is the most important thing you can do - ideally a
    function should be less than 10 lines.
  • Reduce nesting, perhaps by introducing guard clauses to return early.
  • Ensure that variables are tightly scoped, so that code using related concepts
    sits together within the function rather than being scattered.
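A minimal sketch of that refactor, assuming hypothetical per-dataset helpers — the `_load_xlsum` name and the `enabled` config keys are illustrative, not taken from this repository:

```python
from typing import Any, Dict
import pandas as pd

def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
    """Sketch: one short guard-clause branch per dataset source."""
    datasets: Dict[str, pd.DataFrame] = {}
    if self.config['data'].get('xlsum', {}).get('enabled'):
        datasets['xlsum'] = self._load_xlsum()  # hypothetical helper
    if self.config['data'].get('scisummnet', {}).get('enabled'):
        df = self.load_scisummnet(self.config['data']['scisummnet_path'])
        if df is not None:  # guard clause: skip failed loads early
            datasets['scisummnet'] = df
    return datasets
```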

"""Load all configured datasets."""
"""
Load all configured datasets.

Returns:
Dict[str, pd.DataFrame]: Dictionary of loaded datasets.
"""
datasets = {}

# Load XL-Sum dataset if enabled
@@ -70,7 +83,15 @@ def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
         return datasets
 
     def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]:
issue (complexity): Consider consolidating the two ScisummNet loading methods into a single parameterized function.

The duplicate implementations of ScisummNet loading can be consolidated into a single method while maintaining all functionality:

def load_scisummnet(self, path: Optional[str] = None, log_level: str = 'error') -> Optional[pd.DataFrame]:
    """
    Load ScisummNet dataset from specified path or config.

    Args:
        path (Optional[str]): Override path to dataset. Uses config path if None.
        log_level (str): Logging level to use ('error' or 'warning')

    Returns:
        Optional[pd.DataFrame]: DataFrame containing the loaded data.
    """
    try:
        dataset_path = path or self.config['data']['scisummnet_path']
        self.logger.info(f"Loading ScisummNet dataset from {dataset_path}...")

        top1000_dir = Path(dataset_path) / 'top1000_complete'
        if not top1000_dir.exists():
            log_fn = self.logger.error if log_level == 'error' else self.logger.warning
            log_fn(f"Directory not found: {top1000_dir}")
            return None

        # Rest of implementation...

This consolidation:

  1. Eliminates code duplication
  2. Maintains different logging behaviors via the log_level parameter
  3. Preserves all existing functionality
  4. Makes the relationship between the two use cases explicit
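If adopted, the two current call sites would collapse to something like the following — `loader` and `some_path` are hypothetical names for illustration:

```python
# Explicit-path call, erroring on a missing directory (old load_scisummnet)
df = loader.load_scisummnet(path=some_path, log_level='error')

# Config-driven call, warning instead (old load_scisummnet_dataset)
df = loader.load_scisummnet(log_level='warning')
```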

"""Load ScisummNet dataset from local directory"""
"""
Load ScisummNet dataset from local directory.

Args:
path (str): Path to the ScisummNet dataset.

Returns:
Optional[pd.DataFrame]: DataFrame containing the loaded data.
"""
try:
self.logger.info(f"Loading ScisummNet dataset from {path}...")
data = []
@@ -130,7 +151,12 @@ def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]:
             return None
 
     def load_scisummnet_dataset(self) -> pd.DataFrame:
issue: Return type annotation should be Optional[pd.DataFrame] since the function can return None on error
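Concretely, the fix would just widen the annotation to match the `except` branch:

```python
from typing import Optional
import pandas as pd

def load_scisummnet_dataset(self) -> Optional[pd.DataFrame]:
    """May return None when loading fails (see the except branch below)."""
    ...
```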

"""Load ScisummNet dataset."""
"""
Load ScisummNet dataset.

Returns:
pd.DataFrame: DataFrame containing the loaded data.
"""
try:
self.logger.info(f"Loading ScisummNet dataset from {self.config['data']['scisummnet_path']}...")

@@ -180,4 +206,4 @@ def load_scisummnet_dataset(self) -> pd.DataFrame:

         except Exception as e:
             self.logger.error(f"Error loading ScisummNet dataset: {e}")
-            return None
\ No newline at end of file
+            return None
4 changes: 3 additions & 1 deletion src/main.py
@@ -40,7 +40,6 @@

 import torch
 import multiprocessing
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from utils.style_selector import determine_cluster_style, get_style_parameters
 from utils.metrics_utils import calculate_cluster_variance, calculate_lexical_diversity, calculate_cluster_metrics
 from datasets import load_dataset
@@ -203,6 +202,9 @@ def get_optimal_batch_size():
     return 16  # Default for CPU
 
 def main():
+    """
+    Main function to run the entire pipeline.
+    """
     # Initialize logger first
     logger = logging.getLogger(__name__)
 
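The truncated hunk above ends with `return 16  # Default for CPU`, which implies hardware-dependent sizing. A sketch consistent with that tail — the GPU branch and the thresholds are assumptions, not taken from this diff:

```python
import torch

def get_optimal_batch_size() -> int:
    """Pick a batch size from available hardware (illustrative sketch)."""
    if torch.cuda.is_available():
        # Assumption: scale batch size with total GPU memory; numbers illustrative
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        return 64 if total_gb >= 16 else 32
    return 16  # Default for CPU (matches the diff above)
```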
2 changes: 1 addition & 1 deletion src/main_with_training.py
@@ -180,4 +180,4 @@ def generate_summaries(cluster_texts: Dict[str, List[str]], config: Dict) -> Lis
     return summaries
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
20 changes: 19 additions & 1 deletion src/preprocessor.py
@@ -11,6 +11,16 @@
 from transformers import AutoTokenizer
 
 class TextPreprocessor:
+    """
+    TextPreprocessor class to handle text preprocessing tasks.
+
+    Attributes:
+        language (str): Language for stopwords and lemmatization.
+        logger (logging.Logger): Logger for logging information and errors.
+        nlp (spacy.lang): SpaCy language model.
+        stopwords (set): Set of stopwords for the specified language.
+        lemmatizer (nltk.WordNetLemmatizer): Lemmatizer for word normalization.
+    """
     def __init__(self, language: str = 'english'):
         """Initialize the preprocessor with specified language."""
         self.logger = logging.getLogger(__name__)
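The hunk truncates after the logger line; an `__init__` consistent with the documented attributes might look like this — a sketch only, where the spaCy model name and the NLTK calls are assumptions:

```python
import logging
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    def __init__(self, language: str = 'english'):
        """Initialize the preprocessor with the specified language."""
        self.logger = logging.getLogger(__name__)
        self.language = language
        self.nlp = spacy.load('en_core_web_sm')          # assumed model name
        self.stopwords = set(stopwords.words(language))  # NLTK stopword list
        self.lemmatizer = WordNetLemmatizer()            # word normalization
```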
@@ -177,6 +187,14 @@ def process_document(self, doc: Dict[str, str]) -> Dict[str, Any]:
         return processed
 
 class DomainAgnosticPreprocessor:
+    """
+    DomainAgnosticPreprocessor class to handle domain-agnostic text preprocessing tasks.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for preprocessing.
+        logger (logging.Logger): Logger for logging information and errors.
+        tokenizer (transformers.AutoTokenizer): Tokenizer for text tokenization.
+    """
     def __init__(self, config: Dict[str, Any]):
         """Initialize with configuration."""
         self.config = config
@@ -234,4 +252,4 @@ def preprocess_text(self, text: str) -> str:
print("\nScisummNet Processing Complete:")
print(f"Total documents: {len(processed_sci)}")
print("Sample processed text:")
print(processed_sci['processed_text'].iloc[0][:200])
print(processed_sci['processed_text'].iloc[0][:200])