-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve cohesion and consistency in the codebase #4
base: main
Are you sure you want to change the base?
Changes from all commits
75ecef9
125142b
96c8b3c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,14 @@ | |
from sentence_transformers import SentenceTransformer | ||
|
||
class DataLoader: | ||
""" | ||
DataLoader class to handle loading and processing of datasets. | ||
|
||
Attributes: | ||
config (Dict[str, Any]): Configuration dictionary for data loading. | ||
logger (logging.Logger): Logger for logging information and errors. | ||
batch_size (int): Batch size for data loading. | ||
""" | ||
def __init__(self, config: Dict[str, Any]): | ||
"""Initialize DataLoader with configuration""" | ||
self.config = config | ||
|
@@ -20,7 +28,12 @@ def __init__(self, config: Dict[str, Any]): | |
self.config['data']['scisummnet_path'] = str(project_root / self.config['data']['scisummnet_path']) | ||
|
||
def load_all_datasets(self) -> Dict[str, pd.DataFrame]: | ||
"""Load all configured datasets.""" | ||
""" | ||
Load all configured datasets. | ||
|
||
Returns: | ||
Dict[str, pd.DataFrame]: Dictionary of loaded datasets. | ||
""" | ||
datasets = {} | ||
|
||
# Load XL-Sum dataset if enabled | ||
|
@@ -70,7 +83,15 @@ def load_all_datasets(self) -> Dict[str, pd.DataFrame]: | |
return datasets | ||
|
||
def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue (complexity): Consider consolidating the two ScisummNet loading methods into a single parameterized function. The duplicate implementations of ScisummNet loading can be consolidated into a single method while maintaining all functionality: def load_scisummnet(self, path: Optional[str] = None, log_level: str = 'error') -> Optional[pd.DataFrame]:
"""
Load ScisummNet dataset from specified path or config.
Args:
path (Optional[str]): Override path to dataset. Uses config path if None.
log_level (str): Logging level to use ('error' or 'warning')
Returns:
Optional[pd.DataFrame]: DataFrame containing the loaded data.
"""
try:
dataset_path = path or self.config['data']['scisummnet_path']
self.logger.info(f"Loading ScisummNet dataset from {dataset_path}...")
top1000_dir = Path(dataset_path) / 'top1000_complete'
if not top1000_dir.exists():
log_fn = self.logger.error if log_level == 'error' else self.logger.warning
log_fn(f"Directory not found: {top1000_dir}")
return None
# Rest of implementation... This consolidation:
|
||
"""Load ScisummNet dataset from local directory""" | ||
""" | ||
Load ScisummNet dataset from local directory. | ||
|
||
Args: | ||
path (str): Path to the ScisummNet dataset. | ||
|
||
Returns: | ||
Optional[pd.DataFrame]: DataFrame containing the loaded data. | ||
""" | ||
try: | ||
self.logger.info(f"Loading ScisummNet dataset from {path}...") | ||
data = [] | ||
|
@@ -130,7 +151,12 @@ def load_scisummnet(self, path: str) -> Optional[pd.DataFrame]: | |
return None | ||
|
||
def load_scisummnet_dataset(self) -> pd.DataFrame: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue: Return type annotation should be Optional[pd.DataFrame] since the function can return None on error |
||
"""Load ScisummNet dataset.""" | ||
""" | ||
Load ScisummNet dataset. | ||
|
||
Returns: | ||
pd.DataFrame: DataFrame containing the loaded data. | ||
""" | ||
try: | ||
self.logger.info(f"Loading ScisummNet dataset from {self.config['data']['scisummnet_path']}...") | ||
|
||
|
@@ -180,4 +206,4 @@ def load_scisummnet_dataset(self) -> pd.DataFrame: | |
|
||
except Exception as e: | ||
self.logger.error(f"Error loading ScisummNet dataset: {e}") | ||
return None | ||
return None |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue (code-quality): Low code quality found in DataLoader.load_all_datasets - 25% (
low-code-quality
)Explanation
The quality score for this function is below the quality threshold of 25%.This score is a combination of the method length, cognitive complexity and working memory.
How can you solve this?
It might be worth refactoring this function to make it shorter and more readable.
their own functions. This is the most important thing you can do - ideally a
function should be less than 10 lines.
sits together within the function rather than being scattered.