diff --git a/config/config.yaml b/config/config.yaml index c790702..a6c1999 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -4,22 +4,12 @@ data: output_path: "data/output" processed_path: "data/processed" batch_size: 32 - scisummnet_path: "data/scisummnet_release1.1__20190413" + scisummnet_path: "data/scisummnet" datasets: - name: "xlsum" - source: "huggingface" enabled: true language: "english" - dataset_name: "GEM/xlsum" - - name: "scisummnet" - source: "local" - enabled: true - file_patterns: - xml: "{paper_id}.xml" - summary: "{paper_id}.gold.txt" - subdirs: - documents: "Documents_xml" - summaries: "summary" + dataset_name: "xlsum" # Preprocessing Configuration preprocessing: @@ -47,7 +37,7 @@ clustering: metric: "euclidean" params: min_cluster_size: 5 - min_samples: 5 + min_samples: 3 metric: "euclidean" output_dir: "outputs/clusters" @@ -58,7 +48,7 @@ clustering: resolution: 1.0 streaming: - enabled: true + enabled: false buffer_size: 100 update_interval: 60 # seconds diff --git a/run.py b/run.py new file mode 100644 index 0000000..921df72 --- /dev/null +++ b/run.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +from src.main import main + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/main.py b/src/main.py index faeac84..241028b 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,38 @@ import os import sys from pathlib import Path + +# Add project root to PYTHONPATH when running directly +if __name__ == "__main__": + project_root = Path(__file__).parent.parent + sys.path.append(str(project_root)) + + # Update imports to absolute paths when running as script + from src.data_loader import DataLoader + from src.data_validator import DataValidator, ConfigValidator + from src.embedding_generator import EnhancedEmbeddingGenerator + from src.visualization.embedding_visualizer import EmbeddingVisualizer + from src.preprocessor import DomainAgnosticPreprocessor + from src.summarization.hybrid_summarizer import HybridSummarizer + from src.evaluation.metrics import EvaluationMetrics + from src.clustering.dynamic_cluster_manager import DynamicClusterManager + from src.utils.metrics_utils import calculate_cluster_metrics + from src.utils.style_selector import determine_cluster_style + from src.utils.logging_config import setup_logging +else: + # Use relative imports when imported as module + from .data_loader import DataLoader + from .data_validator import DataValidator, ConfigValidator + from .embedding_generator import EnhancedEmbeddingGenerator + from .visualization.embedding_visualizer import EmbeddingVisualizer + from .preprocessor import DomainAgnosticPreprocessor + from .summarization.hybrid_summarizer import HybridSummarizer + from .evaluation.metrics import EvaluationMetrics + from .clustering.dynamic_cluster_manager import DynamicClusterManager + from .utils.metrics_utils import calculate_cluster_metrics + from .utils.style_selector import determine_cluster_style + from .utils.logging_config import setup_logging + import yaml import pandas as pd import logging @@ -8,25 +40,13 @@ import numpy as np from tqdm import tqdm -from .data_loader import DataLoader -from .data_validator import DataValidator, ConfigValidator -from .embedding_generator import EnhancedEmbeddingGenerator -from .visualization.embedding_visualizer import EmbeddingVisualizer -from .preprocessor import DomainAgnosticPreprocessor -from .summarization.hybrid_summarizer import HybridSummarizer -from .evaluation.metrics import EvaluationMetrics -from .clustering.dynamic_cluster_manager import DynamicClusterManager -from .utils.metrics_utils import calculate_cluster_metrics -from .utils.style_selector import determine_cluster_style -from .utils.logging_config import setup_logging - import json # Add json import import multiprocessing from typing import List, Dict, Any from datetime import datetime -from src.summarization.adaptive_summarizer import AdaptiveSummarizer -from src.utils.metrics_utils import calculate_cluster_metrics -from src.clustering.clustering_utils import process_clusters # Update import path +from .summarization.adaptive_summarizer import AdaptiveSummarizer +from .utils.metrics_utils import calculate_cluster_metrics +from .clustering.clustering_utils import process_clusters # Update import path # Set up logging with absolute paths log_dir = Path(__file__).parent.parent / "logs" @@ -53,8 +73,8 @@ from utils.metrics_utils import calculate_cluster_variance, calculate_lexical_diversity, calculate_cluster_metrics from datasets import load_dataset from utils.metrics_calculator import MetricsCalculator -from src.summarization.adaptive_summarizer import AdaptiveSummarizer -from src.clustering.dynamic_cluster_manager import DynamicClusterManager +from .summarization.adaptive_summarizer import AdaptiveSummarizer +from .clustering.dynamic_cluster_manager import DynamicClusterManager import random import numpy as np