
Commit

Refactor configuration and main script; update dataset paths, adjust clustering parameters, and improve import handling for better module compatibility
stochastic-sisyphus authored Dec 10, 2024
1 parent ec99c51 commit 9979cec
Showing 3 changed files with 46 additions and 31 deletions.
18 changes: 4 additions & 14 deletions config/config.yaml
@@ -4,22 +4,12 @@ data:
output_path: "data/output"
processed_path: "data/processed"
batch_size: 32
scisummnet_path: "data/scisummnet_release1.1__20190413"
scisummnet_path: "data/scisummnet"
datasets:
- name: "xlsum"
source: "huggingface"
enabled: true
language: "english"
dataset_name: "GEM/xlsum"
- name: "scisummnet"
source: "local"
enabled: true
file_patterns:
xml: "{paper_id}.xml"
summary: "{paper_id}.gold.txt"
subdirs:
documents: "Documents_xml"
summaries: "summary"
dataset_name: "xlsum"

# Preprocessing Configuration
preprocessing:
@@ -47,7 +37,7 @@ clustering:
metric: "euclidean"
params:
min_cluster_size: 5
min_samples: 5
min_samples: 3
metric: "euclidean"
output_dir: "outputs/clusters"

@@ -58,7 +48,7 @@ clustering:
resolution: 1.0

streaming:
enabled: true
enabled: false
buffer_size: 100
update_interval: 60 # seconds

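For orientation, here is a minimal sketch of how the adjusted clustering block might be consumed. The nesting of params under clustering follows the diff above; the use of hdbscan.HDBSCAN is only an assumption suggested by the min_cluster_size/min_samples/metric parameter names, not something this commit confirms.

# Sketch only: read config/config.yaml and hand the clustering params to HDBSCAN.
# hdbscan as the backend is an assumption inferred from the parameter names.
import yaml
import hdbscan

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

params = config["clustering"]["params"]
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=params["min_cluster_size"],
    min_samples=params["min_samples"],   # 3 after this commit
    metric=params["metric"],             # "euclidean"
)
# labels = clusterer.fit_predict(embeddings)  # embeddings: (n_samples, dim) ndarray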
5 changes: 5 additions & 0 deletions run.py
@@ -0,0 +1,5 @@
#!/usr/bin/env python3
from src.main import main

if __name__ == "__main__":
main()
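run.py is a thin wrapper meant to be launched from the repository root, which puts the root on sys.path so that src resolves as a package; a usage sketch (the presence of src/__init__.py is my assumption, it is not shown in this commit):

# From the repository root (assumes src/ contains an __init__.py):
#   python run.py
# which is equivalent to doing the following in a session started at the root:
from src.main import main

main()  # runs the pipeline entry point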
54 changes: 37 additions & 17 deletions src/main.py
@@ -1,32 +1,52 @@
import os
import sys
from pathlib import Path

# Add project root to PYTHONPATH when running directly
if __name__ == "__main__":
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

# Update imports to absolute paths when running as script
from src.data_loader import DataLoader
from src.data_validator import DataValidator, ConfigValidator
from src.embedding_generator import EnhancedEmbeddingGenerator
from src.visualization.embedding_visualizer import EmbeddingVisualizer
from src.preprocessor import DomainAgnosticPreprocessor
from src.summarization.hybrid_summarizer import HybridSummarizer
from src.evaluation.metrics import EvaluationMetrics
from src.clustering.dynamic_cluster_manager import DynamicClusterManager
from src.utils.metrics_utils import calculate_cluster_metrics
from src.utils.style_selector import determine_cluster_style
from src.utils.logging_config import setup_logging
else:
# Use relative imports when imported as module
from .data_loader import DataLoader
from .data_validator import DataValidator, ConfigValidator
from .embedding_generator import EnhancedEmbeddingGenerator
from .visualization.embedding_visualizer import EmbeddingVisualizer
from .preprocessor import DomainAgnosticPreprocessor
from .summarization.hybrid_summarizer import HybridSummarizer
from .evaluation.metrics import EvaluationMetrics
from .clustering.dynamic_cluster_manager import DynamicClusterManager
from .utils.metrics_utils import calculate_cluster_metrics
from .utils.style_selector import determine_cluster_style
from .utils.logging_config import setup_logging

import yaml
import pandas as pd
import logging
import torch
import numpy as np
from tqdm import tqdm

from .data_loader import DataLoader
from .data_validator import DataValidator, ConfigValidator
from .embedding_generator import EnhancedEmbeddingGenerator
from .visualization.embedding_visualizer import EmbeddingVisualizer
from .preprocessor import DomainAgnosticPreprocessor
from .summarization.hybrid_summarizer import HybridSummarizer
from .evaluation.metrics import EvaluationMetrics
from .clustering.dynamic_cluster_manager import DynamicClusterManager
from .utils.metrics_utils import calculate_cluster_metrics
from .utils.style_selector import determine_cluster_style
from .utils.logging_config import setup_logging

import json # Add json import
import multiprocessing
from typing import List, Dict, Any
from datetime import datetime
from src.summarization.adaptive_summarizer import AdaptiveSummarizer
from src.utils.metrics_utils import calculate_cluster_metrics
from src.clustering.clustering_utils import process_clusters # Update import path
from .summarization.adaptive_summarizer import AdaptiveSummarizer
from .utils.metrics_utils import calculate_cluster_metrics
from .clustering.clustering_utils import process_clusters # Update import path

# Set up logging with absolute paths
log_dir = Path(__file__).parent.parent / "logs"
@@ -53,8 +73,8 @@
from utils.metrics_utils import calculate_cluster_variance, calculate_lexical_diversity, calculate_cluster_metrics
from datasets import load_dataset
from utils.metrics_calculator import MetricsCalculator
from src.summarization.adaptive_summarizer import AdaptiveSummarizer
from src.clustering.dynamic_cluster_manager import DynamicClusterManager
from .summarization.adaptive_summarizer import AdaptiveSummarizer
from .clustering.dynamic_cluster_manager import DynamicClusterManager
import random
import numpy as np

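The first main.py hunk breaks off right after log_dir is computed. Since src/utils/logging_config.py is not part of this diff, the following is a purely hypothetical sketch of what a setup_logging helper of that shape might look like; the signature and behaviour are assumptions, not the module's actual code.

# Hypothetical sketch only; the real setup_logging in src/utils/logging_config.py
# is not shown in this commit.
import logging
from pathlib import Path

def setup_logging(log_dir: Path, level: int = logging.INFO) -> None:
    """Create the log directory and attach file + console handlers."""
    log_dir.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
        handlers=[
            logging.FileHandler(log_dir / "pipeline.log"),
            logging.StreamHandler(),
        ],
    )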
