From 3fd9b2ecf9a2580fb64a85c762a1f5b5fc63dab7 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:27:40 +0800 Subject: [PATCH 01/40] The OvStudent System --- OvStudent/config.json | 7 +++ OvStudent/config_manager.py | 26 +++++++++ OvStudent/metrics.py | 69 ++++++++++++++++++++++++ OvStudent/query_cache.py | 9 ++++ OvStudent/query_manager.py | 11 ++++ OvStudent/rag_logger.py | 31 +++++++++++ OvStudent/rag_system.log | 102 ++++++++++++++++++++++++++++++++++++ OvStudent/rate_limiter.py | 24 +++++++++ OvStudent/requirements.txt | 11 ++++ OvStudent/system_monitor.py | 26 +++++++++ OvStudent/ttl_cache.py | 23 ++++++++ 11 files changed, 339 insertions(+) create mode 100644 OvStudent/config.json create mode 100644 OvStudent/config_manager.py create mode 100644 OvStudent/metrics.py create mode 100644 OvStudent/query_cache.py create mode 100644 OvStudent/query_manager.py create mode 100644 OvStudent/rag_logger.py create mode 100644 OvStudent/rag_system.log create mode 100644 OvStudent/rate_limiter.py create mode 100644 OvStudent/requirements.txt create mode 100644 OvStudent/system_monitor.py create mode 100644 OvStudent/ttl_cache.py diff --git a/OvStudent/config.json b/OvStudent/config.json new file mode 100644 index 00000000..939d2c5a --- /dev/null +++ b/OvStudent/config.json @@ -0,0 +1,7 @@ +{ + "file_selection_model": "qwen2.5-coder:7b", + "query_processing_model": "gemini-2.0-flash-exp", + "rate_limit": 5, + "converted_jsons_directory": "/Users/kq_m3m/PycharmProjects/OVMaster/Converted_Jsons", + "annotated_scripts_directory": "/Users/kq_m3m/PycharmProjects/OVMaster/Converted_Scripts_Annotated" +} \ No newline at end of file diff --git a/OvStudent/config_manager.py b/OvStudent/config_manager.py new file mode 100644 index 00000000..9f6c8a06 --- /dev/null +++ b/OvStudent/config_manager.py @@ -0,0 +1,26 @@ +# config_manager.py + +import json +from pathlib import Path +import os + +class ConfigManager: + 
CONFIG_PATH = Path('config.json') + + @staticmethod + def load_config(): + if ConfigManager.CONFIG_PATH.exists(): + with open(ConfigManager.CONFIG_PATH, 'r') as f: + return json.load(f) + else: + return { + 'file_selection_model': 'qwen2.5-coder:3b', + 'query_processing_model': 'qwen2.5-coder:7b', + 'rate_limit': 5, + 'gemini_api_key': None + } + + @staticmethod + def save_config(config): + with open(ConfigManager.CONFIG_PATH, 'w') as f: + json.dump(config, f, indent=2) \ No newline at end of file diff --git a/OvStudent/metrics.py b/OvStudent/metrics.py new file mode 100644 index 00000000..32909f17 --- /dev/null +++ b/OvStudent/metrics.py @@ -0,0 +1,69 @@ +# metrics.py + +from prometheus_client import Counter, Histogram, Gauge +from typing import Dict +from dataclasses import dataclass, field +import os +import psutil +import os + + +@dataclass +class PerformanceMetrics: + _instance: 'PerformanceMetrics' = field(default=None, init=False, repr=False) + + # Prometheus Metrics + query_counter: Counter = field(init=False) + query_latency: Histogram = field(init=False) + cache_hits: Counter = field(init=False) + model_calls: Dict[str, Counter] = field(default_factory=dict, init=False) + memory_usage: Gauge = field(init=False) + request_duration: Histogram = field(init=False) + + def __post_init__(self): + if PerformanceMetrics._instance is not None: + raise Exception("This class is a singleton!") + else: + # Initialize Prometheus Metrics + self.query_counter = Counter('rag_queries_total', 'Total number of queries processed') + self.query_latency = Histogram('rag_query_duration_seconds', 'Query processing duration') + self.cache_hits = Counter('rag_cache_hits_total', 'Number of cache hits') + self.memory_usage = Gauge('rag_memory_usage_bytes', 'Memory usage in bytes') + self.request_duration = Histogram( + 'rag_request_duration_seconds', + 'Request duration in seconds', + buckets=(0.1, 0.5, 1.0, 2.0, 5.0) + ) + PerformanceMetrics._instance = self + + @staticmethod + def 
get_instance(): + if PerformanceMetrics._instance is None: + PerformanceMetrics() + return PerformanceMetrics._instance + + # Methods to record metrics + def record_query(self, duration: float): + self.query_counter.inc() + self.query_latency.observe(duration) + + def record_cache_hit(self): + self.cache_hits.inc() + + def record_model_call(self, model_name: str): + sanitized_name = model_name.replace('.', '_').replace(':', '_').replace('-', '_') + metric_name = f'rag_model_calls_{sanitized_name}' + + if model_name not in self.model_calls: + self.model_calls[model_name] = Counter( + metric_name, + f'Number of calls to model {model_name}' + ) + self.model_calls[model_name].inc() + + def record_memory_usage(self): + process = psutil.Process(os.getpid()) + self.memory_usage.set(process.memory_info().rss) + + def record_request_time(self, duration: float): + self.request_duration.observe(duration) diff --git a/OvStudent/query_cache.py b/OvStudent/query_cache.py new file mode 100644 index 00000000..d0967e3e --- /dev/null +++ b/OvStudent/query_cache.py @@ -0,0 +1,9 @@ +# query_cache.py + +from collections import OrderedDict +from ttl_cache import TTLCache +import os + +class QueryCache(TTLCache): + def __init__(self, maxsize=1000, ttl=3600): + super().__init__(maxsize=maxsize, ttl=ttl) diff --git a/OvStudent/query_manager.py b/OvStudent/query_manager.py new file mode 100644 index 00000000..27264703 --- /dev/null +++ b/OvStudent/query_manager.py @@ -0,0 +1,11 @@ +# query_manager.py +import os + +class QueryManager: + @staticmethod + def validate_query(query): + if not query or len(query.strip()) < 3: + return False, "Query must be at least 3 characters long" + if len(query) > 1000: + return False, "Query must be less than 1000 characters" + return True, "" diff --git a/OvStudent/rag_logger.py b/OvStudent/rag_logger.py new file mode 100644 index 00000000..382d0e21 --- /dev/null +++ b/OvStudent/rag_logger.py @@ -0,0 +1,31 @@ +# rag_logger.py + +import logging +import sys 
+from logging.handlers import RotatingFileHandler +import os + +class RAGLogger: + def __init__(self, name: str, log_file: str = 'rag_system.log'): + self.logger = logging.getLogger(name) + self.logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + + # Rotating file handler + handler = RotatingFileHandler(log_file, maxBytes=10485760, backupCount=5) # 10 MB + handler.setFormatter(formatter) + self.logger.addHandler(handler) + + # Stream handler for console output + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setFormatter(formatter) + self.logger.addHandler(stream_handler) + + def info(self, message: str): + self.logger.info(message) + + def error(self, message: str): + self.logger.error(message) + + def warning(self, message: str): + self.logger.warning(message) diff --git a/OvStudent/rag_system.log b/OvStudent/rag_system.log new file mode 100644 index 00000000..3ecaed44 --- /dev/null +++ b/OvStudent/rag_system.log @@ -0,0 +1,102 @@ +2024-12-13 08:07:11,111 - INFO - Successfully loaded KBI data from /Users/kq_m3m/PycharmProjects/OVMaster/ovrawmjson/KBI.json +2024-12-13 08:07:11,112 - ERROR - Unexpected error in create_kbi_vectorstore: Could not import gpt4all library. 
Please install the gpt4all library to use this embedding model: pip install gpt4all +2024-12-13 08:08:02,732 - INFO - Successfully loaded KBI data from /Users/kq_m3m/PycharmProjects/OVMaster/ovrawmjson/KBI.json +2024-12-13 08:08:08,739 - INFO - Created vector store with 43 chunks +2024-12-13 08:18:51,414 - INFO - Successfully loaded KBI data from /Users/kq_m3m/PycharmProjects/OVMaster/ovrawmjson/KBI.json +2024-12-13 08:18:53,679 - INFO - Created vector store with 43 chunks +2024-12-13 08:19:16,427 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:19:16,427 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:19:16,428 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 08:19:19,364 - INFO - Found relevant files: ['t_visualize_colorsystem.json', 't_anno_trans.json'] +2024-12-13 08:19:19,367 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:19:19,367 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:19:20,865 - INFO - Using model qwen2.5-coder:7b for query processing +2024-12-13 08:19:20,865 - INFO - Generating answer... 
+2024-12-13 08:20:17,143 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:20:17,143 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:20:17,143 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 08:20:17,557 - INFO - Found relevant files: ['t_preprocess_cpu.json', 't_single_batch.json'] +2024-12-13 08:20:17,560 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:20:17,560 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:20:18,978 - INFO - Using model qwen2.5-coder:7b for query processing +2024-12-13 08:20:18,979 - INFO - Generating answer... +2024-12-13 08:21:30,294 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:21:30,294 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:21:30,294 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 08:21:30,566 - INFO - Found relevant files: ['t_preprocess.json'] +2024-12-13 08:21:30,568 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:21:30,568 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:21:32,685 - INFO - Using model qwen2.5-coder:7b for query processing +2024-12-13 08:21:32,685 - INFO - Generating answer... 
+2024-12-13 08:21:48,099 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:21:48,099 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:21:48,099 - INFO - Cache hit for query: how to use mRNA analysis step by step +2024-12-13 08:21:48,103 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:21:48,103 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:21:49,556 - INFO - Using model qwen2.5-coder:7b for query processing +2024-12-13 08:21:49,556 - INFO - Generating answer... +2024-12-13 08:22:06,733 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:06,733 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:06,733 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 08:22:07,104 - INFO - Found relevant files: ['t_preprocess_cpu.json', 't_scdeg.json'] +2024-12-13 08:22:07,106 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:07,106 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:08,522 - INFO - Using model qwen2.5-coder:7b for query processing +2024-12-13 08:22:08,523 - INFO - Generating answer... 
+2024-12-13 08:22:30,574 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:30,574 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:30,575 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 08:22:30,949 - INFO - Found relevant files: ['t_preprocess_cpu.json', 't_preprocess.csv', 't_preprocess.json'] +2024-12-13 08:22:30,952 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:30,952 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:30,952 - ERROR - Error processing query: File t_preprocess.csv not found +2024-12-13 08:22:41,967 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:41,967 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:41,968 - INFO - Cache hit for query: mRNA analysis step by step using omicverse +2024-12-13 08:22:41,972 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:41,972 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:41,973 - ERROR - Error processing query: File t_preprocess.csv not found +2024-12-13 08:22:50,765 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:50,765 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:50,766 - INFO - Cache hit for query: mRNA analysis step 
by step using omicverse +2024-12-13 08:22:50,771 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:22:50,771 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:22:50,772 - ERROR - Error processing query: File t_preprocess.csv not found +2024-12-13 08:23:10,819 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:23:10,819 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:23:10,820 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 08:23:11,095 - INFO - Found relevant files: ['t_preprocess.json', 't_preprocess_cpu.json'] +2024-12-13 08:23:11,097 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 08:23:11,097 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 08:23:12,697 - INFO - Using model qwen2.5-coder:7b for query processing +2024-12-13 08:23:12,697 - INFO - Generating answer... +2024-12-13 23:51:37,735 - ERROR - Unexpected error in create_kbi_vectorstore: expected str, bytes or os.PathLike object, not NoneType +2024-12-13 23:59:01,671 - INFO - Created vector store for Converted_Jsons with 464 chunks. 
+2024-12-13 23:59:23,964 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-13 23:59:23,964 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-13 23:59:23,964 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-13 23:59:23,968 - ERROR - Error in find_relevant_file: 1 validation error for StuffDocumentsChain + Value error, document_variable_name context was not found in llm_chain input_variables: ['question'] [type=value_error, input_value={'llm_chain': LLMChain(ve...None, 'callbacks': None}, input_type=dict] + For further information visit https://errors.pydantic.dev/2.10/v/value_error +2024-12-14 00:03:54,470 - INFO - Created vector store for Converted_Jsons with 464 chunks. +2024-12-14 00:04:11,030 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-14 00:04:11,030 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-14 00:04:11,030 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-14 00:04:11,033 - ERROR - Error in find_relevant_file: Missing some input keys: {'query'} +2024-12-14 00:06:57,291 - INFO - Created vector store for Converted_Jsons with 464 chunks. 
+2024-12-14 00:07:22,908 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-14 00:07:22,908 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-14 00:07:22,908 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-14 00:07:22,934 - ERROR - Error in find_relevant_file: Missing some input keys: {'query'} +2024-12-14 00:11:37,283 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-14 00:11:37,284 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-14 00:11:37,284 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-14 00:11:37,307 - ERROR - Error in find_relevant_file: Missing some input keys: {'query'} +2024-12-14 00:13:52,468 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-14 00:13:52,468 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-14 00:13:52,469 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-14 00:13:52,491 - ERROR - Error in find_relevant_file: Missing some input keys: {'query'} +2024-12-14 00:17:54,769 - INFO - Available models: ['qwen2.5-coder:7b', 'llama2:latest', 'llama3.2:3b', 'bge-large:latest', 'qwen2.5-coder:3b', 'llama2:7b', 'llama3.2:latest', 'llama3.1:latest'] +2024-12-14 00:17:54,769 - INFO - Required models: ['qwen2.5-coder:3b', 'qwen2.5-coder:7b'] +2024-12-14 00:17:54,769 - INFO - Using model qwen2.5-coder:3b for file selection +2024-12-14 00:17:54,791 - ERROR - Error in find_relevant_file: Missing some input keys: {'query'} diff --git a/OvStudent/rate_limiter.py b/OvStudent/rate_limiter.py new file mode 100644 index 00000000..ba7bf61b --- 
/dev/null +++ b/OvStudent/rate_limiter.py @@ -0,0 +1,24 @@ +# rate_limiter.py + +import time +import os + +class RateLimiter: + def __init__(self, limit_seconds): + self.limit_seconds = limit_seconds + self.last_request_time = None + + def can_make_request(self): + if not self.last_request_time: + return True + time_since_last = time.time() - self.last_request_time + return time_since_last >= self.limit_seconds + + def time_until_next_request(self): + if not self.last_request_time: + return 0 + time_since_last = time.time() - self.last_request_time + return max(0, self.limit_seconds - time_since_last) + + def record_request(self): + self.last_request_time = time.time() diff --git a/OvStudent/requirements.txt b/OvStudent/requirements.txt new file mode 100644 index 00000000..8665246d --- /dev/null +++ b/OvStudent/requirements.txt @@ -0,0 +1,11 @@ +streamlit~=1.41.0 +requests~=2.32.3 +psutil~=6.1.0 +prometheus-client~=0.21.1 +tenacity~=9.0.0 +chromadb~=0.5.23 +langchain-community~=0.3.11 +langchain-core~=0.3.24 +langchain~=0.3.11 +langchain-google-genai~=2.0.7 +sentence-transformers \ No newline at end of file diff --git a/OvStudent/system_monitor.py b/OvStudent/system_monitor.py new file mode 100644 index 00000000..24b681fa --- /dev/null +++ b/OvStudent/system_monitor.py @@ -0,0 +1,26 @@ +# system_monitor.py + +import psutil +import time +from datetime import timedelta +import os + +class SystemMonitor: + @staticmethod + def get_system_stats(): + process = psutil.Process(os.getpid()) + memory = psutil.virtual_memory() + return { + 'memory_usage': process.memory_info().rss / 1024 / 1024, # MB + 'cpu_percent': psutil.cpu_percent(interval=1), + 'uptime': time.time() - psutil.boot_time(), + 'system_memory': { + 'total': memory.total / (1024 ** 3), # GB + 'available': memory.available / (1024 ** 3), # GB + 'percent': memory.percent + } + } + + @staticmethod + def format_uptime(seconds): + return str(timedelta(seconds=int(seconds))) diff --git a/OvStudent/ttl_cache.py 
b/OvStudent/ttl_cache.py new file mode 100644 index 00000000..316262b0 --- /dev/null +++ b/OvStudent/ttl_cache.py @@ -0,0 +1,23 @@ +# ttl_cache.py + +import time +from collections import OrderedDict +import os + +class TTLCache(OrderedDict): + def __init__(self, maxsize=1000, ttl=3600): + super().__init__() + self.maxsize = maxsize + self.ttl = ttl + + def __getitem__(self, key): + value, timestamp = super().__getitem__(key) + if time.time() - timestamp > self.ttl: + del self[key] + raise KeyError(key) + return value + + def __setitem__(self, key, value): + super().__setitem__(key, (value, time.time())) + if len(self) > self.maxsize: + self.popitem(last=False) \ No newline at end of file From b0979990315361143bfecc398d435cd850c1a4fd Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:30:51 +0800 Subject: [PATCH 02/40] KBIndexing&KBICLearn --- .../t_cluster_space_annotated.json | 4 + .../t_commot_flowsig_annotated.json | 4 + .../t_mapping_annotated.json | 4 + .../Tutorials-space_py/t_slat_annotated.json | 4 + .../t_spaceflow_annotated.json | 4 + .../t_stagate_annotated.json | 4 + .../t_staligner_annotated.json | 4 + .../t_starfysh_annotated.json | 4 + .../Tutorials-space_py/t_stt_annotated.json | 4 + .../t_anno_trans_annotated.json | 4 + .../Converted_Jsons/t_aucell_annotated.json | 4 + .../t_bulk2single_annotated.json | 4 + .../t_bulk_combat_annotated.json | 4 + .../t_bulktrajblend_annotated.json | 4 + .../Converted_Jsons/t_cellanno_annotated.json | 4 + .../Converted_Jsons/t_cellfate_annotated.json | 4 + .../t_cellfate_gene_annotated.json | 4 + .../t_cellfate_genesets_annotated.json | 4 + .../t_cellphonedb_annotated.json | 4 + .../Converted_Jsons/t_cluster_annotated.json | 4 + .../Converted_Jsons/t_cnmf_annotated.json | 4 + .../t_cytotrace_annotated.json | 4 + .../Converted_Jsons/t_deg_annotated.json | 4 + .../Converted_Jsons/t_deseq2_annotated.json | 4 + 
.../Converted_Jsons/t_gptanno_annotated.json | 4 + .../t_metacells_annotated.json | 4 + .../Converted_Jsons/t_metatime_annotated.json | 4 + .../Converted_Jsons/t_mofa_annotated.json | 4 + .../t_mofa_glue_annotated.json | 4 + .../Converted_Jsons/t_network_annotated.json | 4 + .../Converted_Jsons/t_nocd_annotated.json | 4 + .../t_preprocess_annotated.json | 4 + .../t_preprocess_cpu_annotated.json | 4 + .../t_preprocess_gpu_annotated.json | 4 + .../Converted_Jsons/t_scdeg_annotated.json | 4 + .../Converted_Jsons/t_scdrug_annotated.json | 4 + .../Converted_Jsons/t_scmulan_annotated.json | 4 + .../Converted_Jsons/t_simba_annotated.json | 4 + .../t_single2spatial_annotated.json | 4 + .../t_single_batch_annotated.json | 4 + .../Converted_Jsons/t_tcga_annotated.json | 4 + .../Converted_Jsons/t_tosica_annotated.json | 4 + .../Converted_Jsons/t_traj_annotated.json | 4 + .../Converted_Jsons/t_via_annotated.json | 4 + .../Converted_Jsons/t_via_velo_annotated.json | 4 + .../t_visualize_bulk_annotated.json | 4 + .../t_visualize_colorsystem_annotated.json | 4 + .../t_visualize_single_annotated.json | 4 + .../Converted_Jsons/t_wgcna_annotated.json | 4 + .../t_anno_trans_annotated.py | 25 ++ .../t_aucell_annotated.py | 42 ++ .../t_bulk2single_annotated.py | 52 +++ .../t_bulk_combat_annotated.py | 57 +++ .../t_bulktrajblend_annotated.py | 105 +++++ .../t_cellanno_annotated.py | 96 +++++ .../t_cellfate_annotated.py | 59 +++ .../t_cellfate_gene_annotated.py | 165 +++++++ .../t_cellfate_genesets_annotated.py | 43 ++ .../t_cellphonedb_annotated.py | 177 ++++++++ .../t_cluster_annotated.py | 117 +++++ .../t_cluster_space_annotated.py | 83 ++++ .../t_cnmf_annotated.py | 187 ++++++++ .../t_commot_flowsig_annotated.py | 62 +++ .../t_cytotrace_annotated.py | 25 ++ .../t_deg_annotated.py | 129 ++++++ .../t_deseq2_annotated.py | 32 ++ .../t_gptanno_annotated.py | 176 ++++++++ .../t_mapping_annotated.py | 36 ++ .../t_metacells_annotated.py | 88 ++++ .../t_metatime_annotated.py | 25 ++ 
.../t_mofa_annotated.py | 41 ++ .../t_mofa_glue_annotated.py | 50 +++ .../t_network_annotated.py | 15 + .../t_nocd_annotated.py | 29 ++ .../t_preprocess_annotated.py | 132 ++++++ .../t_preprocess_cpu_annotated.py | 136 ++++++ .../t_preprocess_gpu_annotated.py | 122 ++++++ .../t_scdeg_annotated.py | 92 ++++ .../t_scdrug_annotated.py | 79 ++++ .../t_scmulan_annotated.py | 39 ++ .../t_simba_annotated.py | 21 + .../t_single2spatial_annotated.py | 56 +++ .../t_single_batch_annotated.py | 60 +++ .../t_slat_annotated.py | 147 +++++++ .../t_spaceflow_annotated.py | 31 ++ .../t_stagate_annotated.py | 66 +++ .../t_staligner_annotated.py | 56 +++ .../t_starfysh_annotated.py | 132 ++++++ .../t_stt_annotated.py | 70 +++ .../t_tcga_annotated.py | 23 + .../t_tosica_annotated.py | 77 ++++ .../t_traj_annotated.py | 64 +++ .../t_via_annotated.py | 49 +++ .../t_via_velo_annotated.py | 28 ++ .../t_visualize_bulk_annotated.py | 50 +++ .../t_visualize_colorsystem_annotated.py | 84 ++++ .../t_visualize_single_annotated.py | 406 ++++++++++++++++++ .../t_wgcna_annotated.py | 59 +++ 98 files changed, 4191 insertions(+) create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_cluster_space_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_commot_flowsig_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_mapping_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_slat_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_spaceflow_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_stagate_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_staligner_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_starfysh_annotated.json create mode 100644 OvStudent/Converted_Jsons/Tutorials-space_py/t_stt_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_anno_trans_annotated.json 
create mode 100644 OvStudent/Converted_Jsons/t_aucell_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_bulk2single_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_bulk_combat_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_bulktrajblend_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cellanno_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cellfate_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cellfate_gene_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cellfate_genesets_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cellphonedb_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cluster_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cnmf_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_cytotrace_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_deg_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_deseq2_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_gptanno_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_metacells_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_metatime_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_mofa_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_mofa_glue_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_network_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_nocd_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_preprocess_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_preprocess_cpu_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_preprocess_gpu_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_scdeg_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_scdrug_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_scmulan_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_simba_annotated.json 
create mode 100644 OvStudent/Converted_Jsons/t_single2spatial_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_single_batch_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_tcga_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_tosica_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_traj_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_via_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_via_velo_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_visualize_bulk_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_visualize_colorsystem_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_visualize_single_annotated.json create mode 100644 OvStudent/Converted_Jsons/t_wgcna_annotated.json create mode 100644 OvStudent/Converted_Scripts_Annotated/t_anno_trans_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cellfate_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cellfate_genesets_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cellphonedb_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cluster_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_commot_flowsig_annotated.py create mode 100644 
OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_metacells_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_network_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_preprocess_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_preprocess_cpu_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_scdeg_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_scdrug_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_single_batch_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_slat_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py create mode 100644 
OvStudent/Converted_Scripts_Annotated/t_starfysh_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_tcga_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_tosica_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_traj_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_via_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_via_velo_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_visualize_bulk_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_visualize_colorsystem_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_visualize_single_annotated.py create mode 100644 OvStudent/Converted_Scripts_Annotated/t_wgcna_annotated.py diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_cluster_space_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_cluster_space_annotated.json new file mode 100644 index 00000000..e7e54bac --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_cluster_space_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a spatial transcriptomics analysis using the `omicverse` and `scanpy` libraries. It takes Visium spatial data, performs dimensionality reduction and clustering using several methods, and evaluates the results against known ground truth annotations. Here's a breakdown of its functionality and structure:\n\n**1. 
Setup and Data Loading:**\n\n* **Imports:** Imports necessary libraries: `omicverse` for spatial analysis, `scanpy` for handling spatial data, `pandas` for data manipulation, `os` for file path handling, `matplotlib` for plotting and `sklearn` for clustering metrics.\n* **Set Plotting Parameters:** Uses `ov.plot_set()` to configure plotting defaults from `omicverse`.\n* **Read Visium Data:** Reads spatial transcriptomics data from an H5 file using `sc.read_visium()`.\n* **Make Variable Names Unique:** Ensures gene names are unique within the AnnData object.\n* **Quality Control:** Calculates QC metrics (e.g., number of reads per cell) using `sc.pp.calculate_qc_metrics()`.\n* **Gene Filtering:** Filters out genes with low total counts (<= 100).\n* **Spatial Variable Gene Selection:** Selects spatially variable genes using `ov.space.svg()`, which is a key first step to reduce noise and focus on the important spatial variation of the data.\n* **Save and Reload AnnData:** Writes the processed AnnData object to an H5AD file and then reloads it. This step might be for saving progress or for clean data reload at next run of the script.\n* **Load Ground Truth Annotations:** Reads ground truth labels from a tab-separated file using `pd.read_csv()`. The ground truth labels are assumed to be available for each spot in the analysis.\n* **Add Ground Truth to AnnData:** Adds the ground truth labels as an observation (`.obs`) column in the `AnnData` object, matching labels with the spot names.\n* **Spatial Plot of Ground Truth:** Displays the spatial layout of the spots colored by the ground truth annotation.\n\n**2. 
GraphST Clustering:**\n\n* **Parameter Setup:** Creates a dictionary `methods_kwargs` to store parameters for different methods and sets up parameters for the `GraphST` method, which is one of the spatial dimensionality reduction methods implemented in `omicverse`.\n* **GraphST Dimensionality Reduction and Clustering:** Performs dimensionality reduction with GraphST via `ov.space.clusters()`.\n* **Mclust Clustering on GraphST Representation:** Performs Gaussian mixture model clustering (`mclust`) on the reduced representation from GraphST.\n* **Label Refinement:** Refines cluster labels by smoothing them based on spatial proximity using `ov.utils.refine_label()`.\n* **Categorical Conversion:** Converts the refined cluster labels to a categorical data type.\n* **Cluster Merging:** Merges clusters based on a tree structure that is constructed from the similarities of the initial clusters. The cluster-merging method implemented by `ov.space.merge_cluster()` helps to refine and interpret clustering results.\n* **Spatial Plot of Clusters:** Displays spatial layouts of mclust clusters and merged mclust clusters.\n* **Mclust_R Clustering on GraphST Representation:** The same clustering pipeline used with `mclust` (above) is repeated here with `mclust_R`, which calls the `mclust` R package for clustering.\n\n**3. BINARY Clustering:**\n\n* **Parameter Setup:** Updates `methods_kwargs` with parameters for the `BINARY` method. 
BINARY is another spatial dimensionality reduction method available in `omicverse`.\n* **BINARY Dimensionality Reduction and Clustering:** Performs dimensionality reduction using BINARY `ov.space.clusters()`.\n* **Mclust_R Clustering on BINARY Representation:** Performs mclust using the `mclust` R package on the representation from the BINARY method.\n* **Label Refinement & Categorical Conversion:** Refines and converts mclust labels.\n* **Cluster Merging:** Merges clusters.\n* **Spatial Plot of Clusters:** Displays spatial plots of the clusters.\n* **Mclust Clustering on BINARY Representation:** Performs mclust using the Python implementation on the representation from the BINARY method.\n* **Label Refinement & Categorical Conversion:** Refines and converts mclust labels.\n* **Cluster Merging:** Merges clusters.\n* **Spatial Plot of Clusters:** Displays spatial plots of the clusters.\n\n**4. STAGATE Clustering:**\n\n* **Parameter Setup:** Updates `methods_kwargs` with parameters for the `STAGATE` method. `STAGATE` is another spatial dimensionality reduction method.\n* **STAGATE Dimensionality Reduction and Clustering:** Performs dimensionality reduction using STAGATE `ov.space.clusters()`.\n* **Mclust_R Clustering on STAGATE Representation:** Performs mclust clustering using the R package on the STAGATE representation.\n* **Label Refinement & Categorical Conversion:** Refines and converts mclust labels.\n* **Cluster Merging:** Merges clusters.\n* **Spatial Plot of Clusters:** Displays spatial plots of the clusters.\n* **Gene Visualization:** Visualizes the expression of the gene with highest PI value by `omicverse` and also the user specified `MBP` gene in raw and STAGATE transformed spaces.\n\n**5. CAST Clustering:**\n\n* **Parameter Setup:** Updates `methods_kwargs` with parameters for the `CAST` method. 
`CAST` is another spatial dimensionality reduction method.\n* **CAST Dimensionality Reduction and Clustering:** Performs dimensionality reduction using CAST `ov.space.clusters()`.\n* **Mclust Clustering on CAST Representation:** Performs mclust using the Python implementation on the CAST representation.\n* **Label Refinement & Categorical Conversion:** Refines and converts mclust labels.\n* **Cluster Merging:** Merges clusters.\n* **Spatial Plot of Clusters:** Displays spatial plots of the clusters.\n\n**6. Evaluation:**\n\n* **Calculate Adjusted Rand Index (ARI):** Calculates and prints the Adjusted Rand Index (ARI) to compare each clustering method's result to the ground truth annotation. This metric measures the agreement between the cluster assignments and the ground truth, adjusted for chance, so it remains comparable across different numbers and sizes of clusters.\n* **Print ARI Results:** Prints the ARI results for each clustering method.\n\n**Key functionalities:**\n\n* **Spatial Analysis:** Utilizes `omicverse` to perform spatial-aware dimensionality reduction and clustering.\n* **Dimensionality Reduction:** Leverages methods like GraphST, BINARY, STAGATE and CAST to reduce high-dimensional gene expression data into a lower-dimensional representation while retaining important spatial information.\n* **Clustering:** Employs Gaussian mixture models via `mclust` and `mclust_R` for clustering.\n* **Cluster Refinement:** Smooths cluster assignments based on spatial proximity.\n* **Cluster Merging:** Refines cluster granularity based on hierarchical relationships.\n* **Visualization:** Uses `scanpy` for spatial plotting and `matplotlib` for combining plot figures.\n* **Evaluation:** Computes Adjusted Rand Index to evaluate the quality of clustering results with respect to known ground truth annotations.\n\n**In Summary:**\n\nThis script provides a comprehensive workflow for spatial transcriptomics analysis. 
It performs data loading, quality control, dimensionality reduction, multiple clustering approaches using both Python and R packages, and finally calculates and compares the cluster results with the ground truth labels using ARI metric. It uses `omicverse` for spatial analysis methods and `scanpy` for spatial data handling and visualization. The script is well-organized and modular, employing loops for repeated tasks and using dictionaries to manage method parameters.", + "file": "t_cluster_space_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_commot_flowsig_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_commot_flowsig_annotated.json new file mode 100644 index 00000000..a6eff0c2 --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_commot_flowsig_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive spatial transcriptomics analysis using the `omicverse` library along with `scanpy`, `pandas`, `sklearn`, and `matplotlib`. It integrates spatial data, gene expression, and intercellular communication analysis to identify key cell-cell interactions and biological flows. Here's a breakdown of the functionality and structure:\n\n**Overall Workflow:**\n\n1. **Data Loading and Preprocessing:**\n * Loads Visium spatial data.\n * Performs basic quality control (QC) filtering on genes.\n * Selects spatially variable genes (SVGs).\n2. **Intercellular Communication Analysis:**\n * Loads a ligand-receptor database (CellChat).\n * Filters the database based on genes present in the dataset.\n * Performs spatial communication analysis using CellChat.\n * Analyzes communication direction for a specified pathway (e.g., FGF).\n * Visualizes cell communication patterns.\n3. **Integration of Annotation Data:**\n * Loads a ground truth annotation file.\n * Adds annotation data to the AnnData object.\n * Visualizes spatial data with annotations.\n4. 
**Biological Flow Analysis:**\n * Constructs gene expression modules (GEMs) using non-negative matrix factorization (NMF).\n * Extracts top genes for a specific GEM.\n * Constructs cellular flows from communication data.\n * Determines informative variables in the flow data.\n * Performs KMeans clustering on spatial coordinates.\n * Learns intercellular flows, validates them, and filters low-confidence edges.\n * Constructs the intercellular flow network.\n5. **Visualization and Output:**\n * Visualizes the GEM expression by cell type.\n * Visualizes the intercellular flow network.\n * Saves intermediate and final results to files.\n\n**Line-by-Line Explanation:**\n\n* **Lines 1-3:** Imports the required libraries: `omicverse` (as `ov`) and `scanpy` (as `sc`).\n* **Line 5:** Sets the plotting style using `omicverse`.\n* **Line 7:** Loads Visium spatial data into an `AnnData` object named `adata` using `scanpy.read_visium`.\n* **Line 8:** Ensures variable names (gene names) are unique in the `adata` object using `adata.var_names_make_unique()`.\n* **Line 10:** Calculates QC metrics for each cell in place in the `adata` object using `scanpy.pp.calculate_qc_metrics`.\n* **Line 11:** Filters the genes (variables) in `adata`, retaining only those with total counts greater than 100.\n* **Line 12:** Performs spatial variable gene selection using `omicverse.space.svg`, saving the result to `adata`.\n* **Line 13:** Displays the `adata` object.\n* **Line 15:** Writes the `adata` object to a compressed `.h5ad` file.\n* **Lines 19-21:** Loads a human ligand-receptor database using `omicverse.externel.commot.pp.ligand_receptor_database` and prints its shape.\n* **Lines 23-26:** Filters the ligand-receptor database to keep only interactions involving genes present in `adata` and prints the shape of the filtered dataframe.\n* **Lines 28-35:** Performs spatial communication analysis using `omicverse.externel.commot.tl.spatial_communication`, storing the result in `adata`.\n* 
**Lines 36-37:** Imports the `pandas` library (as `pd`) and the `os` library.\n* **Line 38:** Loads a ground truth annotation file into a pandas `DataFrame`, setting the index and column names.\n* **Line 39:** Assigns a column name 'Ground_Truth' to the annotation DataFrame.\n* **Line 40:** Adds the ground truth annotations to the `adata.obs` DataFrame, matching on cell IDs.\n* **Line 41:** Defines a list of colors to be used for plotting.\n* **Line 43:** Generates a spatial plot of the data colored by ground truth annotations using `scanpy.pl.spatial`.\n* **Line 45:** Creates a dictionary mapping ground truth categories to their corresponding colors.\n* **Line 47:** Prints the head of the ligand-receptor information stored within the `adata` object.\n* **Line 49:** Imports the `matplotlib.pyplot` library as `plt`.\n* **Lines 50-52:** Sets parameters for the spatial communication analysis (scale, neighborhood size, target pathway).\n* **Line 53:** Performs communication direction analysis for the specified pathway.\n* **Lines 54-62:** Visualizes cell communication for the specified pathway.\n* **Line 63:** Sets the title of the communication visualization plot.\n* **Line 67:** Writes the updated `adata` object to a compressed `.h5ad` file.\n* **Line 69:** Reads the h5ad file back into the `adata` object.\n* **Line 70:** Displays the `adata` object.\n* **Line 72:** Creates a new layer named 'normalized' in the AnnData object by copying the data from `adata.X`.\n* **Lines 74-79:** Constructs gene expression modules using NMF.\n* **Line 80:** Sets the target gene expression module for further analysis.\n* **Lines 81-87:** Extracts the top genes from the selected GEM module using `omicverse.externel.flowsig.ul.get_top_gem_genes`.\n* **Line 88:** Displays the top genes for the selected GEM module.\n* **Line 90:** Defines a commot output key, which is the commot-cellchat output.\n* **Lines 91-98:** Constructs cellular flows from commot output.\n* **Lines 99-108:** 
Determines informative variables in the flow data.\n* **Line 109:** Imports the `KMeans` class from sklearn.\n* **Line 111:** Performs KMeans clustering on spatial coordinates.\n* **Line 112:** Adds the spatial KMeans clustering labels to the adata.obs.\n* **Lines 115-121:** Learns intercellular flows using `ov.externel.flowsig.tl.learn_intercellular_flows`.\n* **Lines 123-128:** Applies biological flow validation using `ov.externel.flowsig.tl.apply_biological_flow`.\n* **Line 129:** Sets the threshold to filter low-confidence edges in the network.\n* **Lines 131-136:** Filters low-confidence edges using `ov.externel.flowsig.tl.filter_low_confidence_edges`.\n* **Line 137:** Writes the `adata` object to a compressed h5ad file.\n* **Line 141:** Constructs the intercellular flow network from the adata object.\n* **Line 144:** Sets the flowsig expression key.\n* **Line 145:** Retrieves the expression data associated with the flow key.\n* **Line 146:** Creates a new AnnData object from the expression data.\n* **Line 147:** Assigns the observations from adata to adata\\_subset.\n* **Line 148:** Rename variable names using a GEM naming convention.\n* **Line 151:** Imports the matplotlib plotting library.\n* **Line 152-153:** Creates a dotplot of GEM expression by ground truth, with specified parameters.\n* **Line 154:** Creates a dictionary mapping ground truth categories to colors for the dotplot.\n* **Line 156:** Plots the flowsig network.\n\n**Key Libraries:**\n\n* **`omicverse`:** A library for multi-omics data analysis, including spatial omics, with functionality for spatial gene selection, intercellular communication analysis, and biological flow analysis.\n* **`scanpy`:** A popular library for single-cell RNA-seq analysis, used here for loading, preprocessing, and visualizing spatial transcriptomics data.\n* **`pandas`:** Used for data manipulation, primarily loading the ground truth annotation file.\n* **`sklearn`:** Used for KMeans clustering.\n* 
**`matplotlib`:** Used for general purpose plotting.\n\n**In summary,** this script performs a detailed analysis of spatial transcriptomics data, combining gene expression, spatial information, intercellular communication, and biological flow to identify meaningful patterns and relationships within the tissue. It utilizes several libraries for this purpose, demonstrating a common approach to these kinds of analyses.", + "file": "t_commot_flowsig_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_mapping_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_mapping_annotated.json new file mode 100644 index 00000000..c99e11b8 --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_mapping_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a spatial transcriptomics analysis, mapping single-cell RNA-seq data onto spatial transcriptomics data using the Tangram method. Here's a breakdown of its functionality and structure:\n\n**Overall Workflow:**\n\n1. **Data Loading and Preprocessing:**\n - Loads single-cell RNA-seq data and preprocesses it using omicverse functionalities (shifting, log-transform, feature selection, normalization).\n - Loads spatial transcriptomics data from scanpy's built-in dataset.\n - Calculates QC metrics and performs basic filtering for spatial data.\n - Identifies spatially variable genes in the spatial data.\n\n2. **Tangram Spatial Mapping:**\n - Initializes the Tangram object using preprocessed single-cell and spatial data, specifying the `Subset` annotation from the single-cell data as the basis for mapping.\n - Trains the Tangram model.\n - Performs cell-to-location mapping using the trained model.\n\n3. 
**Visualization:**\n - Visualizes the mapped cell types in the spatial domain using both scanpy and omicverse functionalities.\n\n**Line-by-Line Explanation:**\n\n* **Lines 1-3: Importing Libraries:** Imports necessary libraries: `omicverse` (for single-cell and spatial analysis), and `scanpy` (for spatial data loading and basic processing).\n* **Line 5: Setting Plotting Parameters:** Configures omicverse's plotting settings for consistent visualization.\n* **Line 7: Loading Single-Cell Data:** Loads single-cell data stored in an `h5ad` file using `omicverse`.\n* **Line 8: Importing Matplotlib:** Imports the plotting library `matplotlib.pyplot`.\n* **Line 9: Creating Matplotlib Figure and Axes:** Creates a matplotlib figure object for plotting single-cell embeddings.\n* **Line 10-18: Generating Embedding Plot:** Generates a UMAP plot of the single-cell data, coloring points based on the 'Subset' annotation.\n* **Line 20: Printing Maximum Raw Expression Value:** Prints the maximum expression value before normalization, using the `.X` attribute for raw expression values.\n* **Line 21: Single-Cell Preprocessing:** Normalizes the single-cell data using `omicverse` preprocessing (`shiftlog|pearson` mode), selecting highly variable genes (HVGs), and targeting a sum of 10,000 counts per cell.\n* **Line 22: Saving Raw Data:** Saves preprocessed data in the `.raw` attribute to be used later, often during differential expression analysis.\n* **Line 23: Selecting Highly Variable Genes:** Subsets the AnnData to only include the identified highly variable genes.\n* **Line 24: Printing Maximum Normalized Expression Value:** Prints the maximum expression value after normalization.\n* **Line 26: Loading Spatial Data:** Loads a Visium spatial transcriptomics dataset from `scanpy`.\n* **Line 27: Adding Sample Information:** Adds a \"sample\" column to the observation (`.obs`) attribute containing information about the sample origin.\n* **Line 28: Making Gene Names Unique:** Ensures 
that gene names are unique, an important step before downstream analysis.\n* **Line 30: Calculating QC Metrics:** Calculates QC metrics like total counts using `scanpy`.\n* **Line 31: Filtering Spatial Data:** Filters out spots with low gene counts (total counts less than or equal to 100).\n* **Line 32: Identifying Spatially Variable Genes:** Identifies spatially variable genes (SVGs) using omicverse with the `prost` method and parameters specific to Visium data.\n* **Line 33: Saving Raw Spatial Data:** Saves processed spatial data in `.raw`.\n* **Line 34: Selecting Spatially Variable Genes:** Subsets the AnnData to only include the SVGs.\n* **Line 35: Copying Spatial Data:** Creates a copy of the spatially processed data.\n* **Line 36: Displaying Spatial Data:** Displays the structure of the spatial AnnData object.\n* **Line 38: Initializing Tangram Object:** Creates a `Tangram` object, linking the preprocessed single-cell data and the processed spatial data. The `clusters` parameter specifies that mapping will occur based on cell types defined by single cell `Subset`.\n* **Line 40: Training the Tangram Model:** Trains the Tangram model using the cluster mapping method, for a specified number of epochs on a CUDA device.\n* **Line 42: Performing Cell-to-Location Mapping:** Maps cells from the single-cell data to spatial locations using the trained model.\n* **Line 43: Displaying Mapped Columns:** Displays available columns in the observation data of the mapped object.\n* **Line 45-47: Defining Cell Type List:** Defines a list of cell types to visualize.\n* **Line 48-56: Visualizing Spatial Mapping (Scanpy):** Generates and displays spatial plots showing the distributions of specific cell types using `scanpy.pl.spatial`.\n* **Line 57-59: Creating Color Dictionary:** Creates a dictionary to map single-cell 'Subset' categories to a color for better plotting.\n* **Line 60: Importing Matplotlib (again):** Re-imports Matplotlib (possibly due to potential scope 
issues).\n* **Line 61-63: Selecting Cell Types:** Extracts a subset of cell types to be used in a later plotting function.\n* **Line 64: Matplotlib rc Context:** Uses a context to define specific plotting parameters for the omicverse spatial plot\n* **Line 65-71: Visualizing Spatial Mapping (Omicverse):** Generates and displays a spatial plot showing the distributions of specific cell types using `ov.pl.plot_spatial`.\n\n**In summary, this script performs a comprehensive spatial transcriptomics analysis, loading and processing single-cell and spatial data, mapping cell types from single-cell data to spatial locations using Tangram, and visualizing the mapping results. It demonstrates the use of `omicverse` and `scanpy` for this type of analysis.**", + "file": "t_mapping_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_slat_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_slat_annotated.json new file mode 100644 index 00000000..8b0ae044 --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_slat_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive spatial transcriptomics analysis, focusing on the alignment and comparison of two developmental timepoints of a kidney dataset. Here's a breakdown of its functionality and structure:\n\n**1. 
Setup and Libraries:**\n\n* **Imports:**\n * `omicverse as ov`: Imports the `omicverse` library, likely containing specialized tools for single-cell analysis and spatial data.\n * `os`: Imports the standard `os` module for operating system interactions (though not directly used in this script).\n * `scanpy as sc`: Imports the popular `scanpy` library for single-cell data analysis.\n * `numpy as np`: Imports the `numpy` library for numerical computations.\n * `pandas as pd`: Imports the `pandas` library for data manipulation and analysis.\n * `torch`: Imports the `torch` library for deep learning purposes (used in the scSLAT algorithm).\n * `matplotlib.pyplot as plt`: Imports the `matplotlib` plotting interface used to create plots.\n * Specific modules from `omicverse.externel.scSLAT`: This implies the use of a spatial alignment tool (scSLAT) within the omicverse ecosystem. These modules include functions for:\n * Loading and processing AnnData objects (`load_anndatas`)\n * Calculating spatial networks (`Cal_Spatial_Net`)\n * Running the SLAT alignment algorithm (`run_SLAT`)\n * Performing spatial matching (`spatial_match`)\n * Visualization of spatial match (`match_3D_multi`, `hist`, `Sankey`, `match_3D_celltype`, `Sankey_multi`, `build_3D`)\n * Statistical analysis of matched regions (`region_statistics`)\n* **Plot Style:** Sets the plot style using `ov.plot_set()`, ensuring consistent plotting aesthetics throughout the script.\n\n**2. 
Data Loading and Preparation:**\n\n* **Loading AnnData Objects:** Reads two AnnData objects, `adata1` (`data/E115_Stereo.h5ad`) and `adata2` (`data/E125_Stereo.h5ad`), representing spatial transcriptomics data at two time points, E11.5 and E12.5, respectively.\n* **Adding 'week' Observation:** Adds a new observation ('week') to each AnnData object to label them with their respective timepoints (E11.5 and E12.5).\n* **Initial Spatial Plots:** Generates spatial plots for both datasets, colored by the `annotation` variable, allowing a visual inspection of spatial cell type distribution.\n\n**3. Spatial Alignment and Matching:**\n\n* **Spatial Network Calculation:** Calculates spatial networks for both `adata1` and `adata2` using a k-Nearest Neighbors (KNN) approach with `k_cutoff=20`. This generates a representation of the spatial relationships between cells.\n* **Feature and Edge Extraction:** Extracts features (using DPCA, i.e. dual PCA, which projects both datasets into a shared principal component space) and edges from the AnnData objects; these are used in the next step by SLAT.\n* **Running SLAT:** Runs the scSLAT (Spatially-Linked Alignment Tool) algorithm to generate embeddings (latent space representations) for each timepoint and also returns the elapsed run time of the model.\n* **Spatial Matching:** Performs spatial matching between the latent space embeddings of the two timepoints. The matching results represent the predicted correspondence of cell locations between time points.\n* **Matching Statistics:** Calculates matching distances and prints region statistics (based on distance distribution), which may quantify the quality of the matching.\n\n**4. 3D Visualization and Quality Assessment:**\n\n* **3D Model Construction:** Builds a 3D model using the two AnnData objects and matching information. 
The model visually represents the spatial transformation between the two time points.\n* **3D Visualization:** Draws the constructed 3D model, visualizing the spatial alignment.\n* **Quality Index:** Adds the matching distance as a 'low_quality_index' observation to `adata2`, and visualizes its spatial distribution using a spatial plot. This plot can help identify regions with less reliable matches.\n\n**5. Sankey Plot:**\n\n* **Sankey Diagram:** Creates and displays a Sankey diagram which visually depicts the flow of cells between the two timepoints based on cell type annotations and the calculated matches, helping to understand cell fate decisions. Saves the diagram as an HTML file.\n\n**6. Cell-Type Specific 3D Alignment:**\n\n* **Dataframe Preparation:** Converts the anndata object into a pandas dataframe.\n* **Cell type specific 3D Alignment:** Creates a 3D cell-type specific alignment.\n\n**7. Function to Extract Matched Cells:**\n\n* **`cal_matching_cell` function:** Defines a function that takes two `anndata` objects (target and query), the matching array, and a query celltype to filter cells in target adata based on cells with that celltype in query_adata, effectively pulling out cells from the target corresponding to the query cell type.\n\n**8. 
Lineage Analysis:**\n\n* **Extracting Kidney Cells:** Uses `cal_matching_cell` to extract the cells in adata1 (the target data) that correspond to 'Kidney' cells in adata2 (the query data).\n* **Adding `kidney_anno` to adata1:** A new column called `kidney_anno` is added to the adata1 dataframe and is populated with annotations of the matched cells from the query_adata.\n* **Spatial Plot:** Generates a spatial plot for adata1 colored by `kidney_anno`, showing the predicted lineages from E11.5.\n* **Combining Lineage Cells:** Combines matched cells in `adata1` and kidney cells from `adata2` into a new AnnData object `kidney_lineage_ad` for lineage analysis.\n* **Preprocessing** Preprocess the lineage combined AnnData.\n* **UMAP and Clustering:** Performs dimensionality reduction using PCA, clustering using Leiden algorithm, and generates a UMAP embedding plot for visualization.\n* **Gene Expression Analysis:** Generates a dotplot of specific genes grouped by leiden cluster, aiding in the identification of cell populations and their characteristics.\n* **Renaming Clusters**: Renames clusters based on leiden groups.\n* **UMAP Visualization:** Generates a UMAP of the combined lineage data.\n* **Spatial Plot of Kidney Lineage in adata1**: Visualizes the annotations on the original dataset.\n\n**9. Differential Expression Analysis:**\n\n* **DEG Analysis:** Performs differential expression analysis using a t-test to find genes that are differentially expressed between cells in the E11.5 and E12.5 timepoints within the combined kidney lineage data using pyDEG.\n* **Volcano Plot**: Generates a volcano plot based on the differential expression analysis results.\n* **Dotplot for DEG Genes:** Creates a dotplot of top DEGs (up and down) grouped by the re-anno labels.\n* **Ranked Gene Analysis:** Performs ranked gene analysis, based on re_anno labels and generates a dotplot of the top genes.\n\n**In summary, this script is designed to:**\n\n1. 
**Load and preprocess spatial transcriptomics data** from two time points.\n2. **Align spatial data using scSLAT**.\n3. **Visualize the spatial alignment** in 3D and using Sankey plots.\n4. **Extract cell lineages** based on the alignment.\n5. **Perform lineage specific analyses** using UMAP, leiden clustering, and visualization tools.\n6. **Conduct differential expression analysis** to identify genes differentially expressed across the two timepoints.\n7. **Visualize and interpret** the results with various plots (spatial, UMAP, dotplot, volcano plots).\n\nThis script leverages several libraries to implement a robust analysis pipeline, highlighting the power of combined computational tools for investigating developmental processes using spatial transcriptomics data.", + "file": "t_slat_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_spaceflow_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_spaceflow_annotated.json new file mode 100644 index 00000000..2950a9dc --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_spaceflow_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs spatial transcriptomics analysis using the `omicverse` and `scanpy` libraries. Here's a breakdown of its functionality and structure:\n\n**Overall Functionality:**\n\nThe script aims to:\n\n1. **Load and Preprocess Spatial Transcriptomics Data:** Reads Visium data, performs basic quality control, and identifies spatially variable genes.\n2. **Integrate Ground Truth Annotations:** Loads a ground truth annotation file and adds it to the spatial transcriptomics data.\n3. **Perform Spatial Embedding with PySpaceFlow:** Uses a spatial deep learning model (`PySpaceFlow`) to generate a low-dimensional representation of the data while considering spatial relationships between cells.\n4. 
**Cluster Spatial Data:** Clusters the data based on the spatial embedding generated by PySpaceFlow using a Gaussian Mixture Model.\n5. **Visualize Results:** Generates spatial plots to visualize the ground truth annotations, the PySpaceFlow embedding, and the clustering results.\n\n**Detailed Structure and Functionality:**\n\n1. **Import Libraries:**\n - `import omicverse as ov`: Imports the `omicverse` library, which provides specialized spatial analysis tools, and aliases it as `ov`.\n - `import scanpy as sc`: Imports the `scanpy` library, which is a widely used tool for single-cell analysis, and aliases it as `sc`.\n - `import pandas as pd`: Imports the `pandas` library, used for data manipulation and working with dataframes, and aliases it as `pd`.\n - `import os`: Imports the `os` library, which provides functions for interacting with the operating system.\n\n2. **Setup:**\n - `ov.utils.ov_plot_set()`: Sets some plotting parameters for the `omicverse` library to improve the visual output.\n\n3. 
**Load and Preprocess Spatial Data:**\n - `adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')`: Reads the Visium spatial transcriptomics data from the specified H5 file into an `AnnData` object (a common data structure in `scanpy`) named `adata`.\n - `adata.var_names_make_unique()`: Ensures that gene names in the `AnnData` object are unique, which is crucial for downstream analysis.\n - `sc.pp.calculate_qc_metrics(adata, inplace=True)`: Calculates quality control (QC) metrics for each cell in the `AnnData` object and stores them in the `adata.obs` DataFrame (i.e., cell metadata).\n - `adata = adata[:,adata.var['total_counts']>100]`: Filters out genes (variables in the AnnData object) that have a total count of 100 or less, keeping only genes that are expressed at a higher level across all cells.\n - `adata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)`: Computes spatially variable genes (SVGs) using the `omicverse` implementation. It calculates the spatial variance of genes and identifies those that are most variable across the spatial locations. It uses the `prost` method to perform this calculation. The top 3000 SVGs are selected.\n - `adata.raw = adata`: Creates a backup of the original count data by storing it in the `adata.raw` attribute. This is useful for some analysis steps.\n - `adata = adata[:, adata.var.space_variable_features]`: Filters the `AnnData` object to keep only the genes identified as spatially variable in the previous step. This focuses the analysis on spatially relevant gene expression patterns.\n - `adata`: This line simply outputs the AnnData object to the console, showing basic information about the object.\n\n4. 
**Integrate Ground Truth Annotations:**\n - `Ann_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\\t', header=None, index_col=0)`: Loads the ground truth annotation data from a tab-separated file into a pandas DataFrame, using the first column as the index.\n - `Ann_df.columns = ['Ground Truth']`: Sets the column name of the annotation DataFrame to 'Ground Truth'.\n - `adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']`: Adds the ground truth annotation from the DataFrame to the `adata.obs` DataFrame (cell metadata), matching cell names between `adata.obs_names` and `Ann_df` index.\n - `sc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])`: Generates a spatial plot where each cell is colored based on its 'Ground Truth' annotation. The `hires` option specifies that the high-resolution image data is used for the plot.\n\n5. **Spatial Embedding with PySpaceFlow:**\n - `sf_obj=ov.space.pySpaceFlow(adata)`: Initializes a PySpaceFlow object, which is a spatial deep learning model from the `omicverse` library, using the spatial transcriptomics `AnnData` object as input.\n - `sf_obj.train(...)`: Trains the PySpaceFlow model using various parameters. This step learns a low-dimensional representation of the spatial data, capturing spatial dependencies between cells.\n - `sf_obj.cal_pSM(...)`: Calculates the pseudo-spatial mapping (pSM) using the trained model. This step transforms the data into a space where spatial similarity is better reflected, using a neighborhood graph and graph operations.\n\n6. **Visualize PySpaceFlow Results:**\n - `sc.pl.spatial(adata, color=['pSM_spaceflow','Ground Truth'],cmap='RdBu_r')`: Generates a spatial plot showing the 'pSM_spaceflow' (the PySpaceFlow embedding) and 'Ground Truth' annotations, using the 'RdBu_r' colormap.\n\n7. 
**Cluster Spatial Data:**\n - `ov.utils.cluster(adata,use_rep='spaceflow',method='GMM',...)`: Clusters the cells based on their representation using the spaceflow embedding obtained in the previous step. It utilizes a Gaussian Mixture Model (`GMM`) clustering method.\n\n8. **Visualize Clustering Results:**\n - `sc.pl.spatial(adata, color=['gmm_cluster',\"Ground Truth\"])`: Generates another spatial plot showing the 'gmm_cluster' assignments (clusters) and the ground truth annotations, allowing for a comparison between the automated clustering and the existing annotations.\n\n**Key Libraries and Concepts:**\n\n- **`omicverse`:** A library specialized in spatial omics analysis. It provides functions for spatial variable gene detection and a spatial embedding model (PySpaceFlow).\n- **`scanpy`:** A popular Python library for single-cell RNA-seq data analysis, also widely used in spatial transcriptomics. It provides functions for data reading, pre-processing, plotting, and clustering.\n- **`AnnData`:** A fundamental data structure in `scanpy` that stores both expression data (gene expression counts) and cell-level metadata (cell annotations and quality control metrics).\n- **Visium:** A spatial transcriptomics technology that captures both gene expression and spatial information from tissues.\n- **Spatial Variable Genes (SVGs):** Genes whose expression varies significantly based on spatial locations in the tissue.\n- **PySpaceFlow:** A spatial deep learning model that can be used to learn a low-dimensional representation of the data while considering spatial relationships between cells.\n- **Pseudo-Spatial Mapping (pSM):** The output of PySpaceFlow's transformation which encodes cell's spatial context and location into a low dimensional representation, capturing neighborhood effects and spatial distributions.\n- **Gaussian Mixture Model (GMM):** A probabilistic clustering method that assumes that the data is generated from a mixture of Gaussian distributions.\n\n**In 
summary, this script is a comprehensive pipeline for analyzing spatial transcriptomics data. It performs preprocessing, spatial embedding with a deep learning model, clustering, and visualization to explore and understand the spatial patterns of gene expression and their correspondence with the ground truth annotations.**", + "file": "t_spaceflow_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_stagate_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_stagate_annotated.json new file mode 100644 index 00000000..a39fe7ca --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_stagate_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs spatial transcriptomics analysis, using the `omicverse`, `scanpy`, and `pandas` libraries. Here's a breakdown of its functionality and structure:\n\n**Overall Goal:**\n\nThe script aims to:\n\n1. **Load and preprocess spatial transcriptomics data.** This includes loading Visium data, performing quality control, and selecting spatially variable genes.\n2. **Integrate spatial information using GraphST.** This step utilizes GraphST to learn an embedding that incorporates spatial relationships between cells.\n3. **Cluster data using GraphST embeddings.** It employs different clustering methods like mclust, louvain, and leiden after applying GraphST.\n4. **Integrate spatial information using pySTAGATE.** This step utilizes pySTAGATE to learn another embedding incorporating spatial relationships between cells.\n5. **Cluster data using pySTAGATE embeddings.** It employs different clustering methods like mclust, louvain, and leiden after applying STAGATE.\n6. **Visualize the results.** The script generates spatial plots of raw gene expression, GraphST clustering, and pySTAGATE clustering, and also spatial plots of the pseudo-spatial map (pSM).\n7. 
**Evaluate the clustering performance.** It calculates the Adjusted Rand Index (ARI) to compare the clustering results against known ground truth labels.\n\n**Detailed Breakdown:**\n\n**1. Initialization & Data Loading:**\n\n* **Imports:**\n * `omicverse` (aliased as `ov`): A library for multi-omics data analysis, focusing on spatial data.\n * `scanpy` (aliased as `sc`): A library for single-cell data analysis.\n * `pandas` (aliased as `pd`): A library for data manipulation and analysis.\n * `os`: A library for interacting with the operating system, used here to build file paths.\n * `matplotlib.pyplot` (aliased as `plt`): Library for plotting data.\n * `sklearn.metrics.cluster.adjusted_rand_score`: Used to calculate adjusted Rand index.\n* **Plotting Setup:** `ov.plot_set()` sets default plotting parameters for `omicverse`.\n* **Data Loading:**\n * `sc.read_visium()` loads Visium spatial data into an `AnnData` object named `adata`.\n * `adata.var_names_make_unique()` ensures gene names are unique.\n* **Quality Control:**\n * `sc.pp.calculate_qc_metrics()` calculates quality control metrics for the data.\n * `adata = adata[:, adata.var['total_counts'] > 100]` filters genes by total counts, keeping only genes with total count above 100.\n* **Spatial Variable Gene Selection:**\n * `ov.space.svg()` identifies spatially variable genes using the `prost` mode, setting the number of genes, a target sum, and specifying Visium as the platform.\n\n**2. Ground Truth Annotation:**\n\n* **Load Ground Truth:**\n * `pd.read_csv()` reads ground truth annotation data from a tab-separated file.\n* **Add Annotation:**\n * The `Ground Truth` column is added to the AnnData object's `obs` (observation metadata). The annotations are mapped to the `adata` observations using the observation names.\n* **Spatial Plot:**\n * `sc.pl.spatial()` creates a spatial plot with annotations colored by `Ground Truth`.\n\n**3. 
GraphST Integration:**\n\n* **GraphST Model Initialization:**\n * `ov.externel.GraphST.GraphST()` initializes a GraphST model, specifying the `AnnData` object and the device to use for training (GPU 'cuda:0').\n* **GraphST Training:**\n * `model.train()` trains the GraphST model.\n* **GraphST Clustering:**\n * `ov.utils.cluster()` is used to perform clustering using mclust with the GraphST embedding along with other representations, with different clustering algorithms like louvain and leiden as well. The resulting clustering labels are refined to incorporate neighborhood information.\n\n**4. Spatial Visualization with Cluster Labels:**\n\n* **Spatial Plot:**\n * `sc.pl.spatial()` produces spatial plots of refined mclust, leiden, louvain clustering results along with ground truth.\n\n**5. STAGATE Integration:**\n\n* **Coordinate Extraction:**\n * Spatial coordinates from `adata.obsm['spatial']` are extracted and stored in the `adata.obs` DataFrame as 'X' and 'Y'.\n* **STAGATE Model Initialization:**\n * `ov.space.pySTAGATE()` initializes a pySTAGATE model, specifying `AnnData` object, batch sizes, spatial coordinates, cutoff radius, epoch number, and other parameters.\n* **STAGATE Training:**\n * `STA_obj.train()` trains the STAGATE model.\n* **STAGATE Prediction:**\n * `STA_obj.predicted()` uses the trained STAGATE model to generate predictions.\n* **STAGATE Clustering:**\n * `ov.utils.cluster()` is used to perform clustering using mclust with the STAGATE embedding, along with other representations, with different clustering algorithms like louvain and leiden as well. The resulting clustering labels are refined to incorporate neighborhood information.\n\n**6. Spatial Visualization with STAGATE Labels:**\n\n* **Spatial Plot:**\n * `sc.pl.spatial()` produces spatial plots of refined mclust, leiden, louvain clustering results from STAGATE embedding along with ground truth.\n\n**7. 
Gene Visualization:**\n\n* **Gene of Interest Selection:**\n * The top 10 genes based on their PI value are displayed.\n * `plot_gene` is set to `MBP`.\n* **Spatial Plots of Expression:**\n * `sc.pl.spatial()` generates two spatial plots for `MBP`: one for the raw gene expression and another for STAGATE-transformed expression (`STAGATE_ReX` layer).\n\n**8. Pseudo-Spatial Map (pSM) Analysis:**\n\n* **pSM Computation:**\n * `STA_obj.cal_pSM()` computes the pseudo-spatial map (pSM) using the trained STAGATE model.\n* **Spatial Plot of pSM:**\n * `sc.pl.spatial()` generates a spatial plot that visualizes the `Ground Truth` and the computed `pSM_STAGATE`.\n\n**9. Performance Evaluation:**\n\n* **Adjusted Rand Index Calculation:**\n * `adjusted_rand_score` from `sklearn.metrics` is used to calculate the ARI between the different clustering results and the `Ground Truth`.\n * The ARI values for the different GraphST and pySTAGATE clustering results are printed.\n\n**Structure:**\n\nThe script follows a typical workflow for spatial transcriptomics analysis:\n\n1. **Data Loading and Preprocessing:** Handles raw data, filters, and transforms into a suitable format.\n2. **Spatial Integration:** Applies GraphST and pySTAGATE models to leverage spatial context.\n3. **Clustering:** Performs clustering on integrated data to identify cell groups.\n4. **Visualization:** Generates spatial plots to explore the results.\n5. 
**Evaluation:** Quantifies clustering performance against known labels.\n\n**Key Concepts:**\n\n* **AnnData:** A data structure for single-cell and spatial data analysis, used in `scanpy` and `omicverse`.\n* **Spatial Variable Genes:** Genes whose expression pattern varies across spatial locations.\n* **GraphST:** A deep learning method for learning spatial embeddings.\n* **pySTAGATE:** Another deep learning method for learning spatial embeddings.\n* **Clustering:** Grouping similar data points (cells in this case) based on their features.\n* **Adjusted Rand Index (ARI):** A measure of the similarity between two clusterings, correcting for chance.\n\n**In Summary:**\n\nThis script is a comprehensive example of how to use `omicverse`, `scanpy`, and related libraries to analyze spatial transcriptomics data. It demonstrates: data loading, preprocessing, spatial feature selection, spatial integration with GraphST and pySTAGATE, clustering, visualization, and quantitative evaluation of clustering performance. It provides a solid framework for spatial transcriptomics analysis and serves as a good starting point for more complex analyses.", + "file": "t_stagate_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_staligner_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_staligner_annotated.json new file mode 100644 index 00000000..b47a8bac --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_staligner_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script is designed for integrating and visualizing spatial transcriptomics data from multiple datasets using the STAligner method. 
Here's a breakdown of its functionality and structure:\n\n**Overall Functionality:**\n\nThe script takes two spatial transcriptomics datasets (Slide-seqV2_MoB and Stereo-seq_MoB), preprocesses them individually, combines them, applies the STAligner integration algorithm, and then visualizes the results using UMAP and spatial plots. This effectively aims to remove batch effects and align the data while preserving spatial information.\n\n**Detailed Structure:**\n\n1. **Imports:**\n * `scipy.sparse.csr_matrix`: For creating sparse matrices (efficient for large gene expression data).\n * `omicverse as ov`: A library for spatial transcriptomics analysis, likely containing the STAligner implementation.\n * `scanpy as sc`: A library for single-cell analysis, used for preprocessing and visualization.\n * `anndata as ad`: A library for handling annotated data (used as a main data structure).\n * `pandas as pd`: For data manipulation.\n * `os`: For interacting with the operating system, especially for file path manipulation.\n\n2. **Initialization:**\n * `ov.utils.ov_plot_set()`: Sets the plotting style for omicverse plots.\n * `Batch_list`, `adj_list`: Empty lists to store processed data and adjacency matrices, respectively.\n * `section_ids`: A list of strings representing the IDs of the datasets being analyzed.\n * `pathway`: The file path to the directory containing the h5ad files.\n\n3. 
**Data Loading and Preprocessing Loop:**\n * Iterates through each `section_id` in `section_ids`.\n * Loads an AnnData object from an h5ad file using `sc.read_h5ad()`.\n * Converts the data matrix to a sparse matrix if it's a pandas DataFrame, improving efficiency.\n * Ensures that gene (variable) names are unique by appending \"++\" to duplicates.\n * Makes observation (cell/spot) names unique by appending the section ID.\n * Calculates a spatial network based on spot coordinates using `ov.space.Cal_Spatial_Net()` and stores the adjacency matrix in `adata.uns['adj']`.\n * Identifies highly variable genes using `sc.pp.highly_variable_genes()`.\n * Normalizes and log transforms the data using `sc.pp.normalize_total()` and `sc.pp.log1p()`.\n * Subsets the AnnData object to include only highly variable genes.\n * Appends the adjacency matrix and processed AnnData object to the respective lists.\n\n4. **Data Concatenation**\n * Prints the list of AnnData objects that have been loaded.\n * Concatenates the list of preprocessed `AnnData` objects into a single `AnnData` object using `ad.concat()`, adding a 'slice_name' column that stores the original data set ids, `section_ids`.\n * Creates a 'batch_name' column from the 'slice_name' column and converts to category type.\n * Prints the shape of the concatenated `AnnData` object.\n\n5. **STAligner Integration:**\n * Creates a list of tuples indicating the order of slice integration using `iter_comb`\n * Initializes an STAligner object `ov.space.pySTAligner()`, passing in data (`adata_concat`), training parameters (`verbose`, `knn_neigh`, `n_epochs`, `iter_comb`) and setting the batch key to `\"batch_name\"`.\n * Trains the STAligner model using `STAligner_obj.train()`.\n * Retrieves the predicted latent representation (embeddings) using `STAligner_obj.predicted()`.\n\n6. 
**Downstream Analysis and Visualization:**\n * Calculates the nearest neighbor graph using the STAligner embeddings with `sc.pp.neighbors()`.\n * Performs Leiden clustering using `ov.utils.cluster()`.\n * Performs UMAP dimensionality reduction using `sc.tl.umap()`.\n * Generates and displays a UMAP plot, colored by 'batch_name' and 'leiden' cluster.\n * Sets up spatial plot parameters (spot sizes, title sizes).\n * Creates a figure with two subplots for spatial visualization.\n * Generates spatial plots for each dataset using `sc.pl.spatial()`, colored by cluster and removes the legend. Sets axes labels, and titles.\n * Inverts the y-axis of the second spatial plot for proper visualization.\n * Displays the generated plots using `plt.show()`.\n\n**Key Points:**\n\n* **Spatial Network:** The script uses `ov.space.Cal_Spatial_Net()` to calculate the spatial relationships between spots/cells, crucial for spatial integration.\n* **Batch Correction:** STAligner aims to remove batch effects between datasets (Slide-seqV2 and Stereo-seq).\n* **Downstream Analysis:** The processed data is clustered and visualized using UMAP, a common technique for single-cell data.\n* **Spatial Visualization:** The script produces spatial plots, allowing visual assessment of how the data is organized spatially after integration and clustering.\n\n**In summary, this script provides a complete workflow for analyzing two spatial transcriptomics datasets, integrating them with STAligner, and visualizing the results with both dimension reduction (UMAP) and spatial plots.** It uses various single-cell and spatial analysis libraries to perform these tasks efficiently. 
The primary goal is to combine spatial transcriptomic datasets in order to analyze cellular behavior in multiple spatial contexts, and the integration process should remove technical bias due to data collection.", + "file": "t_staligner_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_starfysh_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_starfysh_annotated.json new file mode 100644 index 00000000..540ebad4 --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_starfysh_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "Okay, let's break down the functionality and structure of this Python script. It's a comprehensive single-cell and spatial transcriptomics analysis pipeline, leveraging several libraries to achieve its goals.\n\n**Overall Functionality:**\n\nThe script's main purpose is to analyze spatial transcriptomics data (likely Visium data) using a pipeline that combines preprocessing, anchor spot identification, archetypal analysis, and a deep learning model (Starfysh) for cell type deconvolution and expression prediction. It aims to:\n\n1. **Load and Preprocess Data:** Reads in spatial gene expression data, image data, and gene signature information. It performs initial normalization, dimensionality reduction (UMAP), and image processing.\n2. **Identify Anchor Spots:** Determines spatially informative spots that correspond to distinct cell types using a gene signature.\n3. **Refine Anchor Spots with Archetypal Analysis:** Uses archetypal analysis to identify underlying cell states, improve anchor spot assignments, and refine gene signatures.\n4. **Train a Starfysh Model:** Trains a deep learning model (Starfysh) to learn the relationships between gene expression and spatial location.\n5. 
**Predict Cell Type Proportions and Expressions:** Deconvolves cell types based on the learned model, infers cell type-specific expression patterns, and predicts expression of specific genes of interest.\n6. **Visualize Results:** Generates various plots to visualize spatial gene expression, UMAP embeddings, archetype assignments, cell type proportions, and predicted gene expression.\n7. **Save Results:** Saves the trained model and the processed AnnData object to disk.\n\n**Structure and Key Steps (Line by Line):**\n\n**1. Imports and Setup:**\n\n* **Lines 1-6:** Imports essential libraries:\n * `scanpy` (`sc`): For single-cell analysis.\n * `omicverse` (`ov`): For multi-omics analysis, specifically spatial omics and integration.\n * `omicverse.externel.starfysh` (AA, utils, plot_utils, post\\_analysis, sf\\_model): Starfysh specific tools.\n * Sets `omicverse` plotting parameters.\n* **Lines 8-10:** Defines file paths, sample ID, and gene signature filename.\n\n**2. Data Loading and Preparation:**\n\n* **Lines 12-14:** Loads AnnData object (containing expression data) and its normalized version using `utils.load_adata` from `omicverse.externel.starfysh`. Keeps 2000 highly variable genes.\n* **Lines 16-17:** Imports `pandas` (`pd`) and `os` for data manipulation and file system interactions.\n* **Line 18:** Reads gene signatures from a CSV file into a Pandas DataFrame.\n* **Line 19:** Filters the gene signature to only include genes present in the loaded data.\n* **Line 20:** Displays the first few rows of the filtered gene signature.\n* **Lines 22-24:** Preprocesses image data using `utils.preprocess_img`, extracting spatial coordinates.\n* **Line 26:** Extracts the image, mapping information, and scaling factor from image metadata.\n* **Line 27:** Computes UMAP embeddings for visualization using `utils.get_umap`.\n\n**3. 
Image Visualization:**\n\n* **Lines 30-32:** Imports `matplotlib.pyplot` and displays the loaded image.\n* **Line 34:** Shows the first few rows of the spatial mapping information.\n\n**4. Visium Analysis Setup:**\n\n* **Lines 36-42:** Creates a `utils.VisiumArguments` object to encapsulate all necessary parameters for the spatial analysis, including data, gene signatures, image metadata, number of anchor spots, window size, and sample ID.\n\n**5. Anchor Spot and Data Updates:**\n\n* **Lines 44-45:** Retrieves updated AnnData and anchor spots dataframes using the `VisiumArguments` object.\n* **Lines 47-48:** Adds log library size and windowed log library size to the AnnData object's observation data, calculated by the `VisiumArguments` object.\n\n**6. Spatial Visualization of Library Size and Genes:**\n\n* **Lines 50-54:** Plots the spatial distribution of 'log library size' using `scanpy.pl.spatial`.\n* **Lines 59-63:** Plots the spatial distribution of 'windowed log library size' using `scanpy.pl.spatial`.\n* **Lines 68-72:** Plots the spatial distribution of 'IL7R' gene expression using `scanpy.pl.spatial`.\n* **Lines 77-80:** Plots anchor spots and their signatures using `plot_utils.plot_anchor_spots`.\n\n**7. 
Archetypal Analysis:**\n\n* **Line 82:** Initializes `AA.ArchetypalAnalysis` using normalized data.\n* **Line 83:** Computes archetypes using `aa_model.compute_archetypes` with 40 archetypes.\n* **Line 85:** Identifies archetypal spots using `aa_model.find_archetypal_spots` with major archetypes.\n* **Line 87:** Finds marker genes associated with each archetypal cluster.\n* **Line 89:** Maps archetypes to the closest anchor spots using `aa_model.assign_archetypes`.\n* **Line 91:** Finds distant archetypes not assigned to annotated cell types.\n* **Line 93:** Plots explained variance ratios using `plot_utils.plot_evs`.\n* **Line 95:** Plots the archetypes using `aa_model.plot_archetypes`.\n* **Line 97:** Plots the archetype mapping results.\n* **Lines 99-103:** Refines the anchor spots by passing `visium_args` and `aa_model` to `utils.refine_anchors`.\n\n**8. Starfysh Model Training and Evaluation:**\n\n* **Lines 106-108:** Updates adata and gene signature after refinement.\n* **Line 110:** Imports `torch` for deep learning.\n* **Lines 111-114:** Sets training parameters (repeats, epochs, patience) and device for model training.\n* **Lines 116-120:** Trains the Starfysh model using `utils.run_starfysh`.\n* **Line 122:** Get the updated AnnData objects after the training process.\n* **Lines 123-127:** Evaluates the model using `sf_model.model_eval` to get inference outputs.\n* **Line 129:** Import `numpy` library.\n* **Lines 130-131:** Get the number of cell types and randomly choose one cell type index to plot.\n* **Lines 132-135:** Plots the mean expression versus inferred proportion for a chosen random cell type using `post_analysis.gene_mean_vs_inferred_prop`.\n* **Line 137:** Plot the spatial distribution of inferred expression for 'ql_m' feature.\n\n**9. 
Cell to Proportion Conversion and Visualization:**\n\n* **Lines 139-146:** Defines a function `cell2proportion` to convert cell expression data to proportion data by creating a new AnnData object and copying the necessary components to the new object.\n* **Line 147:** Converts the adata\\_ object to proportion object using cell2proportion function.\n* **Line 149:** Show the `adata_plot` object.\n* **Lines 151-156:** Plots spatial distributions of cell type proportions (Basal, LumA, LumB) using `scanpy.pl.spatial`.\n* **Lines 159-164:** Visualizes cell type proportions (Basal, LumA, MBC, Normal epithelial) in UMAP embeddings using `omicverse.pl.embedding`.\n\n**10. Cell Type Specific Expression Prediction:**\n\n* **Lines 167-170:** Predicts cell type-specific expressions using `sf_model.model_ct_exp`.\n* **Lines 172-173:** Specifies the gene ('IL7R') and cell type ('Tem') to predict expression for.\n* **Line 174:** Adds the predicted expression for 'IL7R' in 'Tem' cell type as a layer to the `adata_` object.\n* **Lines 176-182:** Plots spatial distribution of predicted expression of IL7R using `scanpy.pl.spatial`.\n\n**11. 
Saving Results:**\n\n* **Lines 187-189:** Defines output directory and creates it if necessary.\n* **Line 191:** Saves the trained Starfysh model state dictionary.\n* **Line 193:** Saves the updated AnnData object in H5AD format.\n\n**Key Libraries and Their Roles:**\n\n* **scanpy:** For single-cell analysis functionalities like dimensionality reduction, spatial visualization, and data handling.\n* **omicverse:** A multi-omics analysis library with modules specifically for spatial omics and integration of various data types, and also contains the implementation for Starfysh model.\n* **pandas:** For tabular data manipulation, handling gene signatures.\n* **matplotlib:** For plotting various visualizations.\n* **torch:** For the deep learning model implementation and training.\n* **numpy:** For numerical operations\n\n**In Summary:**\n\nThis script represents a comprehensive spatial transcriptomics analysis pipeline. It goes beyond basic analysis by integrating archetypal analysis to refine cell type signatures and employing a deep learning model (Starfysh) for cell type deconvolution and cell type specific gene expression prediction. It includes comprehensive visualization and result saving functionalities. This pipeline is a good example of an advanced spatial analysis workflow, making use of various libraries to accomplish its task.", + "file": "t_starfysh_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/Tutorials-space_py/t_stt_annotated.json b/OvStudent/Converted_Jsons/Tutorials-space_py/t_stt_annotated.json new file mode 100644 index 00000000..2eda2cc2 --- /dev/null +++ b/OvStudent/Converted_Jsons/Tutorials-space_py/t_stt_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs spatial transcriptomics analysis using the `omicverse` and `scanpy` libraries, along with `scvelo`. It aims to identify developmental trajectories and spatial patterns within a mouse brain dataset. 
Let's break down the functionality and structure:\n\n**1. Setup and Data Loading:**\n\n* **Lines 1, 3, 4:** Imports necessary libraries:\n * `omicverse` as `ov`: For spatial transcriptomics analysis and visualization.\n * `scvelo` as `scv`: For RNA velocity analysis. Though imported, the script does not actually use scvelo functionality.\n * `scanpy` as `sc`: For single-cell data handling and preprocessing.\n* **Line 5:** Sets plotting parameters using `omicverse` to ensure consistent visual appearance.\n* **Line 7:** Reads a pre-existing AnnData object from the `mouse_brain.h5ad` file using `scanpy`. AnnData is a standard data format used in single-cell genomics.\n* **Line 8:** Displays the loaded AnnData object's metadata and structure.\n\n**2. Spatial Transition Tensor (STT) Analysis:**\n\n* **Line 10:** Creates an `STT` object using `omicverse`. This object represents a spatial and temporal model built upon the input `adata`. It specifies the spatial location data (`xy_loc`) and region annotations (`Region`).\n* **Line 12:** Estimates developmental stages for the cells within the STT model.\n* **Lines 14-15:** Trains the STT model using a set of specific parameters:\n * `n_states`: Number of potential developmental attractors or states.\n * `n_iter`: Number of iterations for model training.\n * `weight_connectivities`: Weight given to the connectivity matrix when constructing the STT landscape.\n * `n_neighbors`: Number of neighbors to consider when constructing the connectivity matrix.\n * `thresh_ms_gene`: Threshold for considering genes as MS (mature spliced) genes\n * `spa_weight`: The weight parameter of spatial information during STT training.\n* **Lines 17-19:** Generates an embedding plot visualizing the spatial locations of cells, colored by the estimated attractors.\n* **Lines 21-23:** Generates another spatial embedding plot, colored by the defined tissue region annotations.\n\n**3. 
Pathway Analysis:**\n\n* **Line 25:** Prepares a dictionary of pathway gene sets from a `KEGG_2019_Mouse.txt` file for mouse.\n* **Line 27:** Computes pathway scores for each cell based on the provided pathway gene sets and stores them within the STT object.\n* **Lines 29-33:** Generates and displays a pathway score plot, where each axis represents an embedding dimension. Sets axis labels with specified font size.\n\n**4. Tensor Pathway and Attractor Visualization:**\n\n* **Line 35:** Imports `matplotlib.pyplot` as `plt` for finer control over plotting.\n* **Lines 36-38:** Generates a tensor pathway visualization for the 'Wnt signaling pathway', mapped onto the spatial coordinates.\n* **Lines 40-42:** Generates a tensor pathway visualization for the 'TGF-beta signaling pathway' mapped onto the spatial coordinates.\n* **Lines 44-45:** Creates tensor plots for specified attractors, applying a filter and density parameters to improve clarity.\n\n**5. Landscape Construction, Lineage Inference and Sankey Diagram:**\n\n* **Line 48:** Constructs a landscape for the STT object using the 'xy_loc' coordinates, this landscape shows the potential flow of cells towards attractors\n* **Line 50:** Generates a spatial embedding plot of the transformed coordinates, colored by attractors and regions.\n* **Lines 52-53:** Infers cell lineages using the `MPPT` method. Includes parameters for flow, color, and point size.\n* **Line 55:** Generates a Sankey diagram visualizing the transitions between attractors and regions.\n\n**6. Saving and Loading Data:**\n\n* **Lines 60-61:** Saves the modified AnnData object (`adata`) and the aggregated version to separate files (`mouse_brain_adata.h5ad` and `mouse_brain_adata_aggr.h5ad`, respectively).\n* **Lines 63-64:** Reads the saved AnnData and aggregated AnnData objects.\n* **Line 66:** Recreates an STT object with the loaded data.\n* **Line 67:** Loads the saved data into the STT object.\n\n**7. 
Gene Expression Analysis and Visualization:**\n\n* **Line 69:** Sorts the genes based on their `r2_test` values in descending order within the AnnData's variable (gene) information.\n* **Line 71:** Plots the expression of the top 6 genes.\n* **Line 73:** Imports `matplotlib.pyplot` again for plotting.\n* **Lines 74-96:** Generates a figure containing subplots, each visualizing the expression of the 'Sim1' gene from different layers ('Ms', 'Mu', 'velo', and expression) mapped onto the spatial locations.\n\n**In Summary:**\n\nThis script performs a comprehensive spatial transcriptomics analysis using `omicverse` to:\n\n1. Load spatial data and annotations.\n2. Build and train an STT model for spatial and temporal analysis.\n3. Visualize spatial distributions of cells colored by attractors and regions.\n4. Perform pathway analysis.\n5. Visualize pathway activity in spatial context.\n6. Perform spatial trajectory analysis and visualize cell flow towards attractors.\n7. Save and load results.\n8. Analyze and visualize gene expression from different data layers.\n\nThe script effectively integrates several methods to gain insights into developmental processes and spatial patterns within the mouse brain data. The `omicverse` library provides a high-level interface for performing complex spatial analysis. It combines trajectory inference with spatial information to provide a more detailed view of cell behavior.", + "file": "t_stt_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_anno_trans_annotated.json b/OvStudent/Converted_Jsons/t_anno_trans_annotated.json new file mode 100644 index 00000000..557e065d --- /dev/null +++ b/OvStudent/Converted_Jsons/t_anno_trans_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a multi-omics analysis, combining RNA and ATAC sequencing data, followed by cell type label transfer and visualization. 
Here's a breakdown of its functionality and structure:\n\n**Core Functionality:**\n\n1. **Data Loading and Preparation:**\n - Loads RNA and ATAC sequencing data from `.h5ad` files into `AnnData` objects using `scanpy` (`sc`).\n - Merges (concatenates) the RNA and ATAC `AnnData` objects, keeping shared cell IDs using `sc.concat`.\n\n2. **Dimensionality Reduction & Embedding:**\n - Calculates a Minimum-Distortion Embedding (MDE) based on a \"glue\" embedding using `omicverse`'s `ov.utils.mde`. This is likely a joint embedding derived from both RNA and ATAC data that was precomputed and stored in `X_glue`.\n - Visualizes data using embedding plots (likely a 2D representation like UMAP or t-SNE) colored by cell `domain` (RNA or ATAC) and `major_celltype`.\n\n3. **Cell Type Label Transfer:**\n - Creates a weighted k-nearest neighbor (KNN) trainer object based on RNA data using `omicverse`'s `ov.utils.weighted_knn_trainer`.\n - Transfers cell type labels from RNA cells to ATAC cells using weighted KNN, along with an uncertainty value for each transfer, using `ov.utils.weighted_knn_transfer`.\n - Adds the transferred cell type labels and their uncertainties to the ATAC data's observation metadata.\n - Overwrites the original `major_celltype` column in ATAC with the transferred labels.\n - Visualizes ATAC data with UMAP, colored by transfer uncertainty and final `major_celltype`.\n\n4. **Visualization and Redundant Steps:**\n - Repeats the merge of RNA and ATAC data, followed by the calculation of a minimum-distortion embedding and the visualization. 
This seems to be redundant, since it does not change the data.\n\n**Code Structure:**\n\n* **Imports:**\n - `omicverse` as `ov`: Provides functions for multi-omics analysis, including embeddings and KNN transfer.\n - `matplotlib.pyplot` as `plt`: Used for plotting configuration, but not directly used for generating plots (plots are being generated by `omicverse`).\n - `scanpy` as `sc`: Used for handling `AnnData` objects and loading data.\n\n* **Data Loading and Preparation (Lines 6-10, 57-58):**\n - `rna = sc.read(\"data/analysis_lymph/rna-emb.h5ad\")`: Loads RNA data.\n - `atac = sc.read(\"data/analysis_lymph/atac-emb.h5ad\")`: Loads ATAC data.\n - `combined = sc.concat([rna, atac], merge='same')`: Concatenates RNA and ATAC data.\n - `combined1 = sc.concat([rna, atac], merge='same')`: Redundant concatenation of RNA and ATAC data.\n\n* **Embedding and Visualization (Lines 13-17, 23-27, 61-64):**\n - `combined.obsm['X_mde'] = ov.utils.mde(combined.obsm['X_glue'])`: Calculates MDE.\n - `ov.utils.embedding(...)`: Generates and displays embedding plots, using the calculated 'X_mde' embedding, with different color schemes.\n\n* **Cell Type Transfer (Lines 31-47):**\n - `knn_transformer = ov.utils.weighted_knn_trainer(...)`: Creates KNN trainer object.\n - `labels, uncert = ov.utils.weighted_knn_transfer(...)`: Transfers cell type labels.\n - `atac.obs[\"transf_celltype\"] = labels.loc[atac.obs.index, \"major_celltype\"]`: Assigns transferred labels to ATAC data.\n - `atac.obs[\"transf_celltype_unc\"] = uncert.loc[atac.obs.index, \"major_celltype\"]`: Assigns label uncertainties to ATAC data.\n - `atac.obs[\"major_celltype\"] = atac.obs[\"transf_celltype\"].copy()`: Overwrites original cell types.\n\n* **Final Visualization (Lines 49-54):**\n - `ov.utils.embedding(...)`: Generates and displays an embedding plot of ATAC data, colored by transferred cell type and uncertainty.\n\n**Key Points:**\n\n* **Multi-Omics Focus:** The script demonstrates a common workflow in 
multi-omics analysis: combining data from different assays (RNA and ATAC) to obtain a more comprehensive view of the biological system.\n* **Joint Embedding:** The \"glue\" embedding (`X_glue`) suggests that a prior step was taken to integrate the RNA and ATAC data into a shared embedding space.\n* **Weighted KNN:** Using weighted KNN allows for cell type label transfer based on the similarities in the integrated embedding, potentially accounting for varying levels of information in each source cell.\n* **Uncertainty Quantification:** Recording the uncertainty of label transfers provides valuable information for evaluating the quality and reliability of the transferred labels.\n* **Redundancy:** The repeated `sc.concat`, followed by MDE calculation and visualization seems like code that can be removed.\n\n**In summary, the script provides a workflow for integrating RNA and ATAC data, computing a joint embedding, transferring cell type labels, and visualizing the results. It showcases how `omicverse` and `scanpy` libraries can be leveraged for complex multi-omics analysis.**", + "file": "t_anno_trans_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_aucell_annotated.json b/OvStudent/Converted_Jsons/t_aucell_annotated.json new file mode 100644 index 00000000..9a221d2c --- /dev/null +++ b/OvStudent/Converted_Jsons/t_aucell_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell RNA sequencing analysis, focusing on pathway and gene set enrichment using the `omicverse`, `scanpy`, and `scvelo` libraries. Here's a breakdown of its functionality and structure:\n\n**Overall Goal:**\n\nThe script aims to:\n\n1. **Load and Preprocess Data:** Load a pancreas single-cell RNA-seq dataset, normalize, and log-transform the gene expression data.\n2. **Calculate AUCell Scores:** Use the AUCell (Area Under the Curve) method to quantify the activity of specific pathways and gene sets within each cell.\n3. 
**Perform Pathway Enrichment Analysis:** Assess which pathways are significantly enriched in the different cell populations.\n4. **Differential Gene Expression Analysis:** Identify genes that are differentially expressed between cell clusters.\n5. **Visualize Results:** Generate UMAP embeddings colored by AUCell scores, pathway enrichment scores, and cell clusters, and also dotplots of ranked genes.\n\n**Detailed Breakdown:**\n\n1. **Imports:**\n * `import omicverse as ov`: Imports the `omicverse` library, which provides tools for single-cell analysis, pathway analysis, and visualization.\n * `import scanpy as sc`: Imports the `scanpy` library, a popular tool for single-cell data analysis and visualization.\n * `import scvelo as scv`: Imports the `scvelo` library, which focuses on RNA velocity analysis.\n\n2. **Setup:**\n * `ov.utils.ov_plot_set()`: Sets the plotting style for `omicverse`.\n * `ov.utils.download_pathway_database()`: Downloads the pathway database required by `omicverse`.\n * `ov.utils.download_geneid_annotation_pair()`: Downloads the gene ID annotation pairs for use with `omicverse`.\n\n3. **Data Loading and Preprocessing:**\n * `adata = scv.datasets.pancreas()`: Loads the pancreas dataset provided by `scvelo` as an `AnnData` object.\n * `adata`: Displays the loaded `AnnData` object (which contains the gene expression data).\n * `adata.X.max()`: Finds the maximum expression value in the data matrix (`adata.X`).\n * `sc.pp.normalize_total(adata, target_sum=1e4)`: Normalizes the total gene expression counts per cell to 10,000.\n * `sc.pp.log1p(adata)`: Applies the log1p (log(1 + x)) transformation to the gene expression matrix.\n * `adata.X.max()`: Finds the maximum expression value after normalization and log transformation.\n\n4. 
**Pathway and Gene Set Analysis using AUCell:**\n * `pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2021.txt',organism='Mouse')`: Prepares a dictionary of gene sets from a GO Biological Process file, specific to mouse.\n * `geneset_name='response to vitamin (GO:0033273)'`: Defines a single gene set of interest (response to vitamin).\n * `ov.single.geneset_aucell(adata, geneset_name=geneset_name, geneset_dict=pathway_dict,layer='X')`: Calculates the AUCell scores for the specified single geneset.\n * `sc.pl.embedding(adata, basis='umap', color=[geneset_name])`: Plots the UMAP embedding, colored by the AUCell scores for the single geneset.\n * `geneset_names=['response to vitamin (GO:0033273)','response to vitamin D (GO:0033280)']`: Defines multiple gene sets (response to vitamin and response to vitamin D).\n * `ov.single.pathway_aucell(adata, pathway_names=geneset_names, pathway_dict=pathway_dict,layer='X')`: Calculates AUCell scores for the multiple specified pathways.\n * `sc.pl.embedding(adata, basis='umap', color=geneset_names)`: Plots the UMAP embedding, colored by the AUCell scores for each specified pathway.\n * `ov.single.geneset_aucell(adata, geneset_name='test_geneset', geneset_dict={'test_geneset':['Gnas','Ins2','Gcg']},layer='X')`: Calculates AUCell scores for a user-defined \"test_geneset\" containing `Gnas`, `Ins2`, and `Gcg` genes.\n * `sc.pl.embedding(adata, basis='umap', color=['test_geneset'])`: Plots the UMAP embedding colored by the \"test\\_geneset\" AUCell scores.\n\n5. 
**Pathway Enrichment Analysis:**\n * `adata_aucs=ov.single.pathway_aucell_enrichment(adata,pathway_dict=pathway_dict,layer='X')`: Calculates pathway enrichment scores using AUCell for all pathways.\n * `adata_aucs.obs=adata[adata_aucs.obs.index].obs`: Copies the metadata (`obs`) from the original `adata` to the enriched `adata_aucs`.\n * `adata_aucs.obsm=adata[adata_aucs.obs.index].obsm`: Copies the multi-dimensional per-cell annotations (`obsm`, e.g. the PCA/UMAP embeddings) from the original `adata` to the enriched `adata_aucs`.\n * `adata_aucs.obsp=adata[adata_aucs.obs.index].obsp`: Copies the pairwise cell-by-cell annotations (`obsp`, e.g. the neighbor-graph distances and connectivities) from the original `adata` to the enriched `adata_aucs`.\n * `adata_aucs`: Displays the `AnnData` object containing the AUCell enrichment results.\n * `adata_aucs.write_h5ad('data/pancreas_auce.h5ad',compression='gzip')`: Saves the enriched `adata_aucs` object to a H5AD file.\n * `adata_aucs=sc.read('data/pancreas_auce.h5ad')`: Loads the saved `adata_aucs` object from the H5AD file.\n * `sc.pl.embedding(adata_aucs, basis='umap', color=['response to vitamin (GO:0033273)','response to vitamin D (GO:0033280)'])`: Plots the UMAP embedding of `adata_aucs`, colored by AUCell scores of specified pathways.\n\n6. **Differential Gene Expression Analysis:**\n * `sc.tl.rank_genes_groups(adata_aucs, 'clusters', method='t-test',n_genes=100)`: Performs differential gene expression analysis using the t-test, comparing gene expression between clusters.\n * `sc.pl.rank_genes_groups_dotplot(adata_aucs,groupby='clusters')`: Displays the top ranked genes across clusters as a dotplot.\n * `degs = sc.get.rank_genes_groups_df(adata_aucs, group='Beta', key='rank_genes_groups', log2fc_min=2)`: Retrieves a dataframe of differentially expressed genes in the 'Beta' cell cluster (log2 fold change >= 2).\n * `degs`: Displays the dataframe of differentially expressed genes.\n\n7. 
**Combined Visualization:**\n * `import matplotlib.pyplot as plt`: Imports the matplotlib library for plotting.\n * `axes=sc.pl.embedding(adata_aucs,ncols=3,color=['clusters',degs['names'][0],degs['names'][1],degs['names'][2]],legend_loc='on data',return_axes=True)`: Creates a combined UMAP plot with clusters and the top 3 differentially expressed genes.\n * `axes.tight_layout()`: Adjusts the plot layout for better viewing.\n\n8. **Original Data Analysis:**\n * `adata.uns['log1p']['base']=None`: Resets the log transformation base in `adata`.\n * `sc.tl.rank_genes_groups(adata, 'clusters', method='t-test',n_genes=100)`: Performs differential gene expression analysis on the original adata object.\n * `res=ov.single.pathway_enrichment(adata,pathways_dict=pathway_dict,organism='Mouse',layer='X',groupby='clusters')`: Performs pathway enrichment analysis.\n * `ax=ov.single.pathway_enrichment_plot(res,plot_title='Enrichment',cmap='Reds')`: Plots the pathway enrichment results.\n\n**In Summary:**\n\nThis script is a comprehensive example of how to combine `omicverse`, `scanpy`, and `scvelo` for single-cell RNA-seq analysis. It goes beyond basic preprocessing and visualization by focusing on pathway activity and enrichment analysis, allowing researchers to gain insights into the biological processes driving cellular heterogeneity.", + "file": "t_aucell_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_bulk2single_annotated.json b/OvStudent/Converted_Jsons/t_bulk2single_annotated.json new file mode 100644 index 00000000..95632d3a --- /dev/null +++ b/OvStudent/Converted_Jsons/t_bulk2single_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a bulk RNA-seq deconvolution analysis using single-cell RNA-seq data as a reference. It leverages several libraries for omics data analysis and visualization, including `scanpy`, `omicverse`, `matplotlib`, `anndata`, and `scvelo`. 
Here's a breakdown of its functionality and structure:\n\n**1. Library Imports:**\n\n* **`import scanpy as sc`**: Imports the `scanpy` library, a popular tool for single-cell RNA-seq analysis. (Redundant import on line 66)\n* **`import omicverse as ov`**: Imports `omicverse`, a library that provides high-level functions for various omics analyses, including bulk-to-single cell deconvolution.\n* **`import matplotlib.pyplot as plt`**: Imports the `matplotlib` library for creating static visualizations.\n* **`import anndata`**: Imports the `anndata` library, used for handling annotated data matrices, commonly used in single-cell analysis.\n* **`import scvelo as scv`**: Imports the `scvelo` library, often used for RNA velocity analysis and provides some single-cell datasets.\n\n**2. Data Loading and Preprocessing:**\n\n* **`ov.plot_set()`**: Applies default plotting settings from `omicverse`, ensuring consistency in visualizations.\n* **`bulk_data = ov.read('data/GSE74985_mergedCount.txt.gz', index_col=0)`**: Reads bulk RNA-seq count data from a gzipped text file. The first column of the file is specified as the index (gene names).\n* **`bulk_data = ov.bulk.Matrix_ID_mapping(bulk_data, 'genesets/pair_GRCm39.tsv')`**: Maps gene IDs in the bulk data to a standard nomenclature (likely mouse gene symbols) using a provided mapping file.\n* **`bulk_data.head()`**: Displays the first few rows of the processed bulk data, allowing a quick inspection of the data.\n* **`single_data = scv.datasets.dentategyrus()`**: Loads a pre-processed single-cell RNA-seq dataset of the dentate gyrus from `scvelo`.\n* **`single_data`**: Prints the loaded single-cell data object, providing a summary of its structure.\n\n**3. 
Bulk-to-Single Cell Deconvolution Model Initialization:**\n\n* **`model = ov.bulk2single.Bulk2Single(...)`**: Initializes a `Bulk2Single` model from `omicverse` for deconvoluting bulk RNA-seq data.\n * `bulk_data`: Provides the processed bulk RNA-seq data.\n * `single_data`: Provides the reference single-cell RNA-seq data.\n * `celltype_key='clusters'`: Specifies the column in `single_data`'s annotation that defines cell types.\n * `bulk_group=['dg_d_1','dg_d_2','dg_d_3']`: Specifies which samples in the bulk data correspond to bulk measurements used for training.\n * `top_marker_num=200`: Sets the number of top marker genes to be used for each cell type.\n * `ratio_num=1`: Determines how many marker genes are selected relative to the total number of genes.\n * `gpu=0`: Selects GPU device 0 for computation (in `omicverse`, `gpu=-1` selects the CPU instead).\n\n**4. Initial Cell Fraction Prediction and Visualization:**\n\n* **`CellFractionPrediction = model.predicted_fraction()`**: Predicts cell type fractions in each bulk sample based on marker genes.\n* **`CellFractionPrediction.head()`**: Displays the first few rows of the predicted cell fraction matrix.\n* **`ax = CellFractionPrediction.plot(kind='bar', stacked=True, figsize=(8, 4))`**: Generates a stacked bar plot visualizing the cell type fractions for each bulk sample.\n* **`ax.set_xlabel('Sample')`, `ax.set_ylabel('Cell Fraction')`, `ax.set_title('TAPE Cell fraction predicted')`**: Sets plot labels and titles for clarity.\n* **`plt.legend(bbox_to_anchor=(1.05, 1), ncol=1)`**: Displays the legend outside of the plot area to avoid overlap with bars.\n* **`plt.show()`**: Shows the generated plot.\n\n**5. 
VAE Model Training:**\n\n* **`model.bulk_preprocess_lazy()` & `model.single_preprocess_lazy()`**: Preprocesses both bulk and single-cell data lazily (likely using a mechanism that avoids excessive memory use) to prepare for VAE training.\n* **`model.prepare_input()`**: Prepares the input data for the VAE model.\n* **`vae_net = model.train(...)`**: Trains a variational autoencoder (VAE) model.\n * `batch_size=512`: Sets the batch size for gradient descent during training.\n * `learning_rate=1e-4`: Sets the learning rate of the optimizer.\n * `hidden_size=256`: Sets the size of the hidden layers in the VAE model.\n * `epoch_num=3500`: Sets the number of training iterations (epochs).\n * `vae_save_dir='data/bulk2single/save_model'`, `vae_save_name='dg_vae'`: Specifies where to save the trained model weights.\n * `generate_save_dir='data/bulk2single/output'`, `generate_save_name='dg'`: Specifies the location to save generated single-cell data.\n* **`model.plot_loss()`**: Plots the loss function curve over the training epochs, a common method to verify model training progress.\n\n**6. VAE Model Loading and Data Generation:**\n\n* **`vae_net = model.load('data/bulk2single/save_model/dg_vae.pth')`**: Loads a pre-trained VAE model from a specified path, potentially skipping training if a model is already available.\n* **`generate_adata = model.generate()`**: Generates pseudo-single-cell expression profiles based on the trained VAE model and bulk data input. This essentially transforms the bulk profiles into a format resembling single-cell data.\n* **`generate_adata`**: Prints the structure of the generated `anndata` object.\n\n**7. 
Data Filtering and Visualization of Generated and Original Data:**\n\n* **`generate_adata = model.filtered(generate_adata, leiden_size=25)`**: Filters the generated data, keeping only cell clusters identified with leiden clustering that have a size of at least 25 cells.\n* **`generate_adata`**: Prints the structure of the filtered data.\n* **`ov.bulk2single.bulk2single_plot_cellprop(generate_adata, celltype_key='clusters')`**: Creates a plot of the cell type proportions for the filtered generated data.\n* **`plt.grid(False)`**: Removes grid lines from the plot.\n* **`ov.bulk2single.bulk2single_plot_cellprop(single_data, celltype_key='clusters')`**: Creates a plot of the cell type proportions for the original single-cell data.\n* **`plt.grid(False)`**: Removes grid lines from the plot.\n* **`ov.bulk2single.bulk2single_plot_correlation(single_data, generate_adata, celltype_key='clusters')`**: Creates a scatter plot to compare cell type proportions between the original and generated single-cell data.\n* **`plt.grid(False)`**: Removes grid lines from the plot.\n\n**8. Dimensionality Reduction and Embedding Visualization:**\n\n* **`generate_adata.obsm[\"X_mde\"] = ov.utils.mde(generate_adata.obsm[\"X_pca\"])`**: Computes an MDE (Minimum-Distortion Embedding) from the PCA (Principal Component Analysis) coordinates of the generated data. MDE is a dimensionality reduction technique often used to visualize single-cell data.\n* **`ov.utils.embedding(...)`**: Generates and visualizes an embedding plot of the generated data using the calculated MDE.\n * `basis='X_mde'`: Specifies the MDE coordinates for embedding.\n * `color=['clusters']`: Colors cells in the plot by cluster identity.\n * `wspace=0.4`: Sets horizontal spacing between subplots.\n * `palette=ov.utils.pyomic_palette()`: Uses a color palette defined in `omicverse`.\n * `frameon='small'`: Reduces the frame size.\n\n**In summary, this script performs the following workflow:**\n\n1. 
**Loads and Preprocesses Data:** Reads bulk RNA-seq data and single-cell RNA-seq data, performs gene ID mapping for the bulk data.\n2. **Initial Cell Fraction Prediction:** Predicts cell fractions in bulk data using marker genes.\n3. **Trains a VAE Model:** Uses the bulk and single-cell data to train a VAE model for data generation.\n4. **Generates Pseudo Single-Cell Data:** Uses the trained VAE model to generate single-cell data from the bulk data.\n5. **Filters and Visualizes Generated Data:** Filters generated data based on cell cluster size, and visualizes proportions of cell type.\n6. **Compares Original and Generated Data:** Compares the cell type proportions between original and generated single-cell data and provides visualization using a correlation plot.\n7. **Performs Dimensionality Reduction:** Computes an MDE embedding of the generated data and visualizes it, colored by cluster.\n\nThis script aims to computationally deconvolute bulk RNA-seq data into single-cell-like profiles, which allows for further analysis and interpretation of cellular heterogeneity present within the bulk samples. It combines the power of omicverse's bulk2single module, VAE models for representation learning, and established single-cell analysis techniques.", + "file": "t_bulk2single_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_bulk_combat_annotated.json b/OvStudent/Converted_Jsons/t_bulk_combat_annotated.json new file mode 100644 index 00000000..bb15f24c --- /dev/null +++ b/OvStudent/Converted_Jsons/t_bulk_combat_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs batch correction on three datasets and visualizes the effects of the correction. Here's a breakdown of its functionality and structure:\n\n**Functionality:**\n\n1. **Data Loading and Preparation:**\n * It loads three pre-existing datasets (likely gene expression data) from pickle files (`GSE18520.pickle`, `GSE66957.pickle`, `GSE69428.pickle`). 
These are stored as pandas DataFrames.\n * Each DataFrame is converted into an `anndata.AnnData` object, which is a structured object designed to store annotated data (e.g., gene expression data along with sample metadata). The data matrix of each AnnData object is the transpose of the loaded DataFrame.\n * A 'batch' column is added to the `obs` (observation metadata) attribute of each AnnData object, labeling each dataset with '1', '2', or '3'.\n\n2. **Data Integration:**\n * The three AnnData objects are concatenated into a single AnnData object called `adata`. The `merge='same'` argument keeps only those annotations not aligned to the concatenation axis (e.g. `var` columns) whose values are identical in all three objects.\n\n3. **Batch Correction:**\n * Batch correction is performed on the concatenated `adata` object using the `ov.bulk.batch_correction` function from the `omicverse` library. The 'batch' column is used to identify data from different batches. This aims to remove systematic differences between datasets introduced by technical biases.\n\n4. **Data Extraction:**\n * The original data and batch-corrected data are extracted from the `adata` object as pandas DataFrames (and transposed) to save the genes as columns and samples as rows.\n * These extracted DataFrames are saved to CSV files (`raw_data.csv`, `removing_data.csv`) for external use.\n * The `adata` object is saved to an H5AD file (`adata_batch.h5ad`), a common format for storing AnnData objects.\n\n5. **Visualization**\n * **Boxplots:** Boxplots are created for both the original (raw) and the batch-corrected data. Each box corresponds to a sample from a batch. The boxes are colored based on the batch they belong to (red, blue, green). The purpose of the boxplots is to show the before and after effect of batch correction of the data\n * **PCA and Embedding Plots:** The script performs PCA on both the original and batch-corrected data layers. 
Then, it generates embedding plots (likely scatter plots using PCA components) to visualize how samples are distributed in reduced dimensional space. These plots are colored by batch, allowing for a visual assessment of how batch effects are reduced or eliminated by the correction.\n\n**Structure:**\n\n* **Libraries:**\n * `anndata`: For handling annotated data objects.\n * `pandas`: For data manipulation and reading data.\n * `omicverse`: For omics data analysis, plotting, and batch correction functionality.\n * `matplotlib.pyplot`: For creating plots\n\n* **Workflow:**\n 1. **Data Loading & Annotation:** Loading the datasets, converting them to AnnData format, and annotating the samples.\n 2. **Data Integration:** Merging the datasets into a single AnnData object.\n 3. **Batch Correction:** Applying batch correction to the integrated data.\n 4. **Data Extraction & Storage:** Extracting and saving both the raw and batch-corrected data in tabular and anndata formats\n 5. **Visualization:** Visualizing both the raw and corrected data to verify the effect of batch correction using boxplots and embedding plots.\n 6. 
**PCA:** Performing PCA for dimensional reduction and plotting\n\n**Key Takeaways:**\n\n* The script demonstrates a typical workflow for batch correction in omics data analysis.\n* It uses `anndata` as the core data structure, which allows for flexible handling of data and annotations.\n* It relies on the `omicverse` library for batch correction and visualization.\n* The script aims to remove technical variation between different batches and then visualize the samples to see if samples from different batches have been integrated with each other.\n\n**In summary, this script is a tool for integrating and batch-correcting multiple gene expression datasets, providing visual and tabular outputs to evaluate the success of batch effect removal.**", + "file": "t_bulk_combat_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_bulktrajblend_annotated.json b/OvStudent/Converted_Jsons/t_bulktrajblend_annotated.json new file mode 100644 index 00000000..92aa99f5 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_bulktrajblend_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive analysis of single-cell RNA sequencing data, incorporating bulk RNA-seq data for trajectory inference and cell type identification. It leverages several libraries, including `omicverse`, `scanpy`, `scvelo`, `numpy`, and `matplotlib`. Here's a breakdown of its functionality and structure:\n\n**1. 
Setup and Data Loading (Lines 1-14):**\n\n - **Imports:** Necessary libraries are imported with aliases for easier use (`ov` for `omicverse`, `sc` for `scanpy`, `scv` for `scvelo`, `np` for `numpy`).\n - **Plot Settings:** `ov.plot_set()` configures plotting styles for `omicverse`.\n - **Single-Cell Data Loading:** The `scvelo.datasets.dentategyrus()` function loads a single-cell dataset of the dentate gyrus into an AnnData object (`adata`).\n - **Bulk Data Loading:** `ov.utils.read()` reads a bulk RNA-seq dataset from a file (`GSE74985_mergedCount.txt.gz`).\n - **Gene ID Mapping:** `ov.bulk.Matrix_ID_mapping()` maps gene IDs in the bulk data to the appropriate reference.\n - **Data Inspection:** The script displays the `adata` object and the first few rows of the processed `bulk` data.\n\n**2. Bulk-to-Single-Cell Mapping with `BulkTrajBlend` (Lines 16-38):**\n\n - **`BulkTrajBlend` Initialization:** An instance of the `BulkTrajBlend` class is created (`bulktb`). This class facilitates the mapping of bulk RNA-seq data onto single-cell data. It uses the loaded bulk and single-cell datasets. It also specifies bulk group labels (`dg_d_1`, `dg_d_2`, `dg_d_3`) and the cell type key within the single-cell data ('clusters').\n - **VAE Configuration:** The Variational Autoencoder (VAE) model within `BulkTrajBlend` is configured using `bulktb.vae_configure()` with 100 target cells.\n - **VAE Training:** The VAE model is trained using `bulktb.vae_train()`. 
Training parameters include batch size, learning rate, hidden size, number of epochs, and save directories for the model and generated data.\n - **VAE Loading:** The script attempts to load a pre-trained VAE model using `bulktb.vae_load()`.\n - **Data Generation:** The trained VAE is used to generate new single-cell data based on the bulk data using `bulktb.vae_generate()` with a specified `leiden_size`.\n - **Cell Proportion Plot:** `ov.bulk2single.bulk2single_plot_cellprop()` visualizes the cell proportion after the bulk-to-single-cell mapping.\n\n**3. GNN-Based Trajectory Inference (Lines 40-50):**\n\n - **GNN Configuration:** The Graph Neural Network (GNN) model within `BulkTrajBlend` is configured using `bulktb.gnn_configure()` with training parameters.\n - **GNN Training:** The GNN model is trained using `bulktb.gnn_train()`.\n - **GNN Loading:** The script loads a pre-trained GNN model using `bulktb.gnn_load()`.\n - **GNN Result Generation:** The trained GNN model is used to generate results using `bulktb.gnn_generate()`.\n - **MDE Computation:** MDE (Minimum-Distortion Embedding) coordinates are computed and stored in the single-cell data (`bulktb.nocd_obj.adata`).\n\n**4. MDE Visualization and Filtering (Lines 50-57):**\n\n - **MDE Plot:** MDE embeddings are plotted using `sc.pl.embedding()` to visualize the single-cell data, colored by clusters and a factor named 'nocd_n'.\n - **Filtered MDE Plot:** A second MDE embedding is plotted, filtered to only include cells where `nocd_n` does not contain a hyphen '-', colored by clusters and `nocd_n`.\n\n**5. Cell Interpolation (Lines 59-62):**\n\n - **Raw Cell Count:** Prints the number of cells in the raw `adata`.\n - **Cell Interpolation:** `bulktb.interpolation('OPC')` interpolates cell data based on the 'OPC' (Oligodendrocyte Precursor Cells) cell type.\n - **Interpolated Cell Count:** Prints the number of cells in the interpolated data.\n\n**6. 
Preprocessing for Original and Interpolated Data (Lines 64-78):**\n\n - **Data Preparation:** The script prepares the single-cell data (`adata` and `adata1` - interpolated data) for downstream analysis using a series of steps.\n - **Raw Data Storage:** The original data is stored in the `.raw` slot of each AnnData object.\n - **Highly Variable Gene Selection:** `sc.pp.highly_variable_genes()` identifies highly variable genes in both datasets.\n - **Subset to Highly Variable Genes:** The datasets are subsetted to include only the identified highly variable genes.\n - **Data Scaling:** `sc.pp.scale()` scales the gene expression data in both datasets.\n - **PCA:** Principal Component Analysis is performed on both datasets using `sc.tl.pca()`.\n\n**7. MDE Computation for Original and Interpolated Data (Lines 80-81):**\n\n - **MDE for both data:** MDE coordinates are computed and stored in `adata` and `adata1`.\n\n**8. MDE Visualization (Lines 83-93):**\n\n - **MDE Plot (Original):** An MDE embedding is plotted for the original single-cell data, colored by clusters.\n - **MDE Plot (Interpolated):** An MDE embedding is plotted for the interpolated single-cell data, colored by clusters.\n\n**9. Trajectory Analysis with `pyVIA` (Lines 95-105):**\n\n - **`pyVIA` Initialization (Original):** A `pyVIA` object (`v0`) is created for the original single-cell data. `pyVIA` is a tool to perform trajectory inference.\n - **`pyVIA` Run (Original):** The `pyVIA` analysis is executed using `v0.run()`.\n - **`pyVIA` Initialization (Interpolated):** A `pyVIA` object (`v1`) is created for the interpolated single-cell data.\n - **`pyVIA` Run (Interpolated):** The `pyVIA` analysis is executed using `v1.run()`.\n\n**10. 
Stream Plots and Pseudo Time (Lines 107-126):**\n\n - **Stream Plot (Original):** A stream plot is generated for the original data using `v0.plot_stream()`, displaying cell trajectories colored by clusters.\n - **Stream Plot (Interpolated):** A stream plot is generated for the interpolated data using `v1.plot_stream()`, displaying cell trajectories colored by clusters.\n - **Pseudo Time Stream Plot (Original):** A stream plot is generated for the original data using `v0.plot_stream()`, displaying cell trajectories colored by pseudo time.\n - **Pseudo Time Stream Plot (Interpolated):** A stream plot is generated for the interpolated data using `v1.plot_stream()`, displaying cell trajectories colored by pseudo time.\n\n**11. PAGA Graph Generation and Visualization (Lines 128-142):**\n\n - **Pseudo Time Computation (Original):** `v0.get_pseudotime(adata)` computes and stores pseudo time from the pyVIA analysis into `adata`.\n - **Neighbor Calculation (Original):** The neighbor graph is calculated using `sc.pp.neighbors()`.\n - **PAGA Calculation (Original):** A PAGA (Partition-based graph abstraction) graph is computed with pseudo time as prior, using `ov.utils.cal_paga()`.\n - **PAGA Plot (Original):** The generated PAGA graph is displayed using `ov.utils.plot_paga()`.\n - **Pseudo Time Computation (Interpolated):** `v1.get_pseudotime(adata1)` computes and stores pseudo time from the pyVIA analysis into `adata1`.\n - **Neighbor Calculation (Interpolated):** The neighbor graph is calculated using `sc.pp.neighbors()`.\n - **PAGA Calculation (Interpolated):** A PAGA graph is computed with pseudo time as prior, using `ov.utils.cal_paga()`.\n - **PAGA Plot (Interpolated):** The generated PAGA graph is displayed using `ov.utils.plot_paga()`.\n\n**In Summary:**\n\nThe script performs a complex single-cell RNA sequencing analysis, combining bulk data information with trajectory inference. 
Key steps involve:\n\n- **Data Loading and Preprocessing:** Loading and preprocessing both single-cell and bulk data.\n- **Bulk-to-Single-Cell Mapping:** Mapping bulk RNA-seq data onto the single-cell space using a VAE, followed by trajectory inference using a GNN.\n- **Data Interpolation:** Interpolating cell data based on a specific cell type, which might represent an intermediate or a missing population in the data.\n- **Trajectory Inference with pyVIA:** Running trajectory inference on original and interpolated datasets using `pyVIA`.\n- **Visualization:** Using MDE embeddings, stream plots, and PAGA graphs to visualize results and cell trajectories.\n- **Pseudo Time:** Computing and visualizing pseudo time for both the raw and interpolated data to infer development progression.\n\nThis script is a comprehensive example of using several bioinformatics tools and libraries to analyze and interpret single-cell RNA sequencing data, integrating bulk data to potentially enhance understanding of developmental or biological processes.", + "file": "t_bulktrajblend_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cellanno_annotated.json b/OvStudent/Converted_Jsons/t_cellanno_annotated.json new file mode 100644 index 00000000..e150f8c9 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cellanno_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive single-cell RNA-seq analysis using the `omicverse` and `scanpy` libraries. It covers data loading, preprocessing, quality control, dimensionality reduction, clustering, cell type annotation, and visualization. Here's a breakdown of its functionality and structure:\n\n**1. 
Setup and Data Loading:**\n\n* **Libraries Import:** Imports necessary libraries: `omicverse` (as `ov`), and `scanpy` (as `sc`).\n* **Version Check:** Prints the versions of `omicverse` and `scanpy` for reproducibility.\n* **Plotting Defaults:** Sets plotting defaults for `omicverse`.\n* **Data Loading:** Reads 10X Genomics data from a specified directory using `sc.read_10x_mtx`. It uses gene symbols as variable names and enables caching for faster loading in subsequent runs.\n\n**2. Preprocessing and Quality Control:**\n\n* **Quality Control (QC):** Filters cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes using `ov.pp.qc`.\n* **Preprocessing:** Normalizes and calculates highly variable genes using `ov.pp.preprocess`. It uses a shiftlog transform and pearson scaling by default.\n* **Highly Variable Gene Selection:** Saves the original data in `adata.raw`, filters the `AnnData` object to retain only the highly variable genes.\n* **Scaling:** Scales the expression data using `ov.pp.scale`.\n\n**3. Dimensionality Reduction and Clustering:**\n\n* **Principal Component Analysis (PCA):** Performs PCA on the scaled data using `ov.pp.pca`, reducing the dimensionality to 50 principal components.\n* **Neighborhood Graph Construction:** Constructs a neighborhood graph using `sc.pp.neighbors`. It uses scaled PCA representation for graph construction.\n* **Leiden Clustering:** Performs Leiden clustering to identify cell populations using `sc.tl.leiden`.\n* **Minimum Distortion Embedding (MDE):** Reduces the dimensionality of the PCA representation for visualization using `ov.utils.mde` and stores it in `adata.obsm[\"X_mde\"]`.\n\n**4. Cell Type Annotation:**\n\n* **pySCSA Initialization (CellMarker):** Creates a `pySCSA` object configured to use the CellMarker database for cell type annotation.
It sets parameters for fold change, p-value, target (CellMarker), tissue (all tissues), and database path.\n* **Cell Annotation (CellMarker):** Annotates cells based on Leiden clusters using the CellMarker database. It uses ranked gene expression to perform the annotation.\n* **Auto Annotation (CellMarker):** Automatically annotates the cells based on the CellMarker annotation results.\n* **pySCSA Initialization (PanglaoDB):** Creates a `pySCSA` object configured to use the PanglaoDB database for cell type annotation. It sets parameters for fold change, p-value, target (PanglaoDB), tissue (all tissues), and database path.\n* **Cell Annotation (PanglaoDB):** Annotates cells based on Leiden clusters using the PanglaoDB database. It uses ranked gene expression to perform the annotation.\n* **Print Cell Annotations:** Prints the cell annotation summary using `scsa.cell_anno_print()`\n* **Auto Annotation (PanglaoDB):** Automatically annotates the cells based on the PanglaoDB annotation results.\n\n**5. 
Visualization:**\n\n* **Embedding Plot (Leiden/CellMarker/PanglaoDB):** Generates and displays an embedding plot colored by Leiden clusters, CellMarker annotations, and PanglaoDB annotations.\n* **Group Creation:** Adds a \"group\" column to `adata.obs`, initially set to \"A\" for all cells, and sets the first 1000 cells to \"B\".\n* **Embedding Plot (Group):** Generates an embedding plot colored by group membership.\n* **Cell Proportion Plot:** Generates a cell proportion plot based on CellMarker annotations, comparing the proportions of different cell types between groups A and B.\n* **Embedding Plot (Cell Type):** Generates a celltype embedding plot.\n* **Ratio of Observed to Expected (Ro/e):** Calculates the ratio of observed to expected cell proportions using the provided 'group' and 'scsa_celltype_cellmarker'.\n* **Ro/e Heatmap:** Transforms the Ro/e ratios based on thresholds (>=2 is '+++', >=1.5 is '++', >=1 is '+', otherwise '+/-'), and displays a heatmap with these symbolic annotations.\n\n**6. 
Marker Gene Analysis:**\n\n* **Marker Gene Dictionary:** Creates a dictionary of marker genes for each cell type\n* **Leiden Dendrogram:** Computes and stores a dendrogram based on leiden clustering.\n* **Dot Plot:** Generates a dot plot of gene expression for a list of marker genes across Leiden clusters.\n* **Cluster to Annotation Mapping:** Creates a dictionary manually mapping Leiden clusters to major cell type labels.\n* **Annotation with Major Cell Types:** Uses a function to map from the dictionary and add the major cell type to adata.obs as a column.\n* **Embedding Plot (Major and CellMarker):** Generates and displays an embedding plot colored by major cell types and CellMarker annotations.\n* **Get Cell Type Markers:** Extracts marker genes for each cell type identified by CellMarker.\n* **Print Marker Genes:** Prints the keys of the marker dictionary and the marker genes associated with the \"B cell\" type.\n* **Get Tissue list:** Prints the available tissue list in the PySCSA database.\n\n**In Summary:**\n\nThis script provides a comprehensive workflow for analyzing single-cell RNA-seq data. It covers essential steps from data loading and quality control to visualization and cell type annotation using different annotation databases (cellmarker and panglaodb). It also goes a step further in analyzing the ratio of observed to expected cell proportion across different groups as well as using a dotplot to analyze marker gene expression. The script is well-commented, making it easier to understand and adapt for other datasets. 
It is using `omicverse` for a major part of the pipeline.", + "file": "t_cellanno_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cellfate_annotated.json b/OvStudent/Converted_Jsons/t_cellfate_annotated.json new file mode 100644 index 00000000..7ecd6ce4 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cellfate_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell analysis workflow using the `omicverse` library, focusing on inferring driver regulators along a specific lineage. Here's a breakdown of the functionality and structure:\n\n**1. Library Imports:**\n - `import omicverse as ov`: Imports the `omicverse` library, likely for single-cell analysis. It's aliased as `ov` for easier access.\n - `import scanpy as sc`: Imports the `scanpy` library, a common tool for single-cell analysis, aliased as `sc`.\n - `import pandas as pd`: Imports the `pandas` library, used for data manipulation, aliased as `pd`.\n - `from tqdm.auto import tqdm`: Imports the `tqdm` library for progress bars during computations.\n - `import matplotlib.pyplot as plt`: Imports `matplotlib` for generating plots.\n - `import seaborn as sns`: Imports `seaborn` for statistical data visualization.\n\n**2. Initialization and Data Loading:**\n - `ov.plot_set()`: Sets up the plotting environment from the `omicverse` library.\n - `adata = ov.single.mouse_hsc_nestorowa16()`: Loads a pre-processed single-cell dataset of mouse hematopoietic stem cells from `omicverse` and stores it in the `adata` variable, which is an AnnData object used by Scanpy.\n - `prior_network = ov.single.load_human_prior_interaction_network(dataset='nichenet')`: Loads a human prior interaction network from `omicverse`. The `nichenet` dataset is specified.\n\n**3. 
Network Conversion & Storage:**\n - `prior_network = ov.single.convert_human_to_mouse_network(prior_network, server_name='asia')`: Converts the human interaction network to a mouse network using a server in Asia.\n - `prior_network.to_csv('result/combined_network_Mouse.txt.gz',sep='\\t')`: Saves the converted network to a compressed tab-separated file in the `result` directory.\n - `prior_network=ov.read('result/combined_network_Mouse.txt.gz',index_col=0)`: Reads the saved mouse interaction network back into memory.\n\n**4. pyCEFCON Analysis:**\n - `CEFCON_obj = ov.single.pyCEFCON(adata, prior_network, repeats=5, solver='GUROBI')`: Initializes a `pyCEFCON` object, using the loaded single-cell data (`adata`) and the mouse interaction network (`prior_network`). The `repeats` and `solver` parameters are passed to the algorithm. `pyCEFCON` is likely a method for identifying cell fate regulators.\n - `CEFCON_obj.preprocess()`: Preprocesses the data for the `pyCEFCON` algorithm.\n - `CEFCON_obj.train()`: Trains the `pyCEFCON` model to infer regulatory relationships.\n - `CEFCON_obj.predicted_driver_regulators()`: Identifies and stores the predicted driver regulators for each lineage in the data.\n - `CEFCON_obj.predicted_RGM()`: Predicts the activity of the inferred regulatory gene modules (RGM).\n\n**5. Results Exploration:**\n - `CEFCON_obj.cefcon_results_dict['E_pseudotime'].driver_regulator.head()`: Displays the first few rows of the `driver_regulator` table for the lineage named `E_pseudotime`.\n - `CEFCON_obj.cefcon_results_dict['E_pseudotime']`: Accesses and prints the result object for the `E_pseudotime` lineage.\n - `lineage = 'E_pseudotime'`: Sets the lineage of interest to be `E_pseudotime`.\n - `result = CEFCON_obj.cefcon_results_dict[lineage]`: Extracts the `E_pseudotime` results.\n \n**6. 
Gene Embedding Analysis:**\n - `gene_ad=sc.AnnData(result.gene_embedding)`: Creates a new AnnData object containing the gene embeddings from the CEFCON result.\n - `sc.pp.neighbors(gene_ad, n_neighbors=30, use_rep='X')`: Computes neighbor graph using the gene embeddings.\n - `sc.tl.leiden(gene_ad, resolution=1)`: Performs Leiden clustering on the gene embedding space.\n - `sc.tl.umap(gene_ad, n_components=2, min_dist=0.3)`: Reduces the dimensionality of gene embedding using UMAP, preparing for visualization.\n - `ov.utils.embedding(...)`: Creates and displays a UMAP plot showing Leiden clusters based on the gene embeddings, customized with `omicverse` specific plotting functions\n \n**7. Driver Regulator Visualization:**\n - `data_for_plot = result.driver_regulator[result.driver_regulator['is_driver_regulator']]`: Filters the driver regulator table to keep only those regulators that were deemed as drivers.\n - `data_for_plot = data_for_plot[0:20]`: Selects the top 20 driver regulators.\n - The subsequent code uses `matplotlib` and `seaborn` to create a horizontal bar plot of the influence scores of the top 20 driver regulators for the specified lineage, and applies customization to the plot style.\n\n**8. Additional Result Visualization:**\n - `result.plot_driver_genes_Venn()`: Generates a Venn diagram showing the overlap of driver genes for different lineages (This function is likely available from the `omicverse` library).\n - `adata_lineage = adata[adata.obs_names[adata.obs[result.name].notna()],:]`: Extracts cells with data for the `E_pseudotime` lineage from the original `adata` object.\n - `result.plot_RGM_activity_heatmap(...)`: Generates a heatmap of Regulatory Gene Module (RGM) activity across different cell types within the lineage, and applies customization such as clustering and a custom legend placement.\n\n**In Summary:**\n\nThis script performs the following key operations:\n\n1. Loads single-cell data and a prior interaction network.\n2.
Converts the network to the appropriate species.\n3. Runs the `pyCEFCON` algorithm to infer driver regulators along a specific lineage.\n4. Visualizes the results:\n - Gene embeddings via UMAP and Leiden clustering\n - Influence scores of top driver regulators using a bar plot.\n - Driver gene overlaps through a Venn Diagram\n - RGM activity heatmap over cell types\n5. Customizes plots with formatting and styling.\n\nThis script uses omicverse in conjunction with scanpy, pandas, matplotlib and seaborn to conduct a single-cell analysis pipeline. The script explores the concept of cellular lineage and regulatory elements, allowing for a deeper understanding of the molecular basis of cellular differentiation or development. The utilization of a pre-defined interaction network allows for prior knowledge to guide the inference of regulatory mechanisms.", + "file": "t_cellfate_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cellfate_gene_annotated.json b/OvStudent/Converted_Jsons/t_cellfate_gene_annotated.json new file mode 100644 index 00000000..9d25880b --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cellfate_gene_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive single-cell RNA sequencing analysis, starting from raw data loading to trajectory inference, gene expression trend analysis, and the identification of key genes associated with cell fate transitions. Here's a breakdown of its functionality and structure:\n\n**Overall Workflow:**\n\n1.
**Data Loading and Preprocessing:**\n * Loads a publicly available dataset (`dentategyrus`) from the `scvelo` library.\n * Applies quality control (QC) filtering to remove low-quality cells based on mitochondrial percentage, number of UMIs, and the number of detected genes.\n * Stores the raw counts as a layer in the AnnData object.\n * Preprocesses the data using `shiftlog` and Pearson normalization, selects highly variable genes (HVGs), and scales the data.\n * Stores the processed data as raw for later.\n * Performs principal component analysis (PCA) on the scaled data.\n * Computes Minimum Distortion Embedding (MDE) from PCA result.\n\n2. **Clustering and Embedding:**\n * Visualizes the MDE embedding, color-coded by cell clusters.\n * Removes 'Endothelial' cells.\n\n3. **SEACells Analysis:**\n * Applies SEACells, a method for creating a representation of single-cell data using archetypal analysis to find common expression profiles.\n * Initializes, constructs the kernel matrix, initializes archetypes and fits the SEACells model.\n * Monitors the convergence of the model by plotting the residual sum of squares (RSS).\n * Increments the model fit.\n * Visualizes a 2D plot of the PCA embedding, showing the spatial distribution of the SEACells.\n\n4. **SEACell Summarization:**\n * Generates a soft SEACell representation of the data by summarizing the expression data according to the SEACell matrix and cell type labels.\n\n5. **Trajectory Inference and Pseudotime Calculation:**\n * Preprocesses the soft SEACell data, performing HVG selection, scaling, PCA, and UMAP dimensionality reduction.\n * Reorders the cell type categories to align with the original clusters.\n * Performs trajectory inference using `pyVIA`, a trajectory inference algorithm, based on celltype.\n * Calculates and adds pseudotime to the summarized data.\n * Visualizes pseudotime on a UMAP plot.\n\n6. 
**Cell Fate Genie Analysis:**\n * Initializes the `cellfategenie` class.\n * Runs the ATR filtering method and visualizes gene filtering process.\n * Fits the `cellfategenie` model.\n * Visualizes the gene fitting by raw and filtered expression.\n * Calculates Kendall tau correlations.\n * Identifies and displays genes that show statistically significant changes in expression along the inferred pseudotime.\n * Calculates and adds border cell position into SEACell_soft_ad object based on the pseudotime and cell type.\n * Identifies and visualizes border genes between 'Granule immature' and 'Granule mature' cells.\n * Identifies and displays kernel genes within 'Granule immature' cells.\n\n7. **Output:**\n * Saves the summarized SEACells data in a .h5ad format.\n\n**Structure:**\n\nThe script follows a clear, modular structure:\n\n* **Imports:** Imports necessary libraries (`omicverse` as `ov`, `scvelo` as `scv`, `matplotlib.pyplot` as `plt`, `SEACells`, and `scanpy` as `sc`).\n* **Data Loading and Preprocessing:** A series of functions from `omicverse` and `scvelo` are used for data loading, QC, and preprocessing.\n* **SEACells Model:** SEACells model is initialized, fitted, and visualized.\n* **SEACell Data Summarization:** SEACells are aggregated to represent cell type.\n* **Trajectory Inference:** A trajectory inference using pyVIA is performed, and pseudotime is calculated.\n* **Cell Fate Genie Analysis:** Cellfategenie analysis is performed to determine expression trends based on pseudotime and visualize those.\n* **Gene Trend Analysis:** The genes with the most dynamic expression changes are identified and visualized.\n* **Border and Kernel Gene Identification:** Specific genes that mark transitions and that are important in specific cell states along the trajectory are identified.\n* **Visualization:** Various visualizations are produced using `matplotlib`, and the `omicverse` plotting functions, including embedding plots, convergence plots, gene trend plots, 
and heatmaps.\n* **Data Storage:** Writes the final AnnData object to an h5ad file.\n\n**Key Libraries and Functions Used:**\n\n* **`omicverse (ov)`:** A comprehensive toolkit for single-cell analysis, providing functions for data preprocessing, dimensionality reduction, visualization, and trajectory inference.\n* **`scvelo (scv)`:** Primarily for RNA velocity analysis, it provides the `dentategyrus` dataset used in the script.\n* **`matplotlib.pyplot (plt)`:** For generating static plots.\n* **`SEACells`:** For single-cell analysis based on archetypal analysis.\n* **`scanpy (sc)`:** A popular single-cell analysis package with functions for data preprocessing, dimensionality reduction, clustering, and more.\n* **`AnnData`:** The core data structure used by `scanpy`, `omicverse`, and `SEACells`.\n\n**In summary,** this script provides a comprehensive single-cell analysis pipeline for the `dentategyrus` dataset. It demonstrates how to integrate tools from several libraries to perform quality control, preprocessing, data analysis and visualization, trajectory inference and identification of important genes along the trajectory. The script is highly modular, well commented, and showcases the ability of integrating multiple packages for a single-cell analysis task.", + "file": "t_cellfate_gene_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cellfate_genesets_annotated.json b/OvStudent/Converted_Jsons/t_cellfate_genesets_annotated.json new file mode 100644 index 00000000..4f23ab24 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cellfate_genesets_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a cell trajectory analysis using the `omicverse` library, specifically focusing on identifying gene expression patterns related to pseudotime. 
Here's a breakdown of its functionality and structure:\n\n**Overall Goal:** The script aims to analyze gene expression changes along a pseudotime trajectory in a single-cell dataset. It leverages AUCell for pathway enrichment, CellFateGenie for filtering and fitting gene expression trends, and GeneTrends for visualizing those trends. Finally, it visualizes gene expression with heatmaps and word clouds.\n\n**Detailed Functionality and Structure:**\n\n1. **Import Libraries (Lines 1-3):**\n * `import omicverse as ov`: Imports the main `omicverse` library, likely for handling single-cell data analysis. This suggests that `omicverse` provides tools for data reading, pathway analysis, trajectory inference, and visualization.\n * `import scvelo as scv`: Imports the `scvelo` library, which is often used for RNA velocity analysis. Although `scvelo` is imported, it is not actually used in this specific script. This might indicate a prior or planned usage.\n * `import matplotlib.pyplot as plt`: Imports the standard `matplotlib` plotting library for creating figures.\n\n2. **Set Plotting Style (Line 4):**\n * `ov.ov_plot_set()`: Sets a specific plotting style provided by the `omicverse` library, ensuring a consistent visual appearance for all plots generated.\n\n3. **Load Data (Line 6):**\n * `adata = ov.read('data/tutorial_meta_den.h5ad')`: Reads an AnnData object from the file `'data/tutorial_meta_den.h5ad'`. AnnData is a common format for storing single-cell data. This line loads the dataset into the `adata` variable, which is central for all subsequent analyses.\n\n4. **Convert Raw Layer (Line 7):**\n * `adata = adata.raw.to_adata()`: Converts the `raw` layer of the `adata` object to the main data layer. This effectively makes the raw gene expression data the primary data. This is common if you want to analyze the original counts without potential modifications made to the main layer.\n\n5. 
**Display AnnData Object (Line 8):**\n * `adata`: Displays the loaded `adata` object. This likely prints summary information about the dataset, like cell counts, gene counts, and metadata.\n\n6. **Prepare Gene Sets (Lines 10-12):**\n * `pathway_dict = ov.utils.geneset_prepare('../placenta/genesets/GO_Biological_Process_2021.txt', organism='Mouse')`: Prepares a dictionary of gene sets based on the provided GO (Gene Ontology) file and specifies that the data is from Mouse. This line loads pathway information needed for AUCell.\n * `len(pathway_dict.keys())`: Calculates and likely prints the number of pathways (or gene sets) present in the loaded dictionary.\n\n7. **AUCell Enrichment (Lines 15-20):**\n * `adata_aucs = ov.single.pathway_aucell_enrichment(adata, pathway_dict=pathway_dict)`: Calculates the AUCell (Area Under the Curve) enrichment scores for each pathway in each cell. This step computes enrichment for each pathway using gene expression in cells from the AnnData object and using pathway definitions from the `pathway_dict`. The resulting AUCell scores are stored in a new AnnData object called `adata_aucs`.\n * `adata_aucs.obs = adata[adata_aucs.obs.index].obs`, `adata_aucs.obsm = adata[adata_aucs.obs.index].obsm`, `adata_aucs.obsp = adata[adata_aucs.obs.index].obsp`, `adata_aucs.uns = adata[adata_aucs.obs.index].uns`: These lines copy metadata, observation matrices, pairwise observation matrices, and unstructured data from the original `adata` object to the new `adata_aucs` object. This step makes sure that the metadata is preserved in the output after the AUCell calculation.\n\n8. **Display AUCell AnnData Object (Line 22):**\n * `adata_aucs`: Displays the `adata_aucs` object. This likely shows the AUCell scores in a new AnnData object along with the copied metadata from the original AnnData object.\n\n9. 
**CellFateGenie Initialization (Line 24-25):**\n * `cfg_obj = ov.single.cellfategenie(adata_aucs, pseudotime='pt_via')`: Initializes a CellFateGenie object using the `adata_aucs` data and specifies `pt_via` as the pseudotime variable. This likely sets up the model for analyzing gene expression changes along the provided pseudotime.\n * `cfg_obj.model_init()`: Initializes the internal CellFateGenie model parameters.\n\n10. **Adaptive Threshold Regression (ATR) Filtering (Line 27):**\n * `cfg_obj.ATR(stop=500)`: Performs Adaptive Threshold Regression filtering to identify genes with meaningful changes along the pseudotime. This step refines the gene set based on the selected pseudotime.\n\n11. **Plot Filtering Results (Lines 29-30):**\n * `fig, ax = cfg_obj.plot_filtering(color='#5ca8dc')`: Plots the filtering results (likely showing the trajectory and gene filtering information) using a specific color.\n * `ax.set_title('Dentategyrus Metacells\\nCellFateGenie')`: Sets the plot title.\n\n12. **Fit CellFateGenie Model (Line 32):**\n * `res = cfg_obj.model_fit()`: Fits the CellFateGenie model to the filtered data and stores the results in the `res` variable.\n\n13. **Plot Color Fitting (Lines 34-36):**\n * `cfg_obj.plot_color_fitting(type='raw', cluster_key='celltype')`: Creates a color fitting plot using the raw gene expression data, where the points are colored based on the 'celltype' annotation.\n * `cfg_obj.plot_color_fitting(type='filter', cluster_key='celltype')`: Creates a color fitting plot using the filtered gene expression data, where the points are colored based on the 'celltype' annotation. These two plots show how the filtering affects the visual separation of cells in pseudotime based on gene expression.\n\n14. **Kendall Tau Filtering (Lines 38-39):**\n * `kt_filter = cfg_obj.kendalltau_filter()`: Performs Kendall Tau correlation filtering on the CellFateGenie results.
This step calculates the correlation of each gene's expression with pseudotime.\n * `kt_filter.head()`: Displays the top rows of the Kendall Tau filtering results, likely showing the correlation values and p-values.\n\n15. **Select Genes (Line 41):**\n * `var_name = kt_filter.loc[kt_filter['pvalue'] < kt_filter['pvalue'].mean()].index.tolist()`: Selects genes whose p-value from the Kendall Tau test is less than the mean p-value. This effectively selects genes that are significantly correlated with pseudotime and stores them as a list in `var_name`.\n\n16. **GeneTrends Analysis (Lines 42-43):**\n * `gt_obj = ov.single.gene_trends(adata_aucs, 'pt_via', var_name)`: Initializes a GeneTrends object using the selected genes, AUCell scores data, and `pt_via` as pseudotime.\n * `gt_obj.calculate(n_convolve=10)`: Calculates the gene trends using a moving average convolution of size 10.\n\n17. **Print Number of Genes (Line 45):**\n * `print(f\"Dimension: {len(var_name)}\")`: Prints the number of genes selected in the previous step.\n\n18. **Plot Gene Trends (Lines 47-48):**\n * `fig, ax = gt_obj.plot_trend(color=ov.utils.blue_color[3])`: Plots the calculated gene trends using a specific color from the `omicverse` library's `blue_color` list.\n * `ax.set_title(f'Dentategyrus meta\\nCellfategenie', fontsize=13)`: Sets the title of the gene trends plot.\n\n19. **Plot Heatmap (Lines 50-56):**\n * `g = ov.utils.plot_heatmap(adata_aucs, var_names=var_name, sort_by='pt_via', color_by='celltype')`: Creates a heatmap of the selected genes, sorted by pseudotime and colored by cell type.\n * `g.fig.set_size_inches(2, 6)`: Sets the figure size for the heatmap.\n * `g.fig.suptitle('CellFateGenie', x=0.25, y=0.83)`: Sets the title for the heatmap.\n * `g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), fontsize=12)`: Adjusts the y-axis labels' font size on the heatmap.\n\n20. **Show Plot (Line 57):**\n * `plt.show()`: Displays all created matplotlib plots.\n\n21. 
**Gene Set Word Cloud (Lines 59-60):**\n * `gw_obj1 = ov.utils.geneset_wordcloud(adata=adata_aucs[:, var_name], meta_key='celltype')`: Initializes a Gene Set Word Cloud object using the expression data for the selected genes and colors by `celltype`.\n * `gw_obj1.get()`: Generates the word cloud data.\n\n22. **Plot Word Cloud Heatmap (Lines 62-63):**\n * `g = gw_obj1.plot_heatmap(figwidth=6, cmap='RdBu_r')`: Creates a heatmap representation of the gene set word cloud using a specific figure width and color map.\n * `plt.suptitle('CellFateGenie', x=0.18, y=0.95)`: Sets the main title for the heatmap.\n\n**In Summary:**\n\nThe script performs a comprehensive single-cell analysis focused on identifying gene expression changes along a pseudotime trajectory. It uses a combination of pathway enrichment (AUCell), trajectory analysis (CellFateGenie), trend analysis (GeneTrends), and various visualizations including filtering plots, colored fitting plots, trend plots, heatmaps and wordclouds. The analysis pipeline highlights the use of the `omicverse` library for a variety of single-cell analysis tasks. This is likely applied on a dataset of dentategyrus metacells from a mouse organism. The focus is to identify genes that change along a specific trajectory using pseudotime values defined by 'pt_via', visualize the expression patterns of those genes, and understand the underlying biological pathways through the analysis.", + "file": "t_cellfate_genesets_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cellphonedb_annotated.json b/OvStudent/Converted_Jsons/t_cellphonedb_annotated.json new file mode 100644 index 00000000..51b33b68 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cellphonedb_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs single-cell RNA sequencing analysis, focusing on cell-cell communication using the CellphoneDB method and further exploring gene set enrichment. 
Here's a breakdown of its functionality and structure:\n\n**1. Library Imports:**\n\n* `scanpy as sc`: Imports the Scanpy library, a common toolkit for single-cell analysis. It handles reading, filtering, and manipulating AnnData objects which are the standard data structure for single-cell data.\n* `matplotlib.pyplot as plt`: Imports the plotting library to create visualizations.\n* `pandas as pd`: Imports Pandas, crucial for data manipulation, especially dataframes and tabular data.\n* `numpy as np`: Imports NumPy for numerical operations and array handling.\n* `omicverse as ov`: Imports the OmicVerse library, which seems to be a custom or third-party package that extends functionality for omics data analysis, especially for visualization and CellphoneDB integration.\n* `os`: Imports the `os` module, providing functions for interacting with the operating system (like file paths).\n\n**2. Data Loading and Preprocessing:**\n\n* `ov.plot_set()`: Sets the default plotting style using a function from `omicverse`.\n* `adata = sc.read('data/cpdb/normalised_log_counts.h5ad')`: Reads single-cell gene expression data from an HDF5 file named `normalised_log_counts.h5ad` into a Scanpy AnnData object named `adata`.\n* `adata = adata[adata.obs['cell_labels'].isin([...])]`: Filters the `adata` object to only include cells with specific cell labels (e.g., 'eEVT', 'iEVT', 'DC', 'dNK1', etc.). 
It keeps the cells present in the provided list.\n* `adata`: Displays the filtered `adata` object (likely a summary representation).\n* `ov.pl.embedding(...)`: Creates a Uniform Manifold Approximation and Projection (UMAP) embedding plot of the cells, coloring each point by its cell label.\n* `adata.X.max()`: Finds the maximum value in the expression matrix of `adata`, useful to get a sense of the range of values.\n* `sc.pp.filter_cells(adata, min_genes=200)`: Filters cells, removing any cell that has less than 200 expressed genes.\n* `sc.pp.filter_genes(adata, min_cells=3)`: Filters genes, removing any gene that is expressed in less than 3 cells.\n* `adata1 = sc.AnnData(...)`: Creates a new AnnData object `adata1` from filtered count data, preserving cell and gene IDs from the original `adata` object. This is done most likely to work on a separate object while keeping the initial data object for different processing steps.\n* `adata1.write_h5ad('data/cpdb/norm_log.h5ad', compression='gzip')`: Writes the new AnnData object `adata1` to an HDF5 file.\n* `adata1`: Displays the new `adata1` object.\n\n**3. Metadata Preparation:**\n\n* `df_meta = pd.DataFrame(...)`: Creates a Pandas DataFrame `df_meta` containing cell IDs and cell labels.\n* `df_meta.set_index('Cell', inplace=True)`: Sets the 'Cell' column as the index of `df_meta`.\n* `df_meta.to_csv('data/cpdb/meta.tsv', sep='\\t')`: Saves `df_meta` to a tab-separated file.\n\n**4. 
CellphoneDB Analysis:**\n\n* `cpdb_file_path`, `meta_file_path`, `counts_file_path`, `out_path`: Defines file paths for the CellphoneDB database, metadata, counts matrix, and output.\n* `from cellphonedb.src.core.methods import cpdb_statistical_analysis_method`: Imports the CellphoneDB statistical analysis function.\n* `cpdb_results = cpdb_statistical_analysis_method.call(...)`: Executes CellphoneDB analysis with various parameters:\n * Database file path, metadata path, counts matrix path, gene identifiers\n * Parameters like the number of iterations, threshold for gene expression, number of threads, and p-value\n * Subsampling options for large datasets.\n* `ov.utils.save(cpdb_results,'data/cpdb/gex_cpdb_test.pkl')`: Saves CellphoneDB results to a pickle file.\n* `cpdb_results = ov.utils.load('data/cpdb/gex_cpdb_test.pkl')`: Loads CellphoneDB results from the pickle file.\n\n**5. Cell-Cell Interaction Network Analysis and Visualization:**\n\n* `interaction = ov.single.cpdb_network_cal(...)`: Calculates interaction edges based on CellphoneDB results to use to visualize the interactions.\n* `interaction['interaction_edges'].head()`: Displays the first few rows of the interaction edges DataFrame.\n* Several `ov.pl` (from `omicverse`) functions are used to create various visualizations based on the CellphoneDB results:\n * `cpdb_heatmap`: Creates a heatmap showing interaction scores between cell types.\n * `cpdb_chord`: Generates a chord diagram showing the strength of interactions between cell types.\n * `cpdb_network`: Produces a network graph representing cell-cell interactions with node size and edges scaled by the interaction scores\n* `ov.single.cpdb_plot_network(...)`: Customizes the CellphoneDB network plot with advanced parameters for labeling, titles, colors, node size, etc.\n* The code then filters the interaction edges and adata to create a subset of the data related to the `EVT` and `dNK` cell types. 
These subset objects are used to generate similar visualizations like heatmap, chord diagram and network plot for those specific cells.\n\n**6. Detailed Interaction Analysis for Specific Cell Types:**\n\n* `sub_means = ov.single.cpdb_exact_target(..., ['eEVT', 'iEVT'])`: Extracts mean interaction scores specifically targeting 'eEVT' and 'iEVT' cells.\n* `sub_means = ov.single.cpdb_exact_source(..., ['dNK1','dNK2','dNK3'])`: Extracts means for source cells 'dNK1', 'dNK2', and 'dNK3'.\n* `sub_means.head()`: Displays the head of the resulting interaction means DataFrame.\n* `ov.pl.cpdb_interacting_heatmap(...)`: Generates a heatmap showing mean expression scores between source and target cells with more control on what to display.\n* `ov.pl.cpdb_group_heatmap(...)`: Creates a grouped heatmap (source/target) of interaction means.\n* `ov.pl.cpdb_interacting_network(...)`: Generates a network graph displaying interactions between source and target cells from the specific means, taking the minimum mean and sum of means to keep only the most relevant interactions.\n\n**7. 
Gene Set Enrichment Analysis:**\n\n* `sub_means = sub_means.loc[~sub_means['gene_a'].isnull()]`: Removes null values from the 'gene\\_a' column of the dataframe\n* `sub_means = sub_means.loc[~sub_means['gene_b'].isnull()]`: Removes null values from the 'gene\\_b' column of the dataframe\n* `enrichr_genes = sub_means['gene_a'].tolist() + sub_means['gene_b'].tolist()`: Collects all gene names (ligands and receptors) to prepare a gene list for the enrichment analysis.\n* `pathway_dict = ov.utils.geneset_prepare(...)`: Loads a pathway database file (\"GO_Biological_Process_2023.txt\").\n* `enr = ov.bulk.geneset_enrichment(...)`: Performs gene set enrichment analysis on the interaction genes, against the GO terms.\n* `ov.plot_set()`: Sets the plotting style\n* `ov.bulk.geneset_plot(...)`: Creates a dotplot visualization of the gene set enrichment analysis results.\n\n**In Summary:**\n\nThis script is a comprehensive workflow for analyzing single-cell RNA sequencing data with a focus on cell-cell communication. It performs the following major steps:\n\n1. **Data Loading and Filtering:** Loads gene expression data and filters cells based on cell labels and gene counts.\n2. **CellphoneDB Analysis:** Uses CellphoneDB to infer potential cell-cell interactions.\n3. **Visualization:** Creates several visualizations to show cell-cell interactions (UMAP, heatmaps, chord diagrams, network graphs).\n4. **Targeted Analysis:** Examines interactions between specific source and target cell types in more detail and filters interaction means based on cell types of interest.\n5. 
**Gene Set Enrichment:** Performs gene set enrichment analysis of interaction molecules to further understand the biological mechanisms at play.\n\nThis script leverages several powerful libraries, making it a valuable tool for exploring complex biological systems at single-cell resolution.", + "file": "t_cellphonedb_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cluster_annotated.json b/OvStudent/Converted_Jsons/t_cluster_annotated.json new file mode 100644 index 00000000..16208c8f --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cluster_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive single-cell RNA sequencing (scRNA-seq) analysis using several popular libraries, including `omicverse`, `scanpy`, `scvelo`, and `sklearn`. It aims to demonstrate various dimensionality reduction, clustering, and topic modeling techniques, ultimately comparing their performance. Here's a breakdown of its functionality and structure:\n\n**1. Setup and Data Loading (Lines 1-8):**\n\n* **Imports:** The script begins by importing the necessary libraries: `omicverse` (aliased as `ov`), `scanpy` (`sc`), and `scvelo` (`scv`). It imports `scvelo` twice (redundantly) in lines 3 and 6.\n* **Plot Style:** It sets the plotting style using `ov.plot_set()`, which likely customizes the appearance of plots generated by `omicverse`.\n* **Data Loading:** The `scv.datasets.dentategyrus()` function loads a preprocessed dataset of the dentate gyrus from the `scvelo` library into an `AnnData` object named `adata`. This dataset likely contains gene expression counts of single cells from the dentate gyrus.\n* **Initial Display:** The `adata` variable is printed (likely using IPython's display functionality) to provide information about the dataset, such as its size and available metadata.\n\n**2. 
Preprocessing (Lines 10-13):**\n\n* **Preprocessing:** The core `ov.pp.preprocess()` function applies preprocessing steps to the gene expression data stored in `adata`. These steps include:\n * `'shiftlog|pearson'`: A two-part mode string: shift-log normalization, followed by highly variable gene selection based on Pearson residuals (not Pearson correlation).\n * `n_HVGs=3000`: It selects the top 3000 most highly variable genes (HVGs) for further analysis.\n* **Storing Raw Data:** `adata.raw = adata` stores the processed expression data in the `.raw` attribute, often used in `scanpy` to keep a copy of the original data before scaling.\n* **Subsetting HVGs:** The `adata` object is then subset to retain only the selected highly variable genes, discarding the others.\n* **Scaling:** `ov.pp.scale(adata)` scales the gene expression data so that each gene has zero mean and unit standard deviation.\n\n**3. Dimensionality Reduction (Lines 14-16):**\n\n* **PCA:** `ov.pp.pca(adata, layer='scaled', n_pcs=50)` performs Principal Component Analysis (PCA) on the scaled gene expression data, reducing it to 50 principal components (PCs).\n* **Variance Ratio Plot:** `ov.utils.plot_pca_variance_ratio(adata)` visualizes the variance explained by each PC, helping to assess the effectiveness of PCA.\n\n**4. Clustering and Embedding (Lines 18-31):**\n\n* **Neighborhood Graph:** `sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, use_rep='scaled|original|X_pca')` computes the neighborhood graph based on the top 50 PCs. 
Note that `'scaled|original|X_pca'` is a single `adata.obsm` key (the name under which `ov.pp.pca` stores the PCA computed from the scaled layer), not a choice among three alternative representations.\n* **Leiden Clustering:** `ov.utils.cluster(adata, method='leiden', resolution=1)` performs Leiden clustering, a graph-based community detection algorithm, and adds the cluster labels to the `adata.obs` DataFrame, with the resolution parameter set to 1.\n* **UMAP Embedding (Leiden):** `ov.utils.embedding(...)` generates a UMAP (Uniform Manifold Approximation and Projection) embedding of the data, coloring it by the generated Leiden cluster assignments.\n* **Louvain Clustering:** The script then repeats `sc.pp.neighbors`, `ov.utils.cluster`, and `ov.utils.embedding` for Louvain clustering, mirroring the Leiden steps.\n* **GMM Clustering:** `ov.utils.cluster(adata, method='GMM', n_components=21, ...)` performs Gaussian Mixture Model (GMM) clustering, aiming to identify 21 underlying Gaussian distributions within the data. The resulting clusters are added to `adata.obs`.\n* **UMAP Embedding (GMM):** UMAP embedding is generated, colored by the GMM cluster assignments.\n\n**5. Topic Modeling (Lines 31-45):**\n\n* **LDA:** `ov.utils.LDA_topic(adata, ...)` performs Latent Dirichlet Allocation (LDA) topic modeling on the gene expression data. LDA aims to discover underlying \"topics\" that explain the patterns of gene expression.\n* **Topic Contribution Plots:** The script generates two plots visualizing the topic contributions.\n* **UMAP Embedding (LDA Topics):** UMAP embedding colored by the LDA topic contributions.\n* **Random Forest Classification (LDA):** A Random Forest Classifier (`ov.utils.LDA_topic.get_results_rfc()`) is trained using the LDA topics to predict cell clusters.\n\n**6. Consensus Non-negative Matrix Factorization (cNMF) (Lines 47-65):**\n\n* **cNMF Initialization:** An `ov.single.cNMF` object is created to perform cNMF.\n* **cNMF Worker and Combination:** The code executes a cNMF algorithm distributed across four workers. 
The worker with `worker_i=0` is launched. The results from the individual workers are then combined into a single result using `cnmf_obj.combine()`.\n* **K-Selection Plot:** The script generates a plot for selecting an appropriate number of components.\n* **Consensus Step:** The script executes a consensus clustering step of the cNMF results, using `cnmf_obj.consensus`. The function computes a consensus matrix from multiple runs of NMF and performs hierarchical clustering. The clustergram is displayed for interactive K selection.\n* **cNMF Results Loading:** cNMF results are loaded from the generated files, and added to the `adata` object via the `get_results` function.\n* **UMAP Embedding (cNMF Topics):** UMAP embedding colored by the usage matrix of cNMF topics are generated.\n* **Random Forest Classification (cNMF):** A Random Forest Classifier is trained using the cNMF topics to predict cell clusters.\n* **UMAP Embedding (Random Forest LDA/cNMF):** UMAP embedding colored by the prediction of cell clusters from the Random forest classifier based on the LDA and cNMF results.\n\n**7. 
Evaluation and Comparison (Lines 68-90):**\n\n* **Adjusted Rand Index (ARI):** The script calculates the Adjusted Rand Index (ARI) to quantify the similarity between different clustering results and the initial \"clusters\" label.\n* **Print ARI Scores:** The ARI scores for all clustering methods (Leiden, Louvain, GMM, LDA, LDA+RFC, cNMF, and cNMF+RFC ) are printed to console for analysis.\n\n**Summary:**\n\nThe script performs a comprehensive scRNA-seq analysis, encompassing:\n\n* **Data loading and preprocessing:** Loads data, filters for high variance genes, scales the data.\n* **Dimensionality reduction:** PCA is performed to reduce dimensionality, with the variance of each component visualized.\n* **Clustering:** Leiden, Louvain, and GMM clustering are performed.\n* **Topic modeling:** LDA topic modeling is performed to discover latent topics in gene expression.\n* **cNMF:** cNMF is performed for consensus matrix factorization and feature selection.\n* **Classification:** Random Forest Classifier is trained using the results of the topic modeling and the consensus matrix factorization.\n* **Visualization:** UMAP embeddings are generated for each clustering result.\n* **Evaluation:** Adjusted Rand Index (ARI) is calculated for each clustering result.\n\n**Redundancies and Points to Consider:**\n\n* **Redundant Imports:** The `scvelo` library is imported twice. This has no impact on the code's execution.\n* **Redundant `sc.pp.neighbors` Calls:** `sc.pp.neighbors` is called multiple times with the same parameters. 
A single call would likely suffice if the resulting neighborhood structure is intended for different clustering methods.\n* **Dense Array Conversion:** Conversion of `adata.X` to a dense matrix using `.toarray()` is done twice (lines 47 and 66), and is usually only required for certain algorithms that do not operate on sparse matrices.\n* **Repetitive Code:** There is repetition in how UMAP embeddings are generated and configured.\n\nIn summary, this script provides a good example of how to use `omicverse`, `scanpy`, and `scvelo` to analyze single-cell data. It showcases various dimensionality reduction, clustering, topic modeling, and consensus matrix factorization techniques, making it a valuable resource for learning scRNA-seq analysis. The redundancy and repetitions could be improved for cleaner code.", + "file": "t_cluster_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cnmf_annotated.json b/OvStudent/Converted_Jsons/t_cnmf_annotated.json new file mode 100644 index 00000000..1a8532b8 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cnmf_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive single-cell RNA-seq analysis using several libraries, primarily focusing on **consensus non-negative matrix factorization (cNMF)** for identifying gene expression programs within the dataset. Here's a breakdown of its functionality and structure:\n\n**1. 
Setup and Data Loading:**\n\n* **Lines 1-5:** Import necessary libraries:\n * `scanpy` (`sc`): For general single-cell analysis tasks.\n * `omicverse` (`ov`): A library that seems to be used for multi-omics analysis, preprocessing, and visualization.\n * `scvelo` (`scv`): For RNA velocity analysis (though not explicitly used for velocity in this script).\n * `matplotlib.pyplot` (`plt`): For general plotting.\n * `matplotlib.patheffects`: For creating outlines in plots.\n * `numpy` (`np`): For numerical operations.\n * `seaborn` (`sns`): For statistical data visualization.\n * `matplotlib.gridspec`: For arranging subplots.\n* **Line 3:** Sets a default plot style using `omicverse`.\n* **Line 6:** Loads a pre-existing dataset (dentategyrus) using `scvelo`. This is stored in an `AnnData` object called `adata`, a common data structure for single-cell analysis.\n\n**2. Preprocessing and Dimensionality Reduction:**\n\n* **Line 8:** Starts timing the preprocessing step using an IPython magic command.\n* **Line 9:** Preprocesses the data:\n * The `shiftlog|pearson` mode is applied: shift-log normalization, with highly variable genes ranked by Pearson residuals (not Pearson correlation).\n * Selects the top 2000 highly variable genes (`n_HVGs=2000`).\n* **Line 10:** Displays the preprocessed AnnData object.\n* **Line 12:** Scales the data using `omicverse`.\n* **Line 13:** Performs principal component analysis (PCA) using `omicverse`.\n\n**3. Visualization of Cell Clusters:**\n\n* **Lines 15-17:** Sets up a `matplotlib` figure and axes for plotting.\n* **Lines 18-35:** Generates a UMAP embedding plot using `omicverse`. It colors cells by 'clusters', a pre-existing annotation in the dataset. It also applies several cosmetic customizations such as legend size, font outline, and title.\n * The `show=False` indicates that the plot will be displayed only after all components have been drawn.\n\n**4. 
cNMF Analysis:**\n\n* **Lines 37-39:** Initializes a cNMF object using `omicverse`:\n * Sets the number of components to test (5 to 10).\n * Specifies iterations, random seed, number of high variance genes, output directory, and name for the cNMF run.\n* **Line 41:** Performs matrix factorization for the cNMF object using 2 workers for parallel processing.\n* **Line 43:** Combines the results from the parallel runs.\n* **Line 45:** Creates a plot for k-selection using `cnmf_obj.k_selection_plot()`, helping to determine the appropriate number of components.\n* **Lines 47-48:** Sets parameters for consensus clustering : `selected_K=7` (number of clusters), and an initial `density_threshold=2.00` to be used in consensus clustering.\n* **Lines 50-53:** Performs consensus clustering using the specified parameters and displays the clustering result.\n* **Lines 55-60:** Repeats the consensus clustering, this time using `density_threshold=0.10`.\n\n**5. Visualization of Topic Distances and Cluster Labels:**\n\n* **Lines 62-76:** Prepare subplots for visualizing topic distances and cluster labels using matplotlib's gridspec layout and other plotting libraries.\n * The layout consists of:\n - A color map visualization of the cNMF topic distance matrix\n - Cluster labels placed left and above the distance matrix\n - A colorbar for the distance matrix.\n* **Lines 78:** Extracts the topic distance matrix.\n* **Lines 79-83:** Create and plot the distance matrix.\n* **Lines 85-89:** Create and plot cluster labels on the left side.\n* **Lines 91-95:** Create and plot cluster labels on the top side.\n* **Lines 97-107:** Creates a colorbar axis with a title and plots the colorbar.\n* **Lines 109:** Filters the cNMF components based on a density threshold.\n\n**6. 
Histogram of Local Density:**\n\n* **Lines 110-123:** Generates a histogram of local density values, and adds a vertical line at the density threshold.\n * Indicates the filtering of spectra based on density.\n * Provides context for the chosen threshold.\n\n**7. Loading and Inspecting cNMF Results:**\n\n* **Line 124:** Loads the cNMF results with the specified K and density threshold.\n* **Lines 127-133:** Displays the first few rows of the dataframes from cNMF results ('usage_norm', 'gep_scores', 'gep_tpm', 'top_genes'), showing how results are stored.\n\n**8. Integrating cNMF results into AnnData Object and Visualization:**\n\n* **Line 135:** Adds the cNMF results to the `adata` object.\n* **Lines 137-138:** Generates UMAP embedding plots colored by the `usage_norm` values which represent the contribution of each cNMF component in the single cell.\n* **Lines 140-156:** Creates another UMAP plot now colored by the `cNMF_cluster`, which is added to adata in line 135.\n * Visualizes clusters identified by cNMF.\n\n**9. Random Forest Classification:**\n\n* **Lines 158-160:** Adds random forest classifier results to the `adata` object based on the cNMF results. The classifier identifies the cells which belong to each cluster.\n* **Lines 162-178:** Generates a UMAP embedding plot now colored by classifier results `cNMF_cluster_rfc` and `cNMF_cluster_clf`.\n\n**10. Dot Plot of Top Genes:**\n\n* **Lines 180-183:** Extracts the top 3 genes from each cNMF cluster and puts it in a list called `plot_genes`.\n* **Lines 184-185:** Creates a dotplot using Scanpy, showing the expression levels of the top genes across the cNMF clusters.\n\n**In Summary:**\n\nThis script is a detailed example of using `omicverse` and other single-cell analysis tools to perform the following:\n\n1. **Load and preprocess single-cell data.**\n2. **Reduce the dimensionality of the data using PCA.**\n3. **Visualize cell clusters using UMAP.**\n4. 
**Apply cNMF to identify gene expression programs.**\n5. **Visualize the topic distance matrix and clustering results.**\n6. **Evaluate the local density of cNMF components.**\n7. **Use a random forest classifier for each identified cluster.**\n8. **Visualize cNMF component scores on the UMAP embedding.**\n9. **Visualize top marker genes using a dotplot.**\n\nThe script is designed for researchers to explore gene expression patterns and cell heterogeneity using cNMF, which can reveal more nuanced patterns than traditional clustering approaches. It provides a comprehensive workflow from data loading to visualization and evaluation.", + "file": "t_cnmf_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_cytotrace_annotated.json b/OvStudent/Converted_Jsons/t_cytotrace_annotated.json new file mode 100644 index 00000000..ffbadf07 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_cytotrace_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs single-cell trajectory inference using the `omicverse` and `scvelo` libraries. Here's a breakdown of its functionality and structure:\n\n**Functionality:**\n\n1. **Import Libraries:**\n * It imports the `omicverse` library (likely a single-cell analysis toolkit) as `ov`.\n * It imports the `scvelo` library (used for RNA velocity analysis) as `scv`.\n * These libraries provide functions for data loading, preprocessing, dimensionality reduction, and trajectory inference.\n\n2. **Load Dataset:**\n * It loads the `dentategyrus` dataset, a pre-processed single-cell RNA-seq dataset from `scvelo`, into an `AnnData` object named `adata`. `AnnData` is a common data structure used in single-cell analysis.\n\n3. **Display Initial Data:**\n * It displays the initial `AnnData` object `adata`, showing information about the cells, genes, and associated metadata.\n\n4. 
**Preprocess Data:**\n * It preprocesses the `AnnData` object using `omicverse`'s `pp.preprocess` function.\n * It applies shift-log normalization and ranks genes by Pearson residuals (not Pearson correlation) for feature selection.\n * It selects the top 2000 highly variable genes (HVGs) which are often used in dimensionality reduction and trajectory inference.\n\n5. **Display Preprocessed Data:**\n * It displays the `AnnData` object after preprocessing, showing any changes in data structure and contents.\n\n6. **Run CytoTRACE2:**\n * It runs the `cytotrace2` algorithm, a method for inferring cell potency from single-cell data, on the preprocessed `adata` object.\n * The algorithm is configured with specific parameters:\n * `use_model_dir`: Specifies the directory containing pre-trained models used by CytoTRACE2.\n * `species`: Sets the species of origin for the data, which may affect the pre-trained models.\n * `batch_size`, `smooth_batch_size`: Control the batching of cells during calculations, likely for performance and memory reasons.\n * `disable_parallelization`, `max_cores`: Control whether and how parallel processing should be used.\n * `max_pcs`: Sets the maximum number of principal components (dimensionality reduction) for the algorithm.\n * `seed`: Sets a random seed for reproducibility.\n * `output_dir`: Sets the location where CytoTRACE2 results are stored.\n * The results are stored in a variable named `results`.\n\n7. 
**Generate UMAP Plots:**\n * It uses `omicverse`'s `utils.embedding` function to generate Uniform Manifold Approximation and Projection (UMAP) plots.\n * **First Plot:**\n * It plots the UMAP embedding with cells colored by their `clusters` and the `CytoTRACE2_Score` which indicates cell potency.\n * **Second Plot:**\n * It plots the UMAP embedding with cells colored by the `CytoTRACE2_Potency` and `CytoTRACE2_Relative`, potentially more refined cell potency measures.\n * Both plots use the 'small' frame, the 'Reds' colormap, and set the spacing between the plots.\n\n**Structure:**\n\nThe script is structured sequentially:\n\n1. **Import necessary libraries.**\n2. **Load the dataset.**\n3. **Preprocess the data.**\n4. **Run the CytoTRACE2 algorithm.**\n5. **Visualize the results with UMAP plots.**\n\n**In summary,** this script takes a single-cell dataset, preprocesses it, infers cell potency scores using CytoTRACE2, and visualizes the results on a UMAP embedding. This workflow is common in single-cell biology for exploring cell trajectories and developmental hierarchies.", + "file": "t_cytotrace_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_deg_annotated.json b/OvStudent/Converted_Jsons/t_deg_annotated.json new file mode 100644 index 00000000..ae9a628d --- /dev/null +++ b/OvStudent/Converted_Jsons/t_deg_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs differential gene expression (DEG) analysis and gene set enrichment analysis (GSEA) on bulk RNA-seq data, utilizing the `omicverse` library along with `scanpy` and `matplotlib`. Here's a breakdown of its functionality and structure:\n\n**1. 
Library Imports & Setup (Lines 1-5):**\n\n - **`import omicverse as ov`**: Imports the `omicverse` library, a toolbox for omics data analysis, aliased as `ov`.\n - **`import scanpy as sc`**: Imports the `scanpy` library, often used for single-cell data analysis but can be helpful for general data handling, aliased as `sc`.\n - **`import matplotlib.pyplot as plt`**: Imports the `matplotlib.pyplot` library for generating plots, aliased as `plt`.\n - **`ov.plot_set()`**: Sets plotting styles using the `omicverse` library. This likely configures default colors, fonts, etc. for consistent visualizations.\n\n**2. Data Loading and Preprocessing (Lines 7-15):**\n\n - **`ov.utils.download_geneid_annotation_pair()`**: Downloads annotation files to map gene IDs (likely between different ID formats). This file will be later used for ID mapping.\n - **`data = ov.read('data/counts.txt', index_col=0, header=1)`**: Reads the gene count data from the file `data/counts.txt`. The `index_col=0` parameter makes the first column the index, and `header=1` sets the second row as the header.\n - **`data.columns = [i.split('/')[-1].replace('.bam','') for i in data.columns]`**: Renames the columns (samples) by extracting the filename from the full path and removing the `.bam` suffix. This assumes the original columns had paths in them such as `data/sample1.bam`.\n - **`data.head()`**: Displays the first few rows of the data to inspect the loaded counts.\n - **`data = ov.bulk.Matrix_ID_mapping(data, 'genesets/pair_GRCm39.tsv')`**: Maps the gene IDs in the count data to another type of ID using the `pair_GRCm39.tsv` file.\n - **`data.head()`**: Displays the head of the data after the ID mapping to see if the mapping was successful.\n\n**3. 
Differential Gene Expression (DEG) Analysis (Lines 17-48):**\n\n - **`dds = ov.bulk.pyDEG(data)`**: Creates a `DEG` object, a core element in `omicverse` for carrying out differential expression analysis.\n - **`dds.drop_duplicates_index()`**: Removes any duplicate gene IDs which can cause problems.\n - **`dds.normalize()`**: Normalizes the gene expression data, for example by using methods such as TMM, RLE or CPM depending on the settings.\n - **`treatment_groups = ['4-3', '4-4']`**: Defines the sample names for the treatment group for the differential expression.\n - **`control_groups = ['1--1', '1--2']`**: Defines the sample names for the control group for the differential expression.\n - **`result = dds.deg_analysis(treatment_groups, control_groups, method='ttest')`**: Performs DEG analysis using a t-test comparing treatment and control samples, this may vary based on the settings in the DEGs.\n - **`result.head()`**: Displays the first few rows of the result DataFrame containing the DEG results.\n - **`print(result.shape)`**: Prints the dimensions of the results.\n - **`result = result.loc[result['log2(BaseMean)'] > 1]`**: Filters the DEG results, keeping only genes with a base mean greater than 1.\n - **`print(result.shape)`**: Prints the dimensions of the filtered DEG results.\n - **`dds.foldchange_set(fc_threshold=-1, pval_threshold=0.05, logp_max=6)`**: Sets thresholds for fold change, p-value, and max log p-value for DEG results filtering. 
Note that an `fc_threshold` of -1 is not a fixed log2 fold-change cutoff of 1: in `omicverse` it is a sentinel value that asks `foldchange_set` to derive the fold-change cutoff automatically from the distribution of log2 fold changes.\n - **`dds.plot_volcano(title='DEG Analysis', figsize=(4, 4), plot_genes_num=8, plot_genes_fontsize=12)`**: Creates a volcano plot visualizing the log fold change against the negative log p-value of DEGs, labeling the 8 most significant genes.\n - **`dds.plot_boxplot(genes=['Ckap2', 'Lef1'], treatment_groups=treatment_groups, control_groups=control_groups, figsize=(2, 3), fontsize=12, legend_bbox=(2, 0.55))`**: Generates boxplots showing gene expression levels for genes 'Ckap2' and 'Lef1' in the treatment and control groups.\n - **`dds.plot_boxplot(genes=['Ckap2'], treatment_groups=treatment_groups, control_groups=control_groups, figsize=(2, 3), fontsize=12, legend_bbox=(2, 0.55))`**: Generates a boxplot showing the gene expression level for gene 'Ckap2' in the treatment and control groups.\n\n**4. Gene Set Enrichment Analysis (GSEA) (Lines 50-84):**\n\n - **`ov.utils.download_pathway_database()`**: Downloads the pathway databases used by `omicverse`.\n - **`pathway_dict = ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt', organism='Mouse')`**: Prepares a pathway dictionary using `omicverse` from a gene set file (in this case, WikiPathways for mouse).\n - **`deg_genes = dds.result.loc[dds.result['sig'] != 'normal'].index.tolist()`**: Extracts a list of differentially expressed gene names based on the significance results from DEG analysis.\n - **`enr = ov.bulk.geneset_enrichment(gene_list=deg_genes, pathways_dict=pathway_dict, pvalue_type='auto', organism='mouse')`**: Performs an over-representation enrichment test to check whether differentially expressed genes are overrepresented in certain pathways.\n - **`ov.bulk.geneset_plot(enr, figsize=(2, 5), fig_title='Wiki Pathway enrichment', cax_loc=[2, 0.45, 0.5, 0.02], bbox_to_anchor_used=(-0.25, -13), node_diameter=10, custom_ticks=[5, 7], text_knock=3, cmap='Reds')`**: Creates a plot for the enrichment results 
for the pathways.\n - **GO Enrichment analysis:**\n - These lines (66-84) perform a similar GSEA analysis as the previous steps, but instead of WikiPathways it uses GO (Gene Ontology) terms related to Biological Process (BP), Molecular Function (MF), and Cellular Component (CC) separately.\n - **`enr_dict = {'BP': enr_go_bp, 'MF': enr_go_mf, 'CC': enr_go_cc}`**: Stores the enrichment results for GO terms in a dictionary.\n - **`colors_dict = {'BP': ov.pl.red_color[1], 'MF': ov.pl.green_color[1], 'CC': ov.pl.blue_color[1]}`**: Defines color mapping for the GO categories for visualization.\n\n**5. Multi-GSEA Plotting (Lines 91-162):**\n\n - **`ov.bulk.geneset_plot_multi(enr_dict, colors_dict, num=3, figsize=(2, 5), text_knock=3, fontsize=8, cmap='Reds')`**: Plots multiple GSEA results (from the GO terms) in a single plot for comparison using an `omicverse` function.\n - **`def geneset_plot_multi(...)`**: Defines a custom function `geneset_plot_multi` to visualize the results of multiple gene set enrichment analyses, utilizing the `PyComplexHeatmap` library for complex plotting. It creates a dot-clustermap of enrichment terms across multiple GO categories, providing detailed annotations, labels, and customizations. This function replaces the previous one by not using the `omicverse` function.\n\n**In Summary:**\n\nThis script takes RNA-seq count data, performs essential preprocessing, identifies differentially expressed genes, and then carries out gene set enrichment analysis using pathway and GO term databases. It generates various plots to aid in the interpretation of the results. 
This script uses the `omicverse` library as its main framework.\n\n**Key Functionalities:**\n\n* **Data Loading and Preprocessing:** Reading count data, renaming columns, mapping gene IDs.\n* **DEG Analysis:** Identifying differentially expressed genes using t-tests.\n* **Data Filtering:** Filtering results based on the average expression and significance.\n* **Visualization:** Generating volcano plots and boxplots for DEGs.\n* **GSEA:** Determining pathways and GO terms enriched among differentially expressed genes.\n* **Multi-GSEA visualization:** Generating dot-clustermaps to compare enriched pathways or GO terms across different categories.\n\nThis script effectively leverages the `omicverse` library to streamline complex omics data analysis workflows. The final multi-GSEA visualization is custom-made for a specific use case by using `PyComplexHeatmap` instead of the `omicverse` visualization function.", + "file": "t_deg_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_deseq2_annotated.json b/OvStudent/Converted_Jsons/t_deseq2_annotated.json new file mode 100644 index 00000000..96d93b19 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_deseq2_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script utilizes the `omicverse` library to perform a comprehensive RNA-seq analysis, encompassing data loading, preprocessing, differential gene expression analysis, and gene set enrichment analysis. Here's a breakdown of its functionality and structure:\n\n**1. Setup and Data Loading:**\n\n* **`import omicverse as ov`:** Imports the `omicverse` library and assigns it the alias `ov` for easier access.\n* **`ov.utils.ov_plot_set()`:** Sets a predefined plotting style for `omicverse`, ensuring consistent visualization throughout the analysis.\n* **`data = ov.utils.read(...)`:** Reads RNA-seq count data from a specified URL using `ov.utils.read`. 
The function loads the data into a pandas DataFrame, using the first column as the index and the second row as the header.\n* **`data.columns = [...]`:** Cleans up the column names of the DataFrame by extracting the sample name from the file paths and removing the `.bam` extension.\n* **`data.head()`:** Displays the first few rows of the loaded DataFrame to inspect the data.\n\n**2. Gene ID Mapping:**\n\n* **`ov.utils.download_geneid_annotation_pair()`:** Downloads a file containing gene ID annotation pairs.\n* **`data = ov.bulk.Matrix_ID_mapping(data, 'genesets/pair_GRCm39.tsv')`:** Uses the downloaded annotation file to map the original gene IDs in the DataFrame to a more standardized format. This allows for downstream pathway analysis.\n* **`data.head()`:** Displays the first few rows of the DataFrame again to show the effects of the gene ID mapping.\n\n**3. Differential Gene Expression (DEG) Analysis:**\n\n* **`dds = ov.bulk.pyDEG(data)`:** Initializes a `pyDEG` object from the DataFrame. This object will handle differential expression analysis.\n* **`dds.drop_duplicates_index()`:** Removes any duplicate entries from the index of the `pyDEG` object, ensuring data integrity.\n* **`print('... drop_duplicates_index success')`:** Informs the user that the duplicate index removal was successful.\n* **`treatment_groups = [...]` & `control_groups = [...]`:** Defines lists of sample names corresponding to the treatment and control groups for differential expression comparison.\n* **`result = dds.deg_analysis(...)`:** Performs differential expression analysis using the `DEseq2` method, comparing the treatment and control groups. The results are stored in a DataFrame called `result`.\n* **`print(result.shape)`:** Prints the dimensions (rows and columns) of the `result` DataFrame.\n* **`result = result.loc[result['log2(BaseMean)']>1]`:** Filters the results to include only genes with a log2(BaseMean) greater than 1. 
This is a standard step to focus on genes with sufficient expression levels.\n* **`print(result.shape)`:** Prints the dimensions of the filtered `result` DataFrame, to see the effect of the filtering.\n* **`dds.foldchange_set(...)`:** Sets fold change and p-value thresholds for determining significant genes, although the thresholds are not used immediately here but are stored in the `dds` object for future use in visualizations.\n* **`dds.plot_volcano(...)`:** Generates a volcano plot, visualizing the results of the differential expression analysis. It shows the log2 fold change of the genes against their p-values.\n* **`dds.plot_boxplot(...)`:** Generates boxplots of expression levels for specified genes (`Ckap2`, `Lef1` and then `Ckap2`) across the treatment and control groups.\n* **Note:** There is a repeated boxplot of 'Ckap2', which is most likely a mistake or an incremental testing of functionality.\n\n**4. Gene Set Enrichment Analysis (GSEA):**\n\n* **`ov.utils.download_pathway_database()`:** Downloads a pathway database, which contains collections of genes associated with specific pathways.\n* **`pathway_dict = ov.utils.geneset_prepare(...)`:** Parses the downloaded pathway database and prepares a gene set dictionary, where each key is the name of a pathway and each value is the list of associated genes.\n* **`rnk = dds.ranking2gsea()`:** Converts the differential expression results into a ranked gene list, which is used as input for GSEA.\n* **`gsea_obj = ov.bulk.pyGSEA(rnk, pathway_dict)`:** Initializes a `pyGSEA` object, which will perform the gene set enrichment analysis.\n* **`enrich_res = gsea_obj.enrichment()`:** Performs the GSEA, identifying which pathways are significantly enriched in the differentially expressed genes.\n* **`gsea_obj.enrich_res.head()`:** Displays the first few rows of the enrichment results to inspect the results.\n* **`gsea_obj.plot_enrichment(...)`:** Generates a bar plot of the enriched pathways.\n* 
**`gsea_obj.enrich_res.index[:5]`:** Displays the first 5 indices (names of the most enriched pathways) from the enrichment results.\n* **`fig = gsea_obj.plot_gsea(...)`:** Generates a GSEA plot for a specific pathway, showing how its gene members are distributed along the ranked list of differentially expressed genes.\n\n**Overall, this script performs the following key steps:**\n\n1. **Data Acquisition & Preprocessing:** Loads count data, cleans column names, and maps gene IDs.\n2. **Differential Expression Analysis:** Uses DESeq2 to identify genes that are differentially expressed between treatment and control groups.\n3. **Visualization:** Generates volcano plots and box plots to visualize DEG results.\n4. **Gene Set Enrichment Analysis:** Performs GSEA to identify significantly enriched pathways based on the DEG results.\n5. **Visualization of GSEA results:** Generates bar plots and individual GSEA plots to visualize GSEA results.\n\nThis script is designed for researchers working with RNA-seq data. It leverages the `omicverse` library to simplify the steps of loading, preprocessing, analyzing, and visualizing the data, making the analysis more efficient and easier to interpret. The use of multiple visualizations facilitates the interpretation of the complex datasets.", + "file": "t_deseq2_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_gptanno_annotated.json b/OvStudent/Converted_Jsons/t_gptanno_annotated.json new file mode 100644 index 00000000..3d3232da --- /dev/null +++ b/OvStudent/Converted_Jsons/t_gptanno_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs single-cell RNA sequencing (scRNA-seq) data analysis, including preprocessing, clustering, visualization, and cell type annotation using both marker genes and large language models (LLMs). Here's a breakdown of the script's functionality and structure:\n\n**1. 
Setup and Library Imports (Lines 1-5):**\n\n* **`import omicverse as ov`**: Imports the `omicverse` library (presumably a specialized scRNA-seq analysis library) and assigns it the alias `ov`.\n* **`print(f'omicverse version:{ov.__version__}')`**: Prints the version of the `omicverse` library being used.\n* **`import scanpy as sc`**: Imports the `scanpy` library, a popular scRNA-seq analysis tool, and assigns it the alias `sc`.\n* **`print(f'scanpy version:{sc.__version__}')`**: Prints the version of the `scanpy` library.\n* **`ov.ov_plot_set()`**: Sets a specific plotting style using the `omicverse` library, ensuring consistent visual outputs.\n\n**2. Data Loading (Lines 10-13):**\n\n* **`adata = sc.read_10x_mtx(...)`**: Reads data from a 10x Genomics output matrix (`.mtx`) file into an `AnnData` object (a data structure used by `scanpy` and `omicverse`).\n * **`'data/filtered_gene_bc_matrices/hg19/'`**: Specifies the directory containing the 10x matrix file.\n * **`var_names='gene_symbols'`**: Sets gene symbols as the names of the variables (genes).\n * **`cache=True`**: Enables caching the loaded data for faster subsequent reads.\n\n**3. Data Preprocessing (Lines 17-34):**\n\n* **`adata = ov.pp.qc(adata, tresh={...})`**: Performs quality control (QC) filtering:\n * Filters out cells based on: mitochondrial percentage (`mito_perc` less than 0.05), number of UMIs (`nUMIs` greater than 500), and the number of detected genes (`detected_genes` greater than 250).\n* **`adata = ov.pp.preprocess(adata, mode='shiftlog|pearson', n_HVGs=2000)`**: Applies preprocessing steps:\n * Likely includes normalization and transformation (e.g. shiftlog).\n * Selects the top 2000 highly variable genes (HVGs) using a Pearson method.\n* **`adata.raw = adata`**: Stores the raw (preprocessed) data into the `.raw` attribute. 
This is useful to recover the raw expression data later.\n* **`adata = adata[:, adata.var.highly_variable_features]`**: Filters the AnnData object to keep only the HVGs.\n* **`ov.pp.scale(adata)`**: Scales the gene expression data to zero mean and unit variance for each gene.\n* **`ov.pp.pca(adata, layer='scaled', n_pcs=50)`**: Performs Principal Component Analysis (PCA) on the scaled data and reduces to 50 principal components (PCs).\n* **`sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, use_rep='scaled|original|X_pca')`**: Constructs a neighborhood graph from the PCA embedding of the scaled data, which is stored under the key `scaled|original|X_pca`.\n\n**4. Clustering and Gene Ranking (Lines 37-42):**\n\n* **`sc.tl.leiden(adata)`**: Performs Leiden clustering, identifying clusters of cells based on gene expression similarity.\n* **`sc.tl.dendrogram(adata, 'leiden', use_rep='scaled|original|X_pca')`**: Computes a dendrogram based on the Leiden clusters, which can show the similarity relationships among clusters.\n* **`sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', method='wilcoxon', use_raw=False)`**: Ranks genes based on differential expression between Leiden clusters using the Wilcoxon test, does not use the raw data. This identifies genes that are characteristic of each cluster.\n\n**5. Embedding and Visualization (Lines 45-54):**\n\n* **`adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])`**: Computes a Minimum-Distortion Embedding (MDE) and stores it in `adata.obsm` for visualization.\n* **`adata`**: Displays the contents of the AnnData object.\n* **`ov.pl.embedding(...)`**: Generates an embedding plot:\n * **`basis='X_mde'`**: Uses MDE for the 2D coordinates.\n * **`color=['leiden']`**: Colors the plot according to the Leiden clusters.\n * Other parameters configure the appearance of the plot (legend, frame, color palette).\n\n**6. 
Cell Type Annotation using LLMs (Lines 56-124):**\n\n* **Marker Gene Definition (Lines 57-58, 93-94):** Defines a dictionary of marker genes (`all_markers`) associated with different clusters.\n* **API Key Setup (Lines 60, 72, 96, 102, 108, 114):** Sets the API key for accessing the LLMs using the `os` library and environment variables.\n* **LLM-Based Cell Type Prediction (`ov.single.gptcelltype` and `ov.single.gptcelltype_local`) (Lines 61-63, 73-75, 97-99, 103-105, 109-111, 115-117, 122-124):**\n * These functions use LLMs (like `qwen-plus`, `gpt-4o`, `moonshot-v1-8k`) to predict cell types based on the input marker genes.\n * The script tests several different LLMs from different providers (OpenAI, Qwen, Kimi) for the same marker gene input.\n * `tissuename`, `speciename`, and `topgenenumber` parameters provide context and control.\n * `ov.single.gptcelltype` likely calls a remote API endpoint for the LLM.\n * `ov.single.gptcelltype_local` uses a locally hosted LLM at the defined model path.\n* **Marker Gene Generation (Lines 66-68):**\n * **`all_markers = ov.single.get_celltype_marker(...)`**: Identifies marker genes from the `adata` object, based on the Leiden clusters, the ranked genes, a fold change threshold, and the number of top genes.\n* **Result Processing (Lines 78-81):**\n * Extracts the predicted cell type names from the results returned by the LLM.\n* **Annotation Integration (Line 83):**\n * **`adata.obs['gpt_celltype'] = adata.obs['leiden'].map(new_result).astype('category')`**: Adds the predicted cell types as a new category (`gpt_celltype`) to the AnnData object.\n* **Cell Type Visualization (Lines 85-91):**\n * Generates another embedding plot with both Leiden clusters and the predicted `gpt_celltype` annotations.\n\n**In summary, this script:**\n\n1. Loads and preprocesses scRNA-seq data.\n2. Performs clustering and ranks genes.\n3. Visually represents the data using embeddings.\n4. 
Leverages LLMs to predict cell types based on both user-defined marker genes and marker genes automatically identified from the clustering analysis.\n5. Evaluates multiple LLMs from multiple providers (OpenAI, Qwen, Kimi), including a local model.\n6. Integrates predicted cell types as an annotation in the AnnData object and visualizes them on the same embedding as Leiden clustering.\n\n**Key Features:**\n\n* Uses both `omicverse` and `scanpy` for single-cell analysis, leveraging the functionalities of both libraries.\n* Demonstrates a workflow incorporating LLMs for cell type annotation, utilizing both marker gene sets and the ranked genes from clustering.\n* Explores multiple large language models with different providers, highlighting the flexibility of using LLMs for cell annotation.\n* Integrates the LLM predictions into the AnnData object.\n* Includes extensive visualizations throughout the analysis, demonstrating different aspects of the data.\n\nThis script is well-commented, making it easy to understand the purpose of each step. It provides a good example of how to perform single-cell RNA sequencing analysis and integrate it with the power of large language models for cell type annotation.", + "file": "t_gptanno_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_metacells_annotated.json b/OvStudent/Converted_Jsons/t_metacells_annotated.json new file mode 100644 index 00000000..bfb9a78b --- /dev/null +++ b/OvStudent/Converted_Jsons/t_metacells_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell analysis workflow, primarily using the `omicverse` (ov), `scanpy` (sc), and `scvelo` (scv) libraries. It focuses on generating \"meta-cells\" using the MetaCell algorithm, evaluating their quality, and visualizing the results alongside the original cells. Here's a breakdown of the functionality and structure:\n\n**1. 
Initialization and Data Loading:**\n\n* **Imports:** Imports necessary libraries: `omicverse` (as `ov` for single-cell analysis), `scanpy` (as `sc` for single-cell analysis), `scvelo` (as `scv` for RNA velocity analysis), `seaborn` (as `sns` for statistical visualizations), and `matplotlib.pyplot` (as `plt` for plotting).\n* **Plot Settings:** Sets default plotting parameters using `ov.plot_set()`.\n* **Data Loading:** Loads the pancreas dataset using `scv.datasets.pancreas()` and stores it in `adata`, an `AnnData` object. This dataset likely contains gene expression data for cells from the pancreas.\n* **AnnData Display:** Shows the loaded `AnnData` object.\n\n**2. Quality Control and Preprocessing:**\n\n* **Quality Control (QC):** Performs QC using `ov.pp.qc()`. This step filters cells based on:\n * Percentage of mitochondrial reads (`mito_perc`): removes cells with > 20% mitochondrial reads, indicating potential cell damage or stress.\n * Number of UMIs (`nUMIs`): removes cells with fewer than 500 UMIs (Unique Molecular Identifiers), indicating low sequencing depth.\n * Number of detected genes (`detected_genes`): removes cells with fewer than 250 detected genes.\n * Mitochondrial gene filtering (`mt_startswith='mt-'`): removes genes starting with 'mt-', likely mitochondrial genes.\n* **Preprocessing:** Preprocesses the data using `ov.pp.preprocess()` with:\n * `mode='shiftlog|pearson'`: Applies shiftlog normalization and Pearson residual normalization.\n * `n_HVGs=2000`: Identifies and selects the top 2000 highly variable genes (HVGs). This reduces the dimensionality of the data and focuses on the most informative genes.\n* **Storing Raw Data:** Stores the preprocessed `adata` as the `raw` attribute, which is good practice for later use with preprocessed data.\n* **HVG Filtering:** Filters the `adata` to keep only the highly variable genes previously identified.\n\n**3. 
MetaCell Generation and Training:**\n\n* **Data Scaling:** Scales the expression data in `adata.X` using `ov.pp.scale()`, which ensures all features have a similar range of values.\n* **PCA:** Performs Principal Component Analysis (PCA) using `ov.pp.pca()`, reducing data dimensionality to 50 principal components (`n_pcs=50`). It uses the scaled data as input.\n* **MetaCell Object Creation:** Creates a `MetaCell` object using `ov.single.MetaCell()`. It uses the scaled data, original data, and PCA components (`use_rep='scaled|original|X_pca'`), and tries to utilize a GPU for faster processing (`use_gpu='cuda:0'`). No specific number of metacells is specified initially (`n_metacells=None`).\n* **MetaCell Initialization and Training:**\n * Initializes the archetypes for the MetaCell model using `meta_obj.initialize_archetypes()`.\n * Trains the MetaCell model using `meta_obj.train()` for a minimum of 10 and a maximum of 50 iterations.\n* **Saving and Loading Model:** The trained MetaCell model is saved using `meta_obj.save()` and then reloaded using `meta_obj.load()`.\n\n**4. MetaCell Analysis and Evaluation:**\n\n* **Prediction of Cell Assignments:** Predicts cell assignments using the trained MetaCell model with `meta_obj.predicted()`. 
This step uses:\n * `method='soft'`: Assigns soft memberships to cells across metacells.\n * `celltype_label='clusters'`: Uses cluster labels available in the AnnData object for cell type information.\n * `summarize_layer='lognorm'`: Summarizes log normalized data for calculating cell to metacell relationship.\n* **Cell Type Purity Calculation:** Calculates the cell type purity of each metacell based on the provided cluster labels using `meta_obj.compute_celltype_purity()`.\n* **Separation and Compactness Score Calculation:** Calculates separation and compactness scores for the metacells using `meta_obj.separation()` and `meta_obj.compactness()` respectively, based on scaled, original and pca representations.\n* **Visualization of Evaluation Metrics:**\n * Creates a figure with three subplots using `plt.subplots()`.\n * Generates boxplots using `sns.boxplot()` to visualize:\n * SEACell purity scores.\n * MetaCell compactness scores.\n * MetaCell separation scores.\n * Applies customization to the plot, including removal of spines, tight layouts, and specific title.\n\n**5. Embedding and MetaCell Visualization:**\n\n* **Embedding Plot:** Generates a UMAP (Uniform Manifold Approximation and Projection) embedding plot using `ov.pl.embedding()`. The plot is:\n * Colored by cluster labels (`color=['clusters']`).\n * Has a small frame and a title of \"Meta cells\".\n * Has customized legend font settings, marker size, transparency, and outline.\n* **MetaCell Overlay:** Overlays the predicted metacells on the same plot using `ov.single.plot_metacells()` with a specific color.\n\n**6. Comparison of MetaCell and Original Cell Embeddings**\n\n* **Mean S Score Value:** Mean S score values are extracted from the meta cells using the function `ov.single.get_obs_value`, and stored to the output data object for further analysis. 
\n* **Original Data Preprocessing and Dimensionality Reduction:**\n * Reimports `scanpy` as `sc`.\n * Copy of the AnnData object to the raw attribute.\n * Highly Variable Gene calculation using Scanpy, using the top 2000 genes, and filtering data to only those genes.\n * Data Scaling, PCA (30 Principal Components) using `ov` functions.\n * Graph calculation using `ov.pp.neighbors`\n * UMAP dimensionality reduction using `ov.pp.umap()`\n* **Cell Type Annotation and Visualization:**\n * Converts 'celltype' column in the AnnData object to a categorical type\n * Reorders categories in celltype to match clusters from `adata`.\n * Copies color palette from 'clusters' to 'celltype'\n * Creates an embedding plot using `ov.pl.embedding()`:\n * Colored by 'celltype' (categorical) and 'S_score' (numerical).\n * Specific parameters like frame, colormap, and spacing are added to control layout.\n\n**Summary of the Script's Functionality:**\n\nThe script implements a MetaCell analysis workflow:\n\n1. **Loads single-cell RNA-seq data.**\n2. **Performs quality control and preprocessing steps (normalization, feature selection).**\n3. **Trains a MetaCell model to generate meta-cells from single cells.**\n4. **Evaluates the quality of the resulting meta-cells.**\n5. **Visualizes the meta-cells and the original cells in embedding space, incorporating annotation.**\n6. **Compares the original cell embeddings with meta cell output data object, and displays gene expression scores.**\n\nThis script helps to identify and characterize meta-cells, which might help reduce noise, uncover underlying patterns, and improve the understanding of single-cell data, by grouping similar cells into meta cells. 
The script also evaluates the performance of this cell aggregation, and visualizes the embeddings of the cells, along with annotation and S-score of the meta cells.", + "file": "t_metacells_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_metatime_annotated.json b/OvStudent/Converted_Jsons/t_metatime_annotated.json new file mode 100644 index 00000000..129fedf4 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_metatime_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a series of single-cell analysis steps using the `omicverse` and `scanpy` libraries. Here's a breakdown of its functionality and structure:\n\n**Overall Purpose:**\n\nThe script takes a pre-processed single-cell RNA-seq dataset (represented as an AnnData object) and performs the following:\n\n1. **Dimensionality Reduction:** Reduces the dimensionality of the data using Minimum-Distortion Embedding (MDE).\n2. **Overclustering:** Performs overclustering to identify potentially more refined cell groups.\n3. **MetaTiME Categorization:** Predicts cell categories based on a pre-trained MetaTiME model.\n4. **Visualization:** Generates embedding plots to visualize the data and cell categories.\n\n**Detailed Line-by-Line Breakdown:**\n\n1. `import omicverse as ov`: **Imports the `omicverse` library and assigns it the alias `ov`.** `omicverse` likely provides functions for single-cell data analysis, specifically focused on the TiME (Tumor Immune MicroEnvironment).\n2. `ov.utils.ov_plot_set()`: **Sets up plotting configurations** provided by the `omicverse` library, likely configuring things like fonts, colors, and plot styles.\n3. `import scanpy as sc`: **Imports the `scanpy` library and assigns it the alias `sc`.** `scanpy` is a popular Python library for single-cell data analysis, providing functionalities for data loading, pre-processing, visualization, and more.\n4. 
`adata=sc.read('TiME_adata_scvi.h5ad')`: **Reads an AnnData object from a file named `TiME_adata_scvi.h5ad` and stores it in the variable `adata`.** AnnData is a data structure used by `scanpy` to store single-cell data (cell expression, cell metadata, etc.). The file extension suggests it's an already processed dataset that likely used scVI for integration.\n5. `adata`: **Displays the contents of the `adata` object.** This provides a summary of the data, including the number of cells, genes, and available metadata.\n6. `sc.pp.neighbors(adata, use_rep=\"X_scVI\")`: **Computes the neighborhood graph for the cells,** based on the representation stored in `adata.obsm[\"X_scVI\"]`. `X_scVI` typically represents a lower-dimensional embedding learned using a model like scVI. This step establishes cell-cell relationships based on their similarity.\n7. `adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"X_scVI\"])`: **Calculates the Minimum-Distortion Embedding (MDE) using the `X_scVI` representation and stores it in the `adata.obsm[\"X_mde\"]` field.** MDE is a dimensionality reduction technique that places points so as to minimize the distortion of the pairwise relationships between data points.\n8. `sc.pl.embedding(...)`: **Generates and displays a scatter plot of the MDE embedding.**\n * `adata`: The AnnData object containing the data.\n * `basis=\"X_mde\"`: Specifies that `X_mde` should be used as the basis for plotting the embedding.\n * `color=[\"patient\"]`: Colors the points in the plot according to the `patient` column in `adata.obs`, allowing visualization of cell distribution across different patients.\n * `frameon=False`: Removes the frame (box) around the plot.\n * `ncols=1`: Specifies that there is only one column for subplots (in case we had multiple plots).\n9. 
`TiME_object=ov.single.MetaTiME(adata,mode='table')`: **Creates a `MetaTiME` object using the `adata` object and sets the mode to 'table'.** This step initializes a `MetaTiME` object, possibly preparing the data for further analysis related to the Tumor Immune MicroEnvironment.\n10. `TiME_object.overcluster(resolution=8,clustercol = 'overcluster')`: **Performs overclustering on the MetaTiME object with a resolution parameter of 8. The result is stored in the 'overcluster' column of the AnnData object.** Overclustering aims to identify smaller and more fine-grained subgroups of cells than existing cell type annotations.\n11. `TiME_object.predictTiME(save_obs_name='MetaTiME')`: **Predicts the MetaTiME categories for each cell and saves the predictions to the `MetaTiME` column in the `adata.obs` DataFrame.** This step uses a pre-trained model to classify cells into predefined TiME categories (e.g., different immune cell types or tumor cell states).\n12. `fig,ax=TiME_object.plot(cluster_key='MetaTiME',basis='X_mde',dpi=80)`: **Generates an embedding plot colored by the predicted `MetaTiME` category, using the `X_mde` embedding, and stores the figure and axes in variables `fig` and `ax`**. The `dpi` parameter sets the resolution of the plot. This visualization allows seeing the distribution of the predicted TiME categories in the lower dimensional space.\n13. `sc.pl.embedding(...)`: **Generates and displays another scatter plot of the MDE embedding.** This is similar to the first embedding plot, but it's colored differently.\n * `basis=\"X_mde\"`: Specifies that `X_mde` should be used as the basis for plotting the embedding.\n * `color=[\"Major_MetaTiME\"]`: Colors the points in the plot according to the `Major_MetaTiME` column in `adata.obs`. 
This allows to visualize cells colored by a coarser grouping of the predicted MetaTiME labels.\n * `frameon=False`: Removes the frame (box) around the plot.\n * `ncols=1`: Specifies that there is only one column for subplots.\n\n**In Summary:**\n\nThe script loads single-cell RNA-seq data, performs dimensionality reduction with MDE based on a learned `X_scVI` representation, performs overclustering, predicts `MetaTiME` categories, and visualizes the data and results in embedding plots colored by \"patient\", `MetaTiME`, and `Major_MetaTiME`. It utilizes the `omicverse` library for TiME specific functionalities and `scanpy` for standard single-cell analysis tasks. This script seems designed to explore the tumor immune microenvironment in the context of a specific dataset.", + "file": "t_metatime_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_mofa_annotated.json b/OvStudent/Converted_Jsons/t_mofa_annotated.json new file mode 100644 index 00000000..7e533c2d --- /dev/null +++ b/OvStudent/Converted_Jsons/t_mofa_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script uses the `omicverse` library along with `scanpy` to perform multi-omics factor analysis and visualization. It specifically focuses on using Multi-Omics Factor Analysis (MOFA) to integrate RNA-seq and ATAC-seq data, followed by visualization and analysis of the resulting factors in the context of cell types. Here's a breakdown of the functionality and structure:\n\n**Overall Structure:**\n\nThe script can be broadly divided into three main sections:\n\n1. **Data Loading and MOFA Model Training:** (Lines 1-11) This part focuses on importing data, preprocessing, and running MOFA.\n2. **Factor Loading and Analysis:** (Lines 13-27) Here, pre-trained MOFA model is loaded, and factors are extracted and incorporated with an existing RNA-seq dataset.\n3. 
**Visualization and Interpretation:** (Lines 28-59) This section involves generating various plots to understand the results of MOFA, including factor distributions, factor correlations, gene weights, and UMAP embedding.\n\n**Detailed Functionality:**\n\n1. **Import Libraries:**\n * `import omicverse as ov`: Imports the `omicverse` library, a package for multi-omics data analysis. It's aliased as `ov` for convenience.\n * `import scanpy as sc`: Imports the `scanpy` library, used for single-cell data analysis, especially for UMAP dimensionality reduction\n\n2. **Load Multi-omics Data (Lines 2-3):**\n * `rna = ov.utils.read('data/sample/rna_p_n_raw.h5ad')`: Loads an RNA-seq dataset from the file 'data/sample/rna_p_n_raw.h5ad' into a variable named `rna`. The '.h5ad' extension indicates this is likely an `AnnData` object, a common data format in single-cell genomics.\n * `atac = ov.utils.read('data/sample/atac_p_n_raw.h5ad')`: Similarly, loads an ATAC-seq dataset from 'data/sample/atac_p_n_raw.h5ad' into the `atac` variable. Both datasets are assumed to contain the same cells with both RNA and ATAC measurements for each cell.\n\n3. **Prepare Data for MOFA (Lines 7-8):**\n * `test_mofa = ov.single.pyMOFA(omics=[rna, atac], omics_name=['RNA', 'ATAC'])`: Creates a `pyMOFA` object from `omicverse` using the loaded RNA and ATAC datasets. It assigns names 'RNA' and 'ATAC' to the respective omics layers.\n\n4. **Run MOFA (Lines 10-11):**\n * `test_mofa.mofa_preprocess()`: Performs pre-processing of the data in preparation for the MOFA algorithm.\n * `test_mofa.mofa_run(outfile='models/brac_rna_atac.hdf5')`: Runs the MOFA algorithm. The result, the fitted model, is saved to 'models/brac_rna_atac.hdf5'.\n\n5. 
**Load Pre-trained MOFA Model (Lines 16,18):**\n * `rna=ov.utils.read('data/sample/rna_test.h5ad')`: Loads a different RNA-seq dataset for analysis (likely a processed version compared to Line 2).\n * `rna=ov.single.factor_exact(rna,hdf5_path='data/sample/MOFA_POS.hdf5')`: Loads a pre-trained MOFA model from 'data/sample/MOFA_POS.hdf5'. Crucially, this function applies the MOFA model to the loaded RNA-seq dataset `rna` in order to retrieve factor loading values for each cell in the `rna` object. These factor values are added to the `rna` object.\n\n6. **Factor Correlation (Line 21):**\n * `ov.single.factor_correlation(adata=rna, cluster='cell_type', factor_list=[1, 2, 3, 4, 5])`: Calculates and visualizes the correlation between the MOFA factors (1 to 5) and the cell types defined in the `rna` object using its \"cell_type\" metadata.\n\n7. **Extract Weights for a Specific Factor (Line 23):**\n * `ov.single.get_weights(hdf5_path='data/sample/MOFA_POS.hdf5',view='RNA',factor=1)`: Extracts and prints the weights of genes for the first factor from the RNA-seq data from a pre-trained MOFA model in `data/sample/MOFA_POS.hdf5`.\n\n8. **Analysis with `pyMOFAART` Object (Lines 25-37):**\n * `pymofa_obj = ov.single.pyMOFAART(model_path='data/sample/MOFA_POS.hdf5')`: Creates a `pyMOFAART` object to interact with a pre-trained MOFA model.\n * `pymofa_obj.get_factors(rna)`: Extracts the MOFA factor values for each cell in the RNA-seq dataset and adds them as a layer to the `rna` object.\n * `pymofa_obj.plot_r2()`: Plots the R-squared values, showing the amount of variance explained by each factor in the MOFA model.\n * `pymofa_obj.get_r2()`: Retrieves R-squared data.\n * `pymofa_obj.plot_cor(rna, 'cell_type')`: Generates a correlation plot between the factors and cell types.\n * `pymofa_obj.plot_factor(rna, 'cell_type', 'Epi', figsize=(3, 3), factor1=6, factor2=10)`: Visualizes the distribution of two specific factors (6 and 10) within a specific cell type (\"Epi\")\n\n9. 
**UMAP Visualization with Scanpy (Lines 39-51):**\n * `sc.pp.neighbors(rna)`: Compute nearest neighbors for cells.\n * `sc.tl.umap(rna)`: Calculate UMAP embedding.\n * `sc.pl.embedding(rna, basis=\"X_umap\", color=[\"factor6\", \"cell_type\"], frameon=False, ncols=2, show=False, cmap='Greens', vmin=0)`: Creates a UMAP embedding plot, coloring cells by the value of factor 6 and also by cell type using green color map.\n\n10. **Weight and Feature Visualization (Lines 54-59):**\n * `pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=6,factor2=10,)`: Visualizes weight distribution of genes for the given factor using density distribution plots\n * `pymofa_obj.plot_weights(view='RNA', factor=6, color='#5de25d', ascending=True)`: Displays the weights (contributions) of the genes for factor 6.\n * `pymofa_obj.plot_top_feature_heatmap(view='RNA')`: Generates a heatmap of top-weighted genes for each factor in the RNA-seq data.\n\n**Key Concepts:**\n\n* **Multi-Omics Factor Analysis (MOFA):** A statistical method that aims to identify underlying factors that drive variation across multiple omics datasets. This is valuable for identifying shared biology between different layers of data like RNA expression and chromatin accessibility.\n* **AnnData:** A data format from Scanpy, commonly used to store and organize single-cell data.\n* **Factor Loading:** A measure of how much a specific factor is associated with a given observation (cell).\n* **Factor Weight:** Measures the contribution of genes to a given factor.\n* **UMAP:** A dimensionality reduction technique used to visualize high-dimensional data in a low-dimensional space.\n\n**In Summary:**\n\nThis script is designed to:\n\n1. Integrate RNA-seq and ATAC-seq data using MOFA.\n2. Analyze the learned factors in relation to cell types and gene contributions.\n3. 
Visualize the results using plots like factor distributions, correlations, UMAP, weight distributions, and gene heatmaps.\n\nThe script effectively uses the `omicverse` library to handle the complexity of multi-omics integration and analysis, and leverages `scanpy` for visualization. The final plots and analysis help identify the underlying biological processes captured by the MOFA factors.", + "file": "t_mofa_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_mofa_glue_annotated.json b/OvStudent/Converted_Jsons/t_mofa_glue_annotated.json new file mode 100644 index 00000000..f4bc0e03 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_mofa_glue_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a multi-omics analysis using the `omicverse` library, focusing on integrating RNA and ATAC sequencing data. It leverages GLUE (Graph-Linked Unified Embedding) to find related cells across the two modalities, then builds and analyzes a multi-omics factor analysis (MOFA) model. Here's a breakdown of the functionality and structure:\n\n**Overall Workflow:**\n\n1. **Data Loading and Preparation:**\n * Loads RNA and ATAC data from H5AD files using `omicverse`.\n * Creates a `GLUE_pair` object to represent the coupled RNA-ATAC dataset.\n2. **GLUE Analysis:**\n * Calculates correlations between RNA and ATAC data within the `GLUE_pair`.\n * Finds neighboring cells between RNA and ATAC data based on GLUE results, producing `res_pair`.\n * Saves the pairing results to a CSV file.\n * Subsets and re-indexes the original RNA and ATAC data based on the paired cells.\n3. **MuData Creation:**\n * Creates a `MuData` object, encapsulating the paired RNA and ATAC data into one object.\n * Saves the `MuData` object to an H5MU file.\n4. 
**Data Filtering & Sampling:**\n * Extracts RNA and ATAC data from the `MuData` object.\n * Filters both data types to retain only highly variable features.\n * Re-indexes filtered RNA and ATAC based on GLUE results.\n * Subsamples 5000 random cells for comparison and saves the sample indices.\n5. **Adjusted Rand Index (ARI) Calculation:**\n * Calculates the adjusted Rand index (ARI) between cell types present in the sub-sampled RNA and ATAC data.\n * Calculates the ARI between cell types in the whole dataset.\n * Prints the two ARI values for comparison.\n6. **MOFA Model Creation and Training:**\n * Creates a `pyMOFA` object to represent the multi-omics model using the processed RNA and ATAC data.\n * Preprocesses data for MOFA.\n * Trains the MOFA model and saves it to a file.\n7. **MOFA Model Analysis & Visualization:**\n * Loads the saved MOFA model using `pyMOFAART`.\n * Extracts factor values for the RNA data.\n * Plots the R-squared values for each factor.\n * Gets the R-squared values from the MOFA model.\n * Plots the correlation of factors with cell types.\n * Gets the correlation of factors with cell types.\n * Visualizes the association of specific factors with a specific cell type.\n * Computes a Minimum-Distortion Embedding (MDE) for data visualization.\n * Plots the embedding colored by factors and cell types.\n * Plots gene weights for factors.\n * Plots the weights for the specified factor in RNA data.\n * Plots a heatmap of the top features.\n\n**Line-by-Line Structure:**\n\n* **Lines 1-2:** Imports the `omicverse` library and sets plotting parameters.\n* **Lines 4-5:** Loads RNA and ATAC data from h5ad files.\n* **Lines 7-8:** Creates a `GLUE_pair` object and calculates correlations.\n* **Lines 10-11:** Finds neighboring cells and saves results to a CSV.\n* **Lines 13-16:** Subsets and re-indexes data based on GLUE results.\n* **Lines 19-22:** Imports the `MuData` class and creates a `MuData` object, saving it as well.\n* **Lines 26-31:** Extracts RNA and 
ATAC data from `MuData`, filters for high variability, and re-indexes based on GLUE.\n* **Lines 33-39:** Samples random cells, calculates and prints ARI scores.\n* **Lines 42-45:** Creates a `pyMOFA` object, preprocesses data, runs MOFA, and saves the model.\n* **Lines 47-50:** Loads a pre-trained MOFA model and obtains factor values.\n* **Lines 52-60:** Visualizes and extracts information regarding MOFA model performance and factor-cell type correlations.\n* **Lines 62-65:** Imports MDE and scanpy, computes the neighborhood graph, and calculates the minimum-distortion embedding.\n* **Line 67:** Plots the embedding colored by factors and cell types.\n* **Lines 76-78:** Visualizes factor and gene weights.\n* **Line 81:** Plots a feature heatmap.\n\n**Key Functionalities and Concepts:**\n\n* **`omicverse`:** Provides tools for single-cell and multi-omics analysis, including data loading, integration (GLUE), and factor analysis (MOFA).\n* **`GLUE_pair`:** Represents a pair of omics data (RNA & ATAC), allowing for inter-modality analysis and finding related cells.\n* **`MuData`:** An object for holding multiple omics datasets, enabling efficient storage and analysis of multi-omics data.\n* **MOFA:** Multi-Omics Factor Analysis, a dimensionality reduction technique that identifies shared and unique factors across multiple omics datasets.\n* **Adjusted Rand Index (ARI):** Measures the similarity between two clusterings (cell types in this case).\n* **Minimum-Distortion Embedding (MDE):** A non-linear dimensionality reduction technique.\n\n**In summary, this script demonstrates a comprehensive approach to multi-omics integration using `omicverse`, starting with GLUE-based data pairing, then moving into feature selection, dimensionality reduction via MOFA, and finally analysis with different visualization and evaluation methods.** The script's structure is linear, starting with data loading and ending with model evaluation and visualization, making it relatively straightforward 
to follow and understand.", + "file": "t_mofa_glue_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_network_annotated.json b/OvStudent/Converted_Jsons/t_network_annotated.json new file mode 100644 index 00000000..a77f27f1 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_network_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script uses the `omicverse` library to perform network analysis on a set of genes. Here's a breakdown of its functionality and structure:\n\n**Functionality:**\n\nThe script's primary purpose is to:\n\n1. **Define a set of genes:** It creates a list of gene names representing the entities to be analyzed.\n2. **Categorize and color genes:** It assigns each gene to a type ('Type1' or 'Type2') and a corresponding color, which will be used for visualization.\n3. **Retrieve protein-protein interaction (PPI) data:** It fetches interaction data for the specified genes from the STRING database for a specific species (4932, the NCBI taxonomy ID for Saccharomyces cerevisiae, i.e. budding yeast).\n4. **Construct a pyPPI object:** It creates an object that encapsulates the gene information, interaction data, types, and colors, facilitating network analysis.\n5. **Perform interaction analysis:** It calculates various network metrics based on the retrieved interaction data.\n6. **Visualize the interaction network:** It generates a network graph that visually represents the relationships between the genes, using colors to indicate gene types.\n\n**Structure:**\n\nThe script is organized into a sequential series of operations:\n\n1. **Import and Setup (Lines 1-2):**\n * `import omicverse as ov`: Imports the `omicverse` library and assigns it the alias `ov` for easy access.\n * `ov.utils.ov_plot_set()`: Sets the plotting configurations for the `omicverse` library, ensuring consistent visual output.\n\n2. 
**Gene Definition (Lines 4-7):**\n * `gene_list=['FAA4','POX1','FAT1','FAS2','FAS1','FAA1','OLE1','YJU3','TGL3','INA1','TGL5']`: Creates a list called `gene_list` containing the names of the genes of interest.\n * `gene_type_dict=dict(zip(gene_list,['Type1']*5+['Type2']*6))`: Creates a dictionary `gene_type_dict` mapping each gene in `gene_list` to a type. The first 5 genes are assigned 'Type1', and the remaining 6 are assigned 'Type2'.\n * `gene_color_dict=dict(zip(gene_list,['#F7828A']*5+['#9CCCA4']*6))`: Creates a dictionary `gene_color_dict` mapping each gene to a specific color. The first 5 genes are assigned the color '#F7828A', and the rest are assigned '#9CCCA4'.\n\n3. **Interaction Data Retrieval (Lines 9-10):**\n * `G_res=ov.bulk.string_interaction(gene_list,4932)`: Uses the `omicverse` library to query the STRING database for interaction data among the genes in `gene_list` for species 4932. The result is stored in the `G_res` variable.\n * `G_res.head()`: Displays the first few rows of the retrieved interaction data, providing a glimpse of the raw information.\n\n4. **pyPPI Object Initialization (Lines 12-15):**\n * `ppi=ov.bulk.pyPPI(...)`: Creates an object of the `pyPPI` class within `omicverse`. The `pyPPI` object is used to manage and analyze protein-protein interaction data, including:\n * `gene=gene_list`: The list of genes.\n * `gene_type_dict=gene_type_dict`: The dictionary mapping gene names to types.\n * `gene_color_dict=gene_color_dict`: The dictionary mapping gene names to colors.\n * `species=4932`: The species ID from STRING database.\n\n5. **Interaction Analysis (Line 18):**\n * `ppi.interaction_analysis()`: Performs network analysis calculations on the `ppi` object. This function likely computes metrics such as degree, betweenness, and closeness centralities for each gene within the network based on the interaction data.\n\n6. **Network Plotting (Line 20):**\n * `ppi.plot_network()`: Visualizes the network stored within the `ppi` object. 
The graph will show nodes representing genes connected by edges representing their interactions. The node colors will correspond to the `gene_color_dict`.\n\n**In summary, this script is designed to perform a basic protein-protein interaction analysis using the `omicverse` library. It retrieves interaction data for a specific set of genes, constructs an interaction network, analyzes the network, and visualizes the network with colored nodes representing different types of genes. It is a concise example of how to use omicverse for exploring molecular interactions.**", + "file": "t_network_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_nocd_annotated.json b/OvStudent/Converted_Jsons/t_nocd_annotated.json new file mode 100644 index 00000000..e985d6eb --- /dev/null +++ b/OvStudent/Converted_Jsons/t_nocd_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell RNA sequencing (scRNA-seq) analysis using the `omicverse` library, leveraging the `scanpy` library for data manipulation and visualization. Here's a breakdown of its functionality and structure:\n\n**Functionality:**\n\n1. **Importing Libraries:** The script starts by importing necessary libraries:\n - `omicverse` (as `ov`): This is likely a custom library for single-cell analysis, providing high-level functions like `scnocd` and `scanpy_lazy`.\n - `anndata`: Used for storing and manipulating annotated data matrices, the fundamental data structure in scanpy.\n - `scanpy` (as `sc`): A widely used library for scRNA-seq analysis, providing tools for preprocessing, dimensionality reduction, clustering, and visualization.\n - `matplotlib.pyplot` (as `plt`): For generating static plots.\n - `numpy` (as `np`): For numerical computations.\n - `pandas` (as `pd`): For working with data in tabular formats (not directly used in the code but often used in scanpy and omicverse).\n\n2. 
**Setting up Environment:**\n - `%matplotlib inline`: Specifies that matplotlib plots should be displayed inline in a notebook environment.\n - `sc.settings.verbosity = 3`: Sets the verbosity of scanpy to provide hints during processing.\n - `sc.settings.set_figure_params(dpi=80, facecolor='white')`: Configures the DPI (resolution) and background color for scanpy plots.\n - Creation of Custom Colormap: Defines a list of hex color codes (`sc_color`) and uses them to create a custom matplotlib colormap (`sc_color_cmap`) named `Custom`.\n\n3. **Loading Data:**\n - `adata = anndata.read('sample/rna.h5ad')`: Loads an AnnData object (containing gene expression data and associated metadata) from the 'sample/rna.h5ad' file.\n - `adata`: Displays the loaded AnnData object's information.\n\n4. **Preprocessing:**\n - `adata=ov.single.scanpy_lazy(adata)`: Performs a set of standard preprocessing steps on the AnnData object using the `omicverse` library's `scanpy_lazy` function. This likely includes things like filtering low-quality cells and genes, log-normalization, and possibly feature scaling.\n\n5. **scNOCD Analysis:**\n - `scbrca=ov.single.scnocd(adata)`: Initializes an `scnocd` object from the `omicverse` library using the preprocessed AnnData object. `scnocd` likely implements a specific analysis approach. 
(scNOCD applies NOCD, Neural Overlapping Community Detection, a graph-neural-network method that allows a cell to belong to more than one community).\n - `scbrca.matrix_transform()`: Performs matrix transformation as part of the scNOCD process.\n - `scbrca.matrix_normalize()`: Performs matrix normalization, which might be separate from preprocessing or might be an alternative normalization after preprocessing.\n - `scbrca.GNN_configure()`: Configures the parameters of the Graph Neural Network (GNN) to be used within the scNOCD workflow.\n - `scbrca.GNN_preprocess()`: Prepares the data for the GNN model.\n - `scbrca.GNN_model()`: Runs the GNN model on the preprocessed data.\n - `scbrca.GNN_result()`: Extracts results from the GNN processing step.\n - `scbrca.GNN_plot()`: Generates visualizations related to the GNN model or its output.\n - `scbrca.cal_nocd()`: Calculates nocd scores, likely based on the GNN results and some other parameters within `scnocd`.\n - `scbrca.calculate_nocd()`: Calculates nocd scores, potentially providing an alternative way or additional nocd scores.\n\n6. **Visualization:**\n - `sc.pl.umap(scbrca.adata, color=['leiden','nocd'],wspace=0.4,palette=sc_color)`: Generates a UMAP plot, colored first by the `leiden` cluster labels and second by the `nocd` scores. It also adjusts spacing between subplots and sets the color palette.\n - `sc.pl.umap(scbrca.adata, color=['leiden','nocd_n'],wspace=0.4,palette=sc_color)`: Same as above, but colored by 'leiden' and `nocd_n` scores. This likely shows a normalized or otherwise modified version of the nocd scores.\n\n**Structure:**\n\nThe script follows a linear, procedural style:\n\n1. **Setup:** Imports libraries and configures the analysis environment.\n2. **Data Loading:** Loads the input data.\n3. **Preprocessing:** Applies initial preprocessing steps.\n4. **scNOCD Workflow:** Initiates and executes the core `scnocd` analysis steps, including data transformation, GNN processing and overlapping-community (nocd) calculation.\n5. 
**Visualization:** Generates UMAP plots for visualizing the analysis results with clusters and scores.\n\n**In Summary:**\n\nThis script performs a comprehensive single-cell RNA-seq analysis using the `omicverse` library and the popular `scanpy` package. It aims to detect overlapping cell communities using the GNN-based NOCD (Neural Overlapping Community Detection) method implemented in the `scnocd` class, followed by visualization of the results. The script loads data, preprocesses it, computes NOCD community assignments using a GNN, and then visualizes them alongside the Leiden cell clusters. The `nocd` and `nocd_n` columns likely hold two variants of the resulting community assignment.", + "file": "t_nocd_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_preprocess_annotated.json b/OvStudent/Converted_Jsons/t_preprocess_annotated.json new file mode 100644 index 00000000..6c74c9a7 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_preprocess_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs single-cell RNA sequencing (scRNA-seq) data analysis using the `omicverse` and `scanpy` libraries. It reads in data, performs quality control, normalization, dimensionality reduction, clustering, differential gene expression analysis, and visualization. Here's a breakdown of its functionality and structure:\n\n**1. Setup and Data Loading (Lines 1-12):**\n\n* **Imports:**\n * `omicverse` as `ov`: Provides a high-level API for scRNA-seq analysis, building on top of `scanpy`.\n * `scanpy` as `sc`: A foundational library for single-cell analysis in Python.\n* **Plotting Style:** `ov.ov_plot_set()`: Sets a default plotting style for `omicverse` visualizations.\n* **Data Loading:**\n * `sc.read_10x_mtx()`: Reads 10x Genomics matrix data (containing gene expression counts for each cell) from a specified directory (`data/filtered_gene_bc_matrices/hg19/`) into an `AnnData` object. 
The `AnnData` is a data structure used by `scanpy` and `omicverse` to store data and metadata related to single-cell experiments.\n * `var_names='gene_symbols'`: Ensures gene symbols are used as feature (variable) names in the `AnnData`.\n * `cache=True`: Enables caching for faster subsequent reads of the data.\n\n**2. Data Preprocessing (Lines 14-29):**\n\n* **Making Names Unique:**\n * `adata.var_names_make_unique()`: Ensures feature names (gene symbols) are unique, handling duplicates if necessary.\n * `adata.obs_names_make_unique()`: Ensures cell names are unique.\n* **Quality Control (QC):**\n * `ov.pp.qc()`: Performs quality control filtering using thresholds for:\n * `mito_perc`: Percentage of reads mapping to mitochondrial genes (cells with high mitochondrial percentage might be of low quality).\n * `nUMIs`: Number of unique molecular identifiers (UMIs) or total reads per cell.\n * `detected_genes`: Number of genes detected in each cell. Cells with very low number of detected genes might be of poor quality.\n* **Storing Counts Layer:**\n * `ov.utils.store_layers(adata,layers='counts')`: Saves the raw count data in a separate layer called 'counts' in the `AnnData` object. This preserves the raw data before any transformations.\n* **Preprocessing:**\n * `ov.pp.preprocess()`: Normalizes and transforms the data for analysis.\n * `mode='shiftlog|pearson'`: Applies a shiftlog transformation and Pearson scaling (or using pearson as HVGs).\n * `n_HVGs=2000`: Selects the 2000 most highly variable genes (HVGs) for downstream analysis.\n* **Storing Raw Data and Filtering:**\n * `adata.raw = adata`: Stores a copy of the current state of data into the `raw` attribute of `AnnData`.\n * `adata = adata[:, adata.var.highly_variable_features]`: Filters the `AnnData` to keep only the highly variable genes selected in the preprocessing step.\n\n**3. 
Data Handling and Validation (Lines 31-42):**\n\n* **Copying AnnData and Layer Retrieval:**\n * `adata_counts=adata.copy()`: Makes a copy of the `AnnData` object.\n * `ov.utils.retrieve_layers(adata_counts,layers='counts')`: Retrieve stored layers named `counts` in `adata_counts`.\n* **Data Inspection:**\n * `print('normalize adata:',adata.X.max())`: Prints the maximum value in the normalized data matrix (`adata.X`).\n * `print('raw count adata:',adata_counts.X.max())`: Prints the maximum value of raw counts data in adata_counts.\n* **Re-Copying Raw Data and Layer Retrieval**\n * `adata_counts=adata.raw.to_adata().copy()`: Makes a copy of the data stored in `adata.raw` into `adata_counts`\n * `ov.utils.retrieve_layers(adata_counts,layers='counts')`: Retrieve stored layers named `counts` in `adata_counts`.\n* **Data Inspection:**\n * `print('normalize adata:',adata.X.max())`: Prints the maximum value in the normalized data matrix (`adata.X`).\n * `print('raw count adata:',adata_counts.X.max())`: Prints the maximum value of raw counts data in adata_counts.\n * This section validates that the saved counts layers are correct by printing their maximum values.\n\n**4. 
Dimensionality Reduction (Lines 44-60):**\n\n* **Scaling:**\n * `ov.pp.scale(adata)`: Scales the data such that each gene has a mean of 0 and a variance of 1.\n* **Principal Component Analysis (PCA):**\n * `ov.pp.pca(adata, layer='scaled', n_pcs=50)`: Performs PCA on the scaled data, reducing its dimensionality to 50 principal components.\n* **Assigning PCA Embedding and Plotting:**\n * `adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']`: Assigns the scaled PCA embedding to the `X_pca` slot in the AnnData object.\n * `ov.utils.embedding(...)`: Generates an embedding plot using `X_pca` for the visualization, colored by the expression level of the gene 'CST3'.\n* **Neighborhood Graph and Minimum-Distortion Embedding (MDE):**\n * `sc.pp.neighbors()`: Calculates a nearest-neighbor graph based on the PCA embedding.\n * `ov.utils.mde()`: Computes an MDE embedding based on the PCA embedding.\n\n**5. Further Dimensionality Reduction and Clustering (Lines 62-79):**\n\n* **MDE Embedding Plot:**\n * `ov.utils.embedding(...)`: Generates an embedding plot using `X_mde` for visualization, colored by 'CST3' gene expression.\n* **UMAP:**\n * `sc.tl.umap()`: Performs UMAP dimensionality reduction.\n* **UMAP Embedding Plot:**\n * `ov.utils.embedding(...)`: Generates an embedding plot using `X_umap` for visualization, colored by 'CST3' gene expression.\n* **Leiden Clustering:**\n * `sc.tl.leiden(adata)`: Performs Leiden clustering, a graph-based community detection algorithm.\n* **MDE Embedding Plot (with Cluster and Gene Coloring):**\n * `ov.utils.embedding(...)`: Generates an embedding plot using `X_mde` for visualization, colored by Leiden cluster assignments, 'CST3' gene expression and `NKG7` gene expression.\n\n**6. 
Convex Hull and Labeling of Clusters (Lines 81-116):**\n\n* **Matplotlib Imports:**\n * `import matplotlib.pyplot as plt`: Imports the matplotlib plotting library.\n* **Setting up Matplotlib Axes**\n * `fig,ax=plt.subplots( figsize = (4,4))`: Creates a matplotlib figure and axes object.\n* **Embedding plot and Convex Hull:**\n * `ov.utils.embedding()`: Creates an embedding plot for X_mde with leiden coloring.\n * `ov.utils.plot_ConvexHull()`: Plots a convex hull around the cluster '0' in the `X_mde` embedding.\n* **Matplotlib Imports**\n * `from matplotlib import patheffects`: Imports the patheffects module of the matplotlib plotting library.\n * `import matplotlib.pyplot as plt`: Imports the matplotlib plotting library.\n * `fig, ax = plt.subplots(figsize=(4,4))`: Creates a matplotlib figure and axes object.\n* **Embedding plot and Label generation:**\n * `ov.utils.embedding()`: Generates an embedding plot for X_mde with leiden coloring.\n * `ov.utils.gen_mpl_labels()`: Generates and adds labels to the embedding plot for each Leiden cluster, excluding 'None'. The labels have bold text, a white border (patheffects) and an arrow to the cluster centroid.\n\n**7. Marker Gene Analysis and Visualization (Lines 118-169):**\n\n* **Marker Genes List:** Defines a list of marker genes associated with different cell types.\n* **Dotplot of Marker Genes:**\n * `sc.pl.dotplot()`: Creates a dotplot showing expression levels of the marker genes across different Leiden clusters.\n* **Dendrogram and Differential Expression Analysis:**\n * `sc.tl.dendrogram()`: Computes a dendrogram based on the leiden clusters.\n * `sc.tl.rank_genes_groups()`: Identifies differentially expressed genes between clusters using a t-test on the scaled PCA embeddings. 
Results are stored with key `leiden_ttest`.\n * `sc.pl.rank_genes_groups_dotplot()`: Generates dotplot of top ranked genes for each group from the t-test result.\n * `sc.tl.rank_genes_groups()`: Identifies differentially expressed genes between clusters using a t-test on the scaled PCA embeddings, without specifying key_added argument (so using default).\n * `ov.single.cosg()`: Runs consensus scoring of gene groups.\n * `sc.pl.rank_genes_groups_dotplot()`: Generates a dotplot of top ranked genes from `cosg` result for each cluster.\n* **Storing Ranked Genes Results:**\n * A loop iterates over each Leiden cluster, and the differential gene expression results (using t-test) are stored in the `data_dict` dictionary. The `sc.get.rank_genes_groups_df()` retrieves the results of the t-test for each cluster.\n* **Printing data dictionary:**\n * The script prints the keys of the `data_dict` to validate.\n * The head of the dataframe stored in the dictionary is shown.\n* **Creating color dictionary:**\n * The script creates a color dictionary for the leiden clusters using the `adata.uns['leiden_colors']` to map cluster name with colors.\n* **Stacked Volcano Plot:**\n * `ov.utils.stacking_vol()`: Generates a stacked volcano plot, showing differential gene expression results for each Leiden cluster. Each subplot shows a volcano plot for one Leiden cluster.\n* **Setting Y axis limit:**\n * The script sets a standard y axis limit for all the volcano plot subplots.\n* **Setting plot title:**\n * The script sets a title for the whole plot.\n\n**In Summary:**\n\nThis script provides a comprehensive workflow for analyzing scRNA-seq data:\n\n1. **Data Loading and QC:** Reads and cleans the data.\n2. **Preprocessing:** Normalizes and selects variable genes.\n3. **Dimensionality Reduction:** Reduces data dimensionality using PCA, MDE, and UMAP.\n4. **Clustering:** Groups cells into clusters using Leiden.\n5. 
**Visualization:** Visualizes data using embedding plots, convex hulls, and dot plots.\n6. **Differential Gene Expression:** Identifies marker genes for each cluster using t-test and cosg and visualizes the results in dot plots and stacked volcano plots.\n\nThis script is a good example of how to use `omicverse` and `scanpy` for a typical scRNA-seq analysis. It highlights the common steps involved in the analysis workflow.", + "file": "t_preprocess_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_preprocess_cpu_annotated.json b/OvStudent/Converted_Jsons/t_preprocess_cpu_annotated.json new file mode 100644 index 00000000..748ea8b1 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_preprocess_cpu_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive single-cell RNA sequencing (scRNA-seq) analysis using the `scanpy` and `omicverse` libraries. Here's a breakdown of its functionality and structure:\n\n**1. Setup and Data Loading:**\n\n* **Lines 1-2:** Imports the necessary libraries:\n * `scanpy` (`sc`) for single-cell analysis.\n * `omicverse` (`ov`) which provides higher level functions built on top of scanpy.\n* **Line 3:** Sets the plot style using `ov.plot_set()`.\n* **Lines 5-6:** Downloads and extracts the 10x Genomics PBMC3k dataset:\n * `!wget ...` downloads the compressed data.\n * `!cd data; tar -xzf ...` extracts the contents.\n* **Lines 8-11:** Reads the 10x matrix data using `sc.read_10x_mtx()`:\n * Specifies the directory containing the matrix file (`'data/filtered_gene_bc_matrices/hg19/'`).\n * Sets gene symbols as variable names (`var_names='gene_symbols'`).\n * Enables caching for faster loading (`cache=True`).\n* **Line 12:** Displays the loaded AnnData object (`adata`).\n* **Lines 14-15:** Makes variable and observation names unique to avoid errors.\n\n**2. 
Quality Control and Preprocessing:**\n\n* **Lines 17-21:** Performs quality control (QC) using `ov.pp.qc()`:\n * Filters cells based on mitochondrial percentage (`mito_perc`), number of UMIs (`nUMIs`), and number of detected genes (`detected_genes`).\n * Identifies and removes potential doublets using `doublets_method='sccomposite'`.\n * Note: This function integrates scanpy and other tools for QC.\n* **Line 22:** Displays the updated AnnData object after QC.\n* **Lines 24-25:** Performs preprocessing using `ov.pp.preprocess()`:\n * Uses a shifting logarithm, and Pearson normalization.\n * Selects the top 2000 highly variable genes (`n_HVGs=2000`).\n* **Line 26:** Shows the processed AnnData object.\n* **Lines 28-30:** Stores raw counts and uses highly variable features:\n * Saves the current adata object to `adata.raw`.\n * Keeps only highly variable genes.\n* **Line 31:** Displays the AnnData object after selecting for highly variable genes\n* **Line 34:** Scales the data using `ov.pp.scale()` and uses the scaled data for downstream analysis.\n* **Line 35:** Displays the AnnData object after scaling.\n\n**3. 
Dimensionality Reduction and Visualization:**\n\n* **Lines 37-38:** Performs Principal Component Analysis (PCA) using `ov.pp.pca()`:\n * Uses the scaled data (`layer='scaled'`).\n * Calculates the first 50 principal components (`n_pcs=50`).\n* **Line 39:** Displays the AnnData object with PCA embedding.\n* **Line 41:** Assigns PCA embedding into the main adata object for convenience.\n* **Lines 42-45:** Visualizes the PCA embedding using `ov.pl.embedding()`:\n * Plots using PCA coordinates (`basis='X_pca'`).\n * Colors the points by the expression of the `CST3` gene (`color='CST3'`).\n * Adjusts plotting frame.\n* **Lines 47-49:** Computes the neighborhood graph using `ov.pp.neighbors()`.\n* **Lines 51-52:** Performs Uniform Manifold Approximation and Projection (UMAP) using `ov.pp.umap()`.\n* **Lines 54-57:** Visualizes the UMAP embedding using `ov.pl.embedding()`, similar to the PCA visualization but using UMAP coordinates.\n* **Lines 59-60:** Performs Minimum Distance Embedding (MDE) using `ov.pp.mde()`.\n* **Lines 62-65:** Visualizes the MDE embedding using `ov.pl.embedding()`.\n\n**4. Cell Cycle Scoring and Leiden Clustering:**\n\n* **Line 67:** Stores raw counts into a new AnnData object.\n* **Line 68:** Scores cells for cell cycle phases using `ov.pp.score_genes_cell_cycle()` which loads a set of human genes known to be involved in cell cycle.\n* **Lines 70-73:** Visualizes the MDE embedding, colored by cell cycle phase using the newly made adata_raw object.\n* **Line 75:** Performs Leiden clustering using `ov.pp.leiden()` with a resolution of 1.\n* **Lines 77-80:** Visualizes MDE colored by Leiden clusters, `CST3`, and `NKG7` gene expression.\n\n**5. 
Cluster Visualization with Convex Hulls and Labels:**\n\n* **Lines 82-90:** Generates a scatter plot with MDE embedding colored by Leiden clusters using `ov.pl.embedding()` and matplotlib.\n* **Lines 92-96:** Overlays the convex hull of cluster '0' on MDE using `ov.pl.ConvexHull()`.\n* **Lines 99-118:** Generates a scatter plot with MDE and labels each cluster with its corresponding cluster number using `ov.utils.gen_mpl_labels()`.\n\n**6. Marker Gene Analysis:**\n\n* **Lines 121-123:** Defines a list of marker genes to be visualized.\n* **Lines 125-126:** Generates a dot plot of marker gene expression across the Leiden clusters using `sc.pl.dotplot()`.\n* **Lines 128-130:** Compute dendrogram and rank differentially expressed genes using `scanpy` function `sc.tl.dendrogram` and `sc.tl.rank_genes_groups`.\n* **Lines 131-133:** Generates a dot plot of differentially expressed genes in each cluster using `sc.pl.rank_genes_groups_dotplot()`.\n* **Lines 135-136:** Ranks differentially expressed genes using `scanpy` function `sc.tl.rank_genes_groups`.\n* **Lines 137-140:** Calculates marker genes for each cluster using `ov.single.cosg()` and plot result using `sc.pl.rank_genes_groups_dotplot()`.\n\n**7. Volcano Plot Visualization:**\n\n* **Lines 142-145:** Creates a dictionary (`data_dict`) to store the ranked gene results for each cluster using the results from t-test.\n* **Lines 147-149:** Display dictionary's keys and check the first few rows of data_dict.\n* **Lines 151-153:** Creates a dictionary to store the cluster colors from leiden output.\n* **Lines 155-163:** Generates a stacked volcano plot to visualize differentially expressed genes for each cluster using `ov.utils.stacking_vol()`.\n* **Lines 166-169:** Initializes the min and max of logfoldchange.\n* **Lines 170-171:** Sets the limits of each subplot.\n* **Line 172:** Sets the plot title.\n\n**In summary, this script:**\n\n1. Loads and preprocesses scRNA-seq data.\n2. 
Performs QC, dimensionality reduction (PCA, UMAP, MDE).\n3. Clusters cells using Leiden algorithm.\n4. Visualizes results through embeddings, convex hulls, and cluster labels.\n5. Identifies and visualizes marker genes for each cluster.\n6. Generates volcano plots to show differentially expressed genes for each cluster.\n\nThis script provides a comprehensive pipeline for basic single-cell data analysis and allows a user to explore and interpret the data, and understand cell populations within the data set.", + "file": "t_preprocess_cpu_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_preprocess_gpu_annotated.json b/OvStudent/Converted_Jsons/t_preprocess_gpu_annotated.json new file mode 100644 index 00000000..bd38f52f --- /dev/null +++ b/OvStudent/Converted_Jsons/t_preprocess_gpu_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "Okay, let's break down this Python script, which appears to be designed for single-cell RNA sequencing (scRNA-seq) data analysis using the `omicverse` and `scanpy` libraries.\n\n**Overall Purpose:**\n\nThe script reads scRNA-seq data, performs quality control (QC), preprocessing, dimensionality reduction, clustering, and differential gene expression analysis. It then visualizes the results using embeddings and dot plots, and generates a stacking volcano plot to display differentially expressed genes for each cluster.\n\n**Detailed Functionality and Structure:**\n\n1. **Imports:**\n * **`import omicverse as ov`**: Imports the `omicverse` library, aliasing it as `ov`. `omicverse` is a library built on top of `scanpy` that provides utilities for GPU processing and other analysis pipelines.\n * **`import scanpy as sc`**: Imports the `scanpy` library, aliasing it as `sc`. 
`scanpy` is a fundamental library for single-cell analysis.\n * **`import matplotlib.pyplot as plt`**: Imports the matplotlib plotting library for generating visualizations.\n * **`from matplotlib import patheffects`:** Imports specific path effects from matplotlib used for text styling.\n\n2. **Initialization:**\n * **`ov.plot_set()`**: Sets plotting parameters from the `omicverse` library.\n * **`ov.settings.gpu_init()`**: Attempts to initialize GPU usage if a GPU is available, leveraging the GPU to speed up calculations.\n\n3. **Data Loading:**\n * **`adata = sc.read_10x_mtx(...)`**: Reads a 10x Genomics matrix file into an `AnnData` object (a core data structure in `scanpy`)\n * `'data/filtered_gene_bc_matrices/hg19/'`: Specifies the path to the directory containing the matrix file.\n * `var_names='gene_symbols'`: Uses gene symbols from the matrix as variable (gene) names.\n * `cache=True`: Caches the read data for faster loading later.\n\n4. **Data Preparation:**\n * **`adata.var_names_make_unique()`**: Ensures all variable (gene) names are unique by appending suffixes if needed.\n * **`adata.obs_names_make_unique()`**: Ensures all observation (cell) names are unique.\n * **`ov.pp.anndata_to_GPU(adata)`**: Moves the `AnnData` object to the GPU for accelerated processing.\n\n5. **Quality Control:**\n * **`adata = ov.pp.qc(adata, ...)`**: Performs quality control on the data:\n * `tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}`: Sets thresholds for filtering cells based on mitochondrial gene percentage, number of UMIs, and number of genes detected per cell.\n * `batch_key=None`: No batch correction is specified in this case.\n\n6. 
**Preprocessing:**\n * **`adata = ov.pp.preprocess(adata, ...)`**: Preprocesses the data:\n * `mode='shiftlog|pearson'`: Applies shift-log normalization and selects highly variable genes using Pearson residuals.\n * `n_HVGs=2000`: Selects 2000 highly variable genes.\n * **`adata.raw = adata`**: Stores the preprocessed data in the `raw` slot of the AnnData object.\n * **`adata = adata[:, adata.var.highly_variable_features]`**: Subsets the AnnData object to include only the highly variable genes.\n\n7. **Scaling**\n * **`ov.pp.scale(adata)`**: Scales data to have zero mean and unit variance.\n\n8. **Dimensionality Reduction (PCA):**\n * **`ov.pp.pca(adata, ...)`**: Performs Principal Component Analysis (PCA):\n * `layer='scaled'`: PCA is done on the scaled data.\n * `n_pcs=50`: Reduces the data to 50 principal components.\n * **`adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']`**: Copies the PCA results to the conventional location for downstream processing and plotting.\n\n9. **Embedding Visualization (Initial):**\n * **`ov.utils.embedding(adata, ...)`**: Generates a scatter plot of the PCA embedding, colored by expression of gene \"CST3\", with a small frame.\n\n10. **Neighborhood Graph and Manifold Learning**\n * **`ov.pp.neighbors(adata, ...)`**: Builds a nearest-neighbor graph in PCA space.\n * `n_neighbors=15`, `n_pcs=50`: Specifies the number of neighbors and principal components for graph construction.\n * `use_rep='scaled|original|X_pca'`: Uses the previously computed PCA embedding.\n * `method='cagra'`: Uses `cagra` method for computing connectivity.\n * **`adata.obsm[\"X_mde\"] = ov.utils.mde(...)`**: Computes a Minimum-Distortion Embedding (MDE) of the PCA data using `omicverse`'s `mde` method.\n\n11. **Embedding Visualization (MDE):**\n * **`ov.pl.embedding(adata, ...)`**: Generates a scatter plot of the MDE embedding colored by expression of gene \"CST3\", with a small frame.\n\n12. 
**UMAP Embedding:**\n * **`ov.pp.umap(adata)`**: Performs Uniform Manifold Approximation and Projection (UMAP) for non-linear dimensionality reduction.\n\n13. **Embedding Visualization (UMAP):**\n * **`ov.pl.embedding(adata, ...)`**: Generates a scatter plot of the UMAP embedding, colored by expression of gene \"CST3\", with a small frame.\n\n14. **Clustering (Leiden):**\n * **`ov.pp.leiden(adata)`**: Performs Leiden clustering.\n\n15. **Move to CPU**\n * **`ov.pp.anndata_to_CPU(adata)`**: Move the AnnData object to the CPU for downstream operations\n\n16. **Embedding Visualization with Clustering:**\n * **`ov.pl.embedding(adata, ...)`**: Generates a scatter plot of the MDE embedding, colored by \"leiden\" clusters, as well as expression of \"CST3\" and \"NKG7\" genes, with a small frame.\n\n17. **Volcano Plot setup and helper function**\n * **`fig,ax=plt.subplots( figsize = (4,4))`**: Initializes a figure and axes for plotting.\n * **`ov.pl.embedding(adata, ...)`**: Creates an initial embedding on `ax` object, colored by leiden, and sets show=False so that we can manipulate the plot object prior to display.\n * **`ov.pl.ConvexHull(adata, ...)`**: Draws a convex hull around the selected cluster of interest.\n\n18. **Plot customization of MDE plot and label generation**\n * **`fig, ax = plt.subplots(figsize=(4,4))`**: Initializes a figure and axes for plotting.\n * **`ov.pl.embedding(adata, ...)`**: Plots the embedding, colored by the 'leiden' clusters, with customizations.\n * **`ov.utils.gen_mpl_labels(...)`**: Adds cluster labels to the embedding plot for selected clusters.\n\n19. **Marker Gene Analysis:**\n * **`marker_genes = [...]`**: Defines a list of marker genes for dot plot analysis.\n * **`sc.pl.dotplot(adata, ...)`**: Creates a dot plot showing expression of the marker genes across leiden clusters.\n\n20. 
**Differential Gene Expression Analysis:**\n * **`sc.tl.dendrogram(adata, ...)`**: Calculates a dendrogram for the leiden clusters based on PCA space.\n * **`sc.tl.rank_genes_groups(adata, ..., method='t-test', ...)`**: Calculates differentially expressed genes for each cluster using t-tests on PCA space.\n * **`sc.pl.rank_genes_groups_dotplot(adata, ...)`**: Creates a dot plot of differentially expressed genes for each cluster.\n * **`ov.single.cosg(adata, ...)`**: Performs the COSG method of differential gene expression for each cluster.\n * **`sc.pl.rank_genes_groups_dotplot(adata, ...)`**: Creates a dot plot of differentially expressed genes using the `cosg` results.\n\n21. **Data Extraction and Stacking Volcano Plot:**\n * **`data_dict = {}`**: Creates an empty dictionary to store differential gene expression results for each cluster.\n * The code iterates through each `leiden` cluster and fetches DEGs using `sc.get.rank_genes_groups_df`.\n * **`type_color_dict = ...`**: Create a dictionary to store the colors for each leiden cluster.\n * **`fig, axes = ov.utils.stacking_vol(...)`**: Generates a stacking volcano plot using the `omicverse` utility function, displaying log fold changes and p-values for each cluster.\n * Includes parameters for filtering by p-value and log2foldchange, for color and font formatting.\n\n22. **Stacking Volcano plot custom axes**\n * The code iterates through each `leiden` cluster and uses the stored dictionary results to set the y-axis limit on each subplot.\n * **`plt.suptitle('Stacking_vol',fontsize=12)`**: Sets a title for the complete stacked volcano plot.\n\n**In summary,** this script provides a comprehensive workflow for scRNA-seq analysis. It uses `omicverse` for GPU acceleration and specialized utilities while leveraging the core functionalities of `scanpy` for data handling, preprocessing, and visualization. 
The pipeline culminates in the generation of a stacking volcano plot for examining differential gene expression in the identified cell clusters. This script would be the basis of a common single cell analysis notebook workflow.", + "file": "t_preprocess_gpu_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_scdeg_annotated.json b/OvStudent/Converted_Jsons/t_scdeg_annotated.json new file mode 100644 index 00000000..3296211e --- /dev/null +++ b/OvStudent/Converted_Jsons/t_scdeg_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell RNA sequencing (scRNA-seq) analysis workflow using the `omicverse`, `scanpy`, and `scvelo` libraries. It encompasses data loading, preprocessing, quality control, dimensionality reduction, differential expression analysis (both at the cell and meta-cell level), and visualization. Here's a breakdown of its structure and functionality:\n\n**1. Library Imports:**\n\n - `import omicverse as ov`: Imports the `omicverse` library for scRNA-seq analysis, aliased as `ov`.\n - `import scanpy as sc`: Imports the `scanpy` library for single-cell analysis, aliased as `sc`.\n - `import scvelo as scv`: Imports the `scvelo` library for RNA velocity analysis, aliased as `scv`.\n - `import matplotlib.pyplot as plt`: Imports `matplotlib.pyplot` for creating plots.\n\n**2. Setup and Data Loading:**\n\n - `ov.utils.ov_plot_set()`: Sets a default plotting style for `omicverse`.\n - `adata = scv.datasets.pancreas()`: Loads the pancreas dataset from `scvelo` and stores it as an AnnData object (`adata`), which is a common data structure in single-cell analysis.\n - `adata`: Displays the `adata` object, showing its structure and basic information.\n - `adata.X.max()`: Calculates the maximum value in the count matrix (`adata.X`), which often represents gene expression levels.\n\n**3. 
Data Preprocessing and Quality Control:**\n\n - `adata=ov.pp.qc(...)`: Performs quality control on the `adata` object, filtering cells based on the following thresholds:\n - `mito_perc`: Percentage of mitochondrial gene counts (set to 5%)\n - `nUMIs`: Total number of counts per cell (set to 500)\n - `detected_genes`: Number of genes detected in a cell (set to 250)\n - `adata=ov.pp.preprocess(...)`: Preprocesses the data with:\n - `mode='shiftlog|pearson'`: Shift-log normalization followed by Pearson residual calculation.\n - `n_HVGs=2000`: Selects the top 2000 highly variable genes (HVGs) across the cells.\n - `adata.raw = adata`: Stores a copy of the preprocessed `adata` object in `adata.raw` for later use.\n - `adata = adata[:, adata.var.highly_variable_features]`: Filters the `adata` object to keep only the highly variable genes selected earlier.\n - `ov.pp.scale(adata)`: Scales the expression values of each gene across cells to have zero mean and unit variance.\n\n**4. Dimensionality Reduction:**\n\n - `ov.pp.pca(adata, layer='scaled', n_pcs=50)`: Performs Principal Component Analysis (PCA) on the scaled data, reducing the dimensionality to 50 principal components.\n - `adata.X.max()`: Calculates the maximum value of the expression matrix `adata.X`; the PCA result is stored in `adata.obsm`, so this is not the PCA-reduced matrix.\n\n**5. Differential Expression Analysis (Cell-Level):**\n\n - `test_adata=adata[adata.obs['clusters'].isin(['Alpha','Beta'])]`: Creates a new `adata` object containing only cells from the 'Alpha' and 'Beta' clusters.\n - `test_adata`: Displays the `test_adata` object.\n - `dds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T)`: Performs differential expression analysis using `pyDEG` on the log-normalized expression data of `test_adata`. The `.T` transposes the data so that genes are rows.\n - `dds.drop_duplicates_index()`: Removes any duplicate gene names in the index of `dds`.\n - `print('... 
drop_duplicates_index success')`: Confirmation message.\n - `treatment_groups=test_adata.obs[test_adata.obs['clusters']=='Alpha'].index.tolist()`: Creates a list of cell indices that belong to the 'Alpha' cluster, designated as the treatment group.\n - `control_groups=test_adata.obs[test_adata.obs['clusters']=='Beta'].index.tolist()`: Creates a list of cell indices that belong to the 'Beta' cluster, designated as the control group.\n - `result=dds.deg_analysis(...)`: Performs a differential expression analysis between the Alpha (treatment) and Beta (control) cells using a t-test.\n - `result.sort_values('qvalue').head()`: Sorts the results by the q-value (adjusted p-value) and displays the top few rows.\n - `dds.foldchange_set(...)`: Sets fold-change, p-value, and log p-value thresholds for `dds` object.\n - `dds.plot_volcano(...)`: Generates a volcano plot of the differential expression results, showing log2 fold change vs -log10(p-value).\n - `dds.plot_boxplot(...)`: Generates box plots showing the expression levels of genes Irx1 and Adra2a in the Alpha and Beta groups.\n - `ov.utils.embedding(...)`: Generates a UMAP embedding plot, colored by cluster, as well as the expression of Irx1 and Adra2a genes.\n\n**6. 
Meta-Cell Analysis (Cell Aggregation):**\n\n - `meta_obj=ov.single.MetaCell(...)`: Creates a `MetaCell` object which groups similar cells into meta-cells, to reduce noise and improve analysis.\n - `use_rep='scaled|original|X_pca'`: Specifies the representations to use when generating metacells.\n - `n_metacells=150`: Specifies the number of metacells to form.\n - `use_gpu=True`: Specifies whether to use GPU for processing.\n - `meta_obj.initialize_archetypes()`: Initializes the archetypes for the `MetaCell` object.\n - `meta_obj.train(min_iter=10, max_iter=50)`: Trains the `MetaCell` object.\n - `meta_obj.save('seacells/model.pkl')`: Saves the trained `MetaCell` model.\n - `meta_obj.load('seacells/model.pkl')`: Loads the saved `MetaCell` model.\n - `ad=meta_obj.predicted(...)`: Predicts cell type labels for meta-cells using a soft assignment method and summarizes to the log normalized layer.\n - `ad.X.min(),ad.X.max()`: Prints the minimum and maximum values in the cell-type prediction matrix.\n - `fig, ax = plt.subplots(figsize=(4,4))`: Create matplotlib figure and axes for plotting.\n - `ov.utils.embedding(...)`: Generates an UMAP embedding plot, colored by the meta-cell clusters.\n - `ov.single._metacell.plot_metacells(...)`: Overlays meta-cell boundaries on the embedding plot.\n\n**7. Differential Expression Analysis (Meta-Cell Level):**\n\n - `test_adata=ad[ad.obs['celltype'].isin(['Alpha','Beta'])]`: Creates a new `adata` object containing only meta-cells with 'Alpha' and 'Beta' labels.\n - `test_adata`: Displays the `test_adata` object.\n - `dds_meta=ov.bulk.pyDEG(test_adata.to_df().T)`: Performs differential expression analysis on the meta-cell data.\n - `dds_meta.drop_duplicates_index()`: Removes any duplicate gene names in the index of `dds_meta`.\n - `print('... 
drop_duplicates_index success')`: Confirmation message.\n - `treatment_groups=test_adata.obs[test_adata.obs['celltype']=='Alpha'].index.tolist()`: Creates a list of meta-cell indices belonging to the 'Alpha' cell type (treatment).\n - `control_groups=test_adata.obs[test_adata.obs['celltype']=='Beta'].index.tolist()`: Creates a list of meta-cell indices belonging to the 'Beta' cell type (control).\n - `result=dds_meta.deg_analysis(...)`: Performs a differential expression analysis between the Alpha and Beta meta-cells using a t-test.\n - `result.sort_values('qvalue').head()`: Sorts the results by the q-value (adjusted p-value) and displays the top few rows.\n - `dds_meta.foldchange_set(...)`: Sets fold-change, p-value, and log p-value thresholds for `dds_meta` object.\n - `dds_meta.plot_volcano(...)`: Generates a volcano plot of the meta-cell differential expression results.\n - `dds_meta.plot_boxplot(...)`: Generates box plots showing the expression levels of genes Ctxn2 and Mnx1 in the Alpha and Beta meta-cell groups.\n - `ov.utils.embedding(...)`: Generates a UMAP embedding plot, colored by cluster, as well as the expression of Ctxn2 and Mnx1 genes.\n\n**Overall Functionality:**\n\nThis script demonstrates a typical scRNA-seq workflow, starting from raw count data, performing quality control, normalization, dimensionality reduction, and then proceeding to differential expression analysis and visualization at both single-cell and meta-cell levels. It uses various functionalities from the `omicverse`, `scanpy`, and `scvelo` libraries to perform these steps. The meta-cell analysis helps by reducing noise and improving computational efficiency while still retaining cell type-specific information. The differential expression analyses identify genes that are significantly differently expressed between different groups. 
Finally, the embedding plots visualize the data in a lower-dimensional space and display the expression levels of the significant genes.", + "file": "t_scdeg_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_scdrug_annotated.json b/OvStudent/Converted_Jsons/t_scdrug_annotated.json new file mode 100644 index 00000000..7266d3c6 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_scdrug_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a series of single-cell RNA sequencing (scRNA-seq) data analysis tasks, focusing on copy number variation (CNV) inference, cell type identification, and drug response prediction. Here's a breakdown of its functionality and structure:\n\n**1. Import Libraries (Lines 1-5, 98):**\n\n - `omicverse as ov`: Imports the `omicverse` library, likely providing a suite of tools for omics data analysis.\n - `scanpy as sc`: Imports `scanpy`, a popular Python library for single-cell analysis.\n - `infercnvpy as cnv`: Imports `infercnvpy`, a library for inferring copy number variations from scRNA-seq data.\n - `matplotlib.pyplot as plt`: Imports `matplotlib` for plotting.\n - `os`: Imports the `os` module for operating system-related tasks (like file path manipulation).\n - `import ov` (line 98) is a redundant import of `omicverse`\n\n**2. Set up Scanpy (Lines 7-8):**\n\n - `sc.settings.verbosity = 3`: Sets the verbosity level of `scanpy` to 3 (high verbosity).\n - `sc.settings.set_figure_params(dpi=80, facecolor='white')`: Configures `matplotlib` plotting parameters for `scanpy`, setting dpi to 80 and face color to white.\n\n**3. Load and Prepare Data (Lines 11-26):**\n\n - `adata = cnv.datasets.maynard2020_3k()`: Loads a sample scRNA-seq dataset (likely from the Maynard et al. 2020 study) using `infercnvpy`.\n - `ov.utils.get_gene_annotation(...)`: Annotates genes in the `adata` object using a GTF file (\"gencode.v43.basic.annotation.gtf.gz\"). 
It uses gene names from GTF file to map gene id.\n - The next 4 lines transfer annotation information in adata.var dataframe. Chromosome, start, end, and ensembl gene ID (ensg) are taken from `chrom`, `chromStart`, `chromEnd`, and `gene_id` respectively.\n - `adata.var.loc[:, [\"ensg\", \"chromosome\", \"start\", \"end\"]].head()`: Displays the first few rows of the `adata.var` DataFrame, showing the extracted annotation information for genes.\n - `adata`: Prints the `adata` object.\n\n**4. CNV Inference (Lines 29-51):**\n\n - `cnv.tl.infercnv(...)`: Performs CNV inference using `infercnvpy`.\n - `reference_key=\"cell_type\"`: Uses cell type annotations as a reference for CNV estimation.\n - `reference_cat=[...]`: Specifies a list of cell types that will serve as a reference during CNV calculation.\n - `window_size=250`: Sets the window size for CNV analysis.\n - `cnv.tl.pca(adata)`, `cnv.pp.neighbors(adata)`, `cnv.tl.leiden(adata)`, `cnv.tl.umap(adata)`: Perform PCA, neighbor graph calculation, Leiden clustering, and UMAP dimensionality reduction on the inferred CNV profiles.\n - `cnv.tl.cnv_score(adata)`: Calculates a CNV score for each cell.\n\n**5. Visualize and Classify Cells Based on CNV (Lines 53-63):**\n\n - `sc.pl.umap(adata, color=\"cnv_score\", show=False)`: Plots the UMAP projection with cells colored by their CNV score.\n - `adata.obs[\"cnv_status\"] = \"normal\"`: Initializes a column named \"cnv_status\" in `adata.obs` and assigns \"normal\" to all cells.\n - The next two lines update `cnv_status` with \"tumor\" for cells with a CNV score > 0.03.\n - `sc.pl.umap(adata, color=\"cnv_status\", show=False)`: Plots the UMAP with cells colored by their CNV status (\"tumor\" or \"normal\").\n - `tumor=adata[adata.obs['cnv_status']=='tumor']`: Creates a new `adata` object that only contains cells marked as \"tumor\".\n - `tumor.X.max()`: Shows max value in tumor matrix\n\n**6. 
Filter and Scale Tumor Data (Lines 65-79):**\n\n - `adata=tumor`: Updates `adata` with the subset of cells marked as tumor in the previous steps.\n - `print('Preprocessing...')`: Prints a message.\n - `sc.pp.filter_cells(adata, min_genes=200)`: Filters cells with fewer than 200 detected genes.\n - `sc.pp.filter_genes(adata, min_cells=3)`: Filters genes expressed in fewer than 3 cells.\n - `adata.var['mt'] = adata.var_names.str.startswith('MT-')`: Identifies mitochondrial genes (names starting with 'MT-').\n - `sc.pp.calculate_qc_metrics(...)`: Calculates QC metrics, including the percentage of mitochondrial gene counts per cell.\n - The next line filters out cells with >30% of mitochondrial gene reads.\n - `adata.raw = adata.copy()`: Stores a raw copy of the data for later use.\n - `sc.pp.highly_variable_genes(adata)`: Identifies highly variable genes.\n - `adata = adata[:, adata.var.highly_variable]`: Subsets the `adata` object to keep only highly variable genes.\n - `sc.pp.scale(adata)`: Scales the expression values.\n - `sc.tl.pca(adata, svd_solver='arpack')`: Performs PCA on the scaled data.\n\n**7. Further Dimension Reduction and Preparation for Drug Response Prediction (Lines 81-82):**\n\n - `sc.pp.neighbors(adata, n_pcs=20)`: Calculates a neighborhood graph using the first 20 PCs.\n - `sc.tl.umap(adata)`: Performs UMAP embedding.\n\n**8. Download Required Resources for Drug Response Prediction (Lines 84-85):**\n\n - `ov.utils.download_GDSC_data()`: Downloads the GDSC data needed for drug response prediction.\n - `ov.utils.download_CaDRReS_model()`: Downloads CaDRReS model needed for drug response prediction.\n\n**9. Cell Clustering (Line 87):**\n\n - `adata, res,plot_df = ov.single.autoResolution(adata,cpus=4)`: Performs automated clustering of cells using `omicverse`. It specifies that this should be performed using 4 cpus.\n\n**10. 
Save/Load Processed Data (Lines 89-93):**\n\n - `results_file = os.path.join('./', 'scanpyobj.h5ad')`: Defines the path for saving the results.\n - `adata.write(results_file)`: Saves the processed `adata` object to a file in `h5ad` format.\n - `adata=sc.read(results_file)`: Reads the previously saved `adata` object back into the script. This is a redundant step.\n\n**11. Drug Response Prediction (Lines 96-101):**\n\n - `!git clone https://github.com/CSB5/CaDRReS-Sc`: Clones the CaDRReS-Sc repository using a system command (!).\n - `job=ov.single.Drug_Response(...)`: Initiates drug response prediction using `omicverse` and the cloned CaDRReS-Sc repository.\n - `scriptpath='CaDRReS-Sc'`: Specifies the location of the cloned repo.\n - `modelpath='models/'`: Specifies the location of the downloaded CaDRReS model.\n - `output='result'`: Specifies the output folder.\n\n**In Summary:**\n\nThis script:\n\n1. Loads scRNA-seq data and annotates genes.\n2. Infers CNVs using `infercnvpy` and identifies tumor cells based on their CNV profiles.\n3. Performs standard preprocessing steps on tumor cells (filtering, scaling, PCA).\n4. Clusters cells using `omicverse`.\n5. Predicts drug responses using the CaDRReS model.\n\nThe script is heavily reliant on the `scanpy`, `infercnvpy`, and `omicverse` libraries, showcasing a common workflow for integrating CNV information with downstream scRNA-seq analysis. 
It demonstrates a complete pipeline from data loading and CNV inference to drug response prediction using publicly available datasets and tools.", + "file": "t_scdrug_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_scmulan_annotated.json b/OvStudent/Converted_Jsons/t_scmulan_annotated.json new file mode 100644 index 00000000..08932a20 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_scmulan_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs single-cell RNA sequencing (scRNA-seq) data analysis using the `scanpy` and `omicverse` libraries. It leverages a pre-trained `scMulan` model for cell type annotation and visualization. Let's break down the functionality and structure step by step:\n\n**1. Imports and Setup:**\n\n* **Lines 1-4:** Imports necessary libraries:\n * `os`: Used for operating system interactions (though not directly used in this specific code).\n * `scanpy as sc`: Imports the `scanpy` library for scRNA-seq data handling and preprocessing.\n * `omicverse as ov`: Imports the `omicverse` library, which provides additional functionalities for analysis and visualization, including those related to `scMulan`.\n* **Line 5:** `ov.plot_set()`: Sets up `omicverse`'s plotting environment, likely configuring defaults.\n\n**2. Data Loading and Initial Processing:**\n\n* **Line 8:** `adata = sc.read('./data/liver_test.h5ad')`: Reads a scRNA-seq data file in `h5ad` format (a common format for `scanpy` objects) named 'liver_test.h5ad' from the './data' directory and stores it into an `AnnData` object called `adata`. This is the main data structure used by `scanpy`.\n* **Line 10:** `adata`: Prints the `adata` object, displaying summary information about the loaded data like the number of cells and genes.\n* **Line 13:** `from scipy.sparse import csc_matrix`: Imports the `csc_matrix` class from `scipy.sparse`. 
This is used to represent sparse matrices efficiently.\n* **Line 14:** `adata.X = csc_matrix(adata.X)`: Converts the count matrix of the AnnData object (`adata.X`) into a sparse matrix in Compressed Sparse Column format. This can speed up calculations on large single-cell data matrices.\n\n**3. Gene Symbol Uniformization:**\n\n* **Lines 16-18:** `adata_GS_uniformed = ov.externel.scMulan.GeneSymbolUniform(...)`: This line uses an `omicverse` function that is specific to working with `scMulan`. The code performs gene symbol uniformization, which is important because sometimes gene names are presented in different formats across datasets. This function will attempt to unify gene symbols.\n * `input_adata=adata`: Takes the loaded `adata` object as input.\n * `output_dir=\"./data\"`: Specifies the output directory where the processed data will be saved.\n * `output_prefix='liver'`: Specifies the prefix for the output filename (e.g., 'liver_uniformed.h5ad').\n* **Line 22:** `adata_GS_uniformed=sc.read_h5ad('./data/liver_uniformed.h5ad')`: Reads the gene symbol uniformed AnnData object created in the previous step, from './data/liver_uniformed.h5ad', and stores it back to variable `adata_GS_uniformed`.\n* **Line 24:** `adata_GS_uniformed`: Prints the uniformed `adata` object, which now has consistent gene symbol representation.\n\n**4. Normalization and Log Transformation (Conditional):**\n\n* **Lines 28-30:** `if adata_GS_uniformed.X.max() > 10: ...`: This conditional block performs normalization and log transformation if the maximum value in the expression matrix of the uniformized data is greater than 10. This is typical preprocessing for scRNA-seq data.\n * `sc.pp.normalize_total(adata_GS_uniformed, target_sum=1e4)`: Normalizes the counts to sum to a fixed total (10,000 in this case) for each cell, removing variations in sequencing depth.\n * `sc.pp.log1p(adata_GS_uniformed)`: Applies a log transformation (log(x+1)) to the normalized counts. 
This helps to reduce the skew of the data and stabilize variance.\n\n**5. scMulan Model Inference:**\n\n* **Line 35:** `ckp_path = './ckpt/ckpt_scMulan.pt'`: Defines the path to the checkpoint file for the `scMulan` pre-trained model.\n* **Line 37:** `scml = ov.externel.scMulan.model_inference(ckp_path, adata_GS_uniformed)`: Initializes the `scMulan` inference model using the specified checkpoint path and the processed `AnnData` object.\n* **Line 38:** `base_process = scml.cuda_count()`: This line likely checks the availability of CUDA (NVIDIA's parallel computing platform) and returns a number. It appears to be checking for GPU usage but not actually doing anything with the number.\n* **Line 40:** `scml.get_cell_types_and_embds_for_adata(parallel=True, n_process = 1)`: This is the core step where the `scMulan` model is used. This function infers cell types and cell embeddings for each cell based on the expression matrix in the given dataset. `parallel=True` indicates the function will attempt to run the inference in parallel, with `n_process = 1` indicating that it will only use a single process.\n\n**6. Post-Inference Processing and Visualization:**\n\n* **Line 43:** `adata_mulan = scml.adata.copy()`: Copies the updated `AnnData` object containing the inferred cell types and embeddings from the `scml` object to the `adata_mulan` variable.\n* **Line 46:** `ov.pp.scale(adata_mulan)`: Scales the gene expression values using a method from `omicverse`, preparing the data for downstream analysis.\n* **Line 47:** `ov.pp.pca(adata_mulan)`: Performs Principal Component Analysis (PCA) on the scaled data, reducing the dimensionality of the data while preserving the main variance.\n* **Lines 50-51:** `ov.pp.mde(...)`: Applies Minimum-Distortion Embedding (MDE), further reducing dimensionality and creating a 2D embedding for visualization. 
This function takes PCA and scaled versions of data as input.\n * `embedding_dim=2`: Creates a 2D embedding.\n * `n_neighbors=15`: Sets the number of neighbors used by the MDE algorithm.\n * `basis='X_mde'`: Stores the MDE embedding in `adata_mulan.obsm['X_mde']`.\n * `n_pcs=10`: Uses the top 10 principal components for MDE.\n * `use_rep='scaled|original|X_pca'`: Specifies the data representations used by MDE.\n* **Lines 54-56:** `ov.pl.embedding(...)`: Generates a scatter plot of the MDE embedding, coloring cells by their inferred cell types ('cell_type_from_scMulan').\n * `basis='X_mde'`: Specifies the embedding to be plotted.\n * `color=[\"cell_type_from_scMulan\"]`: Specifies the column in `adata_mulan.obs` to color cells.\n * `ncols=1`: Sets the number of columns for subplots (1 in this case).\n * `frameon='small'`: Specifies plotting parameters.\n* **Line 58:** `adata_mulan.obsm['X_umap']=adata_mulan.obsm['X_mde']`: Copies the MDE embedding to a new field 'X_umap', although MDE was used. This may be an error or intentional to be used by other functions expecting UMAP.\n\n**7. Cell Type Smoothing and Refinement:**\n\n* **Line 61:** `ov.externel.scMulan.cell_type_smoothing(...)`: Applies a smoothing algorithm to the inferred cell types, possibly addressing discrepancies or improving the consistency of cell type assignments.\n * `threshold=0.1`: Sets a threshold parameter for the smoothing algorithm.\n* **Lines 65-67:** `ov.pl.embedding(...)`: Generates a scatter plot of the MDE embedding, coloring cells by both the original cell type assignments ('cell_type') and the smoothed assignments ('cell_type_from_mulan_smoothing').\n* **Line 69:** `adata_mulan`: Prints the modified `adata_mulan` object.\n\n**8. 
Visualization of Selected Cell Types:**\n\n* **Line 71:** `top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20]`: Selects the 20 most frequent cell types inferred by `scMulan`.\n* **Line 74:** `selected_cell_types = top_celltypes`: Assigns the 20 most frequent cell types for downstream analysis.\n* **Line 75:** `ov.externel.scMulan.visualize_selected_cell_types(...)`: Uses an `omicverse` function to visualize the selected cell types, likely creating plots to highlight their distributions.\n * `smoothing=True`: Indicates that the visualization should also show smoothed cell type assignments.\n\n**In Summary:**\n\nThis script takes scRNA-seq data, applies gene symbol uniformization, loads a `scMulan` model for cell type prediction, performs preprocessing steps (normalization, log transformation, scaling, PCA), visualizes the results using MDE embedding, applies cell type smoothing, and finally visualizes selected cell types. It leverages `scanpy` for core data handling and `omicverse` for additional functionalities including specific `scMulan` related operations. It is a common workflow for leveraging pre-trained deep learning models for scRNA-seq data annotation and visualization.", + "file": "t_scmulan_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_simba_annotated.json b/OvStudent/Converted_Jsons/t_simba_annotated.json new file mode 100644 index 00000000..5347e56b --- /dev/null +++ b/OvStudent/Converted_Jsons/t_simba_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs batch correction and visualization of single-cell RNA sequencing (scRNA-seq) data using the `omicverse` and `scanpy` libraries. Here's a breakdown of its functionality and structure:\n\n**Core Functionality:**\n\nThe script aims to:\n\n1. 
**Read and Preprocess scRNA-seq Data:** Reads raw scRNA-seq data from an `h5ad` file, preprocesses it (normalization, feature selection), and sets up a `pySIMBA` object for graph-based analysis.\n2. **Construct and Train a SIMBA Graph:** Generates a graph representation of the data using the SIMBA algorithm.\n3. **Perform Batch Correction:** Applies batch correction to mitigate the effects of different experimental batches in the data.\n4. **Embed and Visualize the Data:** Generates dimensionality reduction embeddings (MDE, UMAP) and visualizes the data colored by cell type and batch to assess the effectiveness of the batch correction.\n\n**Step-by-Step Breakdown:**\n\n1. **Import Libraries:**\n * `import omicverse as ov`: Imports the `omicverse` library, which is likely a collection of tools for single-cell data analysis. It's aliased as `ov` for brevity.\n * `from omicverse.utils import mde`: Imports the `mde` (Minimum-Distortion Embedding) function from the `omicverse.utils` module. This function is used for dimensionality reduction.\n * `import scanpy as sc`: Imports the `scanpy` library, a widely used package for single-cell analysis, aliased as `sc`.\n\n2. **Set Up Environment:**\n * `workdir = 'result_human_pancreas'`: Defines the working directory where results will be saved.\n * `ov.utils.ov_plot_set()`: Configures plotting parameters using a function from `omicverse.utils`. This likely sets style defaults for plots.\n\n3. **Read Data:**\n * `adata=ov.utils.read('simba_adata_raw.h5ad')`: Reads the scRNA-seq data from an `h5ad` file named 'simba_adata_raw.h5ad' using the `read` function from `omicverse.utils`. The data is stored as an `AnnData` object in the variable `adata`. `AnnData` is a common data structure used in scRNA-seq analysis.\n\n4. 
**Initialize pySIMBA Object:**\n * `simba_object=ov.single.pySIMBA(adata,workdir)`: Creates an instance of the `pySIMBA` class from `omicverse.single`, passing the `AnnData` object and the working directory as arguments. This initializes the SIMBA analysis.\n\n5. **Preprocess Data for SIMBA:**\n * `simba_object.preprocess(batch_key='batch',min_n_cells=3, method='lib_size',n_top_genes=3000,n_bins=5)`: Preprocesses the data within the `simba_object`. This likely involves normalization (using `lib_size` method), filtering out genes detected in fewer than 3 cells (`min_n_cells=3`), selecting highly variable genes (`n_top_genes=3000`), binning gene expression (`n_bins=5`), and identifying batch effects based on the column 'batch' in the AnnData.\n\n6. **Generate SIMBA Graph:**\n * `simba_object.gen_graph()`: Creates the graph structure for the SIMBA model based on the preprocessed data.\n\n7. **Train SIMBA Model:**\n * `simba_object.train(num_workers=6)`: Trains the SIMBA model using 6 parallel worker processes for faster computation.\n\n8. **Load Trained SIMBA Model:**\n * `simba_object.load('result_human_pancreas/pbg/graph0')`: Loads a pre-trained SIMBA model from a specified path, likely from a previous run. This step suggests that the `train()` step may be skipped in some cases by using a pre-trained model.\n\n9. **Batch Correction:**\n * `adata=simba_object.batch_correction()`: Applies the trained SIMBA model to correct batch effects in the data, updating the `adata` object. The corrected data is stored back in the `adata` object.\n\n10. **Display Corrected Data:**\n * `adata`: Displays the `AnnData` object, which now contains the corrected data.\n\n11. **Calculate and Store MDE Embedding:**\n * `adata.obsm[\"X_mde\"] = mde(adata.obsm[\"X_simba\"])`: Calculates the MDE embedding from the SIMBA representation (likely stored in `adata.obsm[\"X_simba\"]`) and stores it in the `adata.obsm` dictionary with the key `\"X_mde\"`. `obsm` is a space within `AnnData` to store embeddings.\n\n12. 
**Visualize MDE Embedding:**\n * `sc.pl.embedding(adata,basis='X_mde',color=['cell_type1','batch'])`: Generates an embedding plot using the MDE embedding as the basis (`'X_mde'`). It visualizes the plot coloring data points (cells) based on `'cell_type1'` and `'batch'` annotations, allowing assessment of whether batch correction worked.\n\n13. **Neighbor Graph & UMAP Calculation**\n * `sc.pp.neighbors(adata, use_rep=\"X_simba\")`: Calculates the neighbor graph based on the X_simba representation. This graph is then used in UMAP calculation.\n * `sc.tl.umap(adata)`: Computes the UMAP (Uniform Manifold Approximation and Projection) embedding.\n\n14. **Visualize UMAP Embedding:**\n * `sc.pl.umap(adata,color=['cell_type1','batch'])`: Generates a UMAP plot, again colored by `'cell_type1'` and `'batch'`, to visualize the corrected data in a different embedding space.\n\n**In Summary:**\n\nThis script is a complete workflow for performing batch correction and data visualization on scRNA-seq data. It leverages the `omicverse` library for SIMBA-based analysis and batch correction, and utilizes `scanpy` for visualization and further embedding analysis via UMAP. The output of this script would be a series of plots comparing corrected data, visualized with two different embedding techniques (MDE and UMAP) and colored by cell type and batch, allowing users to evaluate the effectiveness of the batch correction.\n\n**Key Concepts:**\n\n* **AnnData:** A data structure common in single-cell analysis.\n* **Batch Effect:** Variation introduced by processing samples in different batches.\n* **SIMBA:** A graph-based method for single-cell data analysis.\n* **MDE (Minimum-Distortion Embedding):** A dimensionality reduction technique.\n* **UMAP (Uniform Manifold Approximation and Projection):** Another dimensionality reduction technique, often used for visualization.\n* **`obsm` (multi-dimensional observation annotations):** A dictionary inside an AnnData object to store representations of the data (e.g. 
embeddings)\n* **Single-cell RNA sequencing (scRNA-seq):** A technique that measures the gene expression of individual cells.\n\nThis analysis pipeline is designed to tackle batch effect issues present in scRNA-seq data, leading to visualizations of cell populations without the technical variation introduced by the batch the sample was processed in.", + "file": "t_simba_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_single2spatial_annotated.json b/OvStudent/Converted_Jsons/t_single2spatial_annotated.json new file mode 100644 index 00000000..9a9f44fd --- /dev/null +++ b/OvStudent/Converted_Jsons/t_single2spatial_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs spatial transcriptomics analysis using single-cell RNA-seq data as a reference. It leverages several key libraries for data manipulation, single-cell analysis, spatial omics, and visualization. Here's a breakdown of its functionality and structure:\n\n**Overall Purpose:**\n\nThe script aims to integrate single-cell gene expression data with spatial transcriptomics data to:\n\n1. **Infer spatial distribution of cell types:** It predicts the cell type composition at each spatial location (spot) in the spatial transcriptomics data based on the single-cell RNA-seq data.\n2. **Visualize spatial patterns of gene expression and cell types:** It generates spatial embedding plots that visualize the expression of specific genes and the spatial distribution of predicted cell types.\n\n**Detailed Breakdown:**\n\n**1. 
Import Libraries (Lines 1-5, 9):**\n\n * `import scanpy as sc`: Imports the Scanpy library, a common tool for single-cell RNA-seq analysis, providing functionalities for data manipulation, preprocessing, and visualization.\n * `import pandas as pd`: Imports the Pandas library, used for data manipulation and handling tabular data structures like DataFrames.\n * `import numpy as np`: Imports the NumPy library, providing support for numerical operations, particularly array manipulation.\n * `import omicverse as ov`: Imports the Omicverse library, specifically designed for spatial omics data analysis. It offers functionalities for integrating single-cell and spatial data, as well as custom plotting tools.\n * `import matplotlib.pyplot as plt`: Imports the Matplotlib library's Pyplot module, used for creating static and interactive plots.\n * `import anndata`: Imports the AnnData library, which provides a framework for representing and manipulating annotated data objects, which is very common for single cell and spatial data.\n\n**2. Set Plotting Style (Line 7):**\n\n * `ov.utils.ov_plot_set()`: Sets the plotting style using a custom function from the Omicverse library, ensuring consistent aesthetics across plots.\n\n**3. Read and Prepare Single-Cell Data (Lines 10-12):**\n\n * `raw_data=pd.read_csv('data/pdac/sc_data.csv', index_col=0)`: Reads the single-cell gene expression data from a CSV file (`sc_data.csv`) into a Pandas DataFrame. The `index_col=0` argument specifies that the first column should be used as the row index.\n * `single_data=anndata.AnnData(raw_data.T)`: Creates an `AnnData` object from the transposed DataFrame (`raw_data.T`). 
Transposing is often needed to arrange the data in the standard format required by `AnnData` where rows are observations (cells) and columns are variables (genes).\n * `single_data.obs = pd.read_csv('data/pdac/sc_meta.csv', index_col=0)[['Cell_type']]`: Reads the single-cell metadata from another CSV file (`sc_meta.csv`), and assigns the 'Cell_type' column to the `obs` attribute of the `single_data` `AnnData` object. The `obs` attribute typically contains metadata associated with each observation (cell).\n\n**4. Display Single-Cell Data Object (Line 13):**\n\n * `single_data`: Displays the `single_data` `AnnData` object, allowing the user to inspect its structure and content.\n\n**5. Read and Prepare Spatial Transcriptomics Data (Lines 15-17):**\n\n * `raw_data=pd.read_csv('data/pdac/st_data.csv', index_col=0)`: Reads the spatial transcriptomics gene expression data from a CSV file (`st_data.csv`) into a Pandas DataFrame, using the first column as index.\n * `spatial_data=anndata.AnnData(raw_data.T)`: Creates an `AnnData` object from the transposed spatial transcriptomics DataFrame.\n * `spatial_data.obs = pd.read_csv('data/pdac/st_meta.csv', index_col=0)`: Reads the spatial transcriptomics metadata from a CSV file (`st_meta.csv`) and assigns it to the `obs` attribute of the `spatial_data` `AnnData` object.\n\n**6. Display Spatial Data Object (Line 18):**\n\n * `spatial_data`: Displays the `spatial_data` `AnnData` object.\n\n**7. 
Initialize Single2Spatial Model (Lines 20-23):**\n\n * `st_model=ov.bulk2single.Single2Spatial(...)`: Initializes an instance of the `Single2Spatial` model from the Omicverse library.\n * `single_data=single_data`: Specifies the single-cell data `AnnData` object.\n * `spatial_data=spatial_data`: Specifies the spatial data `AnnData` object.\n * `celltype_key='Cell_type'`: Specifies the column name in the `obs` attribute of `single_data` containing the cell type annotations.\n * `spot_key=['xcoord','ycoord']`: Specifies the column names in the `obs` attribute of the `spatial_data` that contains the X and Y coordinates of each spot.\n\n**8. Train the Single2Spatial Model (Lines 27-31):**\n\n * `sp_adata=st_model.train(...)`: Trains the `Single2Spatial` model.\n * `spot_num=500`: Specifies the number of spatial spots to use during training.\n * `cell_num=10`: Specifies the cell number parameter for training.\n * `df_save_dir='data/pdac/predata_net/save_model'` : Specifies the directory where the model should be saved after training.\n * `df_save_name='pdac_df'` : Specifies the name to use when saving the model.\n * `k=10,num_epochs=1000,batch_size=1000,predicted_size=32`: Specifies various training parameters, including the number of neighbors (`k`), the number of training epochs (`num_epochs`), batch size (`batch_size`), and the latent dimension of the model (`predicted_size`).\n * The result `sp_adata` is the AnnData object storing spatial predictions\n\n**9. Load pre-trained model (Lines 34-35):**\n * `sp_adata=st_model.load(...)`: Loads a pre-trained `Single2Spatial` model from a saved file.\n * `modelsize=14478`: Specifies the model size when loading model.\n * `df_load_dir='data/pdac/predata_net/save_model/pdac_df.pth'`: Specifies the path to the saved model file.\n * `k=10,predicted_size=32`: Specifies the loading parameters such as the number of neighbours and predicted size.\n * The result `sp_adata` is the AnnData object storing spatial predictions\n\n**10. 
Perform Spatial Spot Assessment (Line 37):**\n\n * `sp_adata_spot=st_model.spot_assess()`: Performs spatial spot assessment using the trained model. The result `sp_adata_spot` is the AnnData object storing the spatial spot assessments\n\n**11. Create Spatial Embedding Plots (Lines 39-44, 49-54, 59-65, 70-77):**\n\n * `sc.pl.embedding(...)`: This part of the code generates spatial embedding plots using a function provided by Scanpy's plotting module. It's used multiple times with slight variations.\n * `sp_adata` or `sp_adata_spot`: Specifies the AnnData object to use for the plot.\n * `basis=\"X_spatial\"`: Specifies the spatial coordinates for plotting.\n * `color=[...]`: Specifies which features to use for coloring the points in the plot. This can include genes or metadata.\n * `frameon=False`: Disables the plot frame.\n * `ncols=4`: Specifies the number of columns for the subplot layout.\n * `show=False`: Prevents the plots from being displayed directly, they will be returned to the function call to be displayed later\n * `palette=ov.utils.ov_palette()[11:]`: Specifies a custom color palette for the \"Cell_type\" plot using a color palette provided in the omicverse library.\n * The plots are generated to visualize:\n * Spatial expression of genes in original spatial data (lines 39-44)\n * Spatial expression of genes in the inferred spatial spot data (lines 49-54)\n * Spatial distributions of inferred cell types in the inferred spatial spot data (lines 59-65)\n * Spatial distribution of cell types in the inferred spatial data (lines 70-77)\n\n**In summary, the script:**\n\n1. **Loads and prepares** single-cell and spatial transcriptomics data.\n2. **Trains a model** to predict cell type compositions in the spatial data based on the single-cell reference.\n3. **Loads a pre-trained model** if the model has been trained before.\n4. **Assess the spatial spot information** using the trained model\n5. 
**Visualizes** the spatial distributions of gene expression and inferred cell types in different `AnnData` objects.\n\nThis workflow is commonly used in spatial transcriptomics analysis to leverage the cellular resolution of single-cell RNA-seq to understand the spatial organization of tissues and their cellular composition, which has applications in biology, diagnostics, and drug discovery.", + "file": "t_single2spatial_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_single_batch_annotated.json b/OvStudent/Converted_Jsons/t_single_batch_annotated.json new file mode 100644 index 00000000..1ade61f8 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_single_batch_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a comprehensive single-cell RNA-seq analysis, including data loading, preprocessing, batch correction, dimensionality reduction, and evaluation using the `omicverse`, `scanpy`, and `scib_metrics` libraries. Here's a breakdown of its functionality and structure:\n\n**Overall Workflow:**\n\n1. **Data Loading and Preparation:**\n - Loads three AnnData objects from H5AD files (`neurips2021_s1d3.h5ad`, `neurips2021_s2d1.h5ad`, `neurips2021_s3d7.h5ad`).\n - Adds a 'batch' column to each AnnData object indicating the source of the data.\n - Concatenates the three AnnData objects into a single `adata` object.\n - Casts the expression matrix to integer type.\n\n2. **Quality Control (QC):**\n - Performs QC filtering based on thresholds for mitochondrial percentage, number of UMIs, and number of detected genes, taking batch effects into account.\n\n3. **Preprocessing:**\n - Preprocesses the data using a shift-log transformation and selects highly variable genes (HVGs).\n\n4. **Data Storage and Subset:**\n - Stores the original data in the `.raw` attribute of the AnnData object.\n - Subsets the AnnData object to only include the selected HVGs.\n\n5. 
**Data Scaling and PCA:**\n - Scales the expression matrix.\n - Performs Principal Component Analysis (PCA) on the scaled data.\n\n6. **Dimensionality Reduction (MDE):**\n - Applies Minimum-Distortion Embedding (MDE) to the PCA results.\n\n7. **Batch Correction:**\n - Applies several batch correction methods:\n - **Harmony:** Integrates the datasets by aligning shared cell populations.\n - **ComBat:** Corrects for batch effects using a parametric Bayesian method.\n - **Scanorama:** Integrates the data through a matching-based approach.\n - **scVI:** Employs a variational autoencoder approach to integrate the data.\n - After each batch correction, it applies MDE to the corrected data and visualizes the embedding.\n\n8. **Topic Modeling (LDA):**\n - Initializes an LDA topic model using the expression data and considering batch effects.\n - Plots topic contributions and predicts topics for each cell.\n - Applies MDE to the topic composition and UMAP feature embeddings.\n\n9. **Data Saving**\n - Saves the processed data to two separate H5AD files to disk\n - Reads one of the files to continue analysis.\n\n10. **Batch Correction Evaluation:**\n - Initializes a Benchmarker object to evaluate different batch correction methods using `scib_metrics`.\n - Runs the benchmark and generates a results table.\n\n**Line-by-Line Explanation:**\n\n- **Lines 1-2:** Imports necessary libraries, `omicverse` as `ov` and `scanpy` as `sc`. `omicverse` seems to be built on top of `scanpy`.\n- **Line 5:** Sets the plotting style for `omicverse`.\n- **Lines 7-12:** Reads three AnnData files and adds a batch annotation to each one, indicating its origin.\n- **Line 14:** Concatenates the three AnnData objects into a single AnnData object. 
The `merge='same'` argument indicates that the same variables (genes) should be merged.\n- **Line 15:** Prints the concatenated AnnData object.\n- **Line 17:** Prints the unique batch values present in the `obs` data of the concatenated AnnData object.\n- **Line 19:** Imports the `numpy` library for numerical operations.\n- **Line 20:** Casts the values in the `.X` attribute (data matrix) to a 64-bit integer type. This is a common practice before applying certain analysis methods.\n- **Line 22:** Performs quality control filtering using the `ov.pp.qc` function with specified parameters to remove low-quality cells and genes.\n- **Line 24:** Prints the quality-controlled AnnData object.\n- **Line 26:** Preprocesses the data using `ov.pp.preprocess`. This function performs a shift-log transformation and selects the top 3000 highly variable genes (HVGs) using the 'pearson' method. `batch_key=None` indicates that batch effects are not considered during HVG selection at this stage.\n- **Line 28:** Prints the preprocessed AnnData object.\n- **Line 30:** Copies the processed AnnData to `adata.raw`. This is often done before further manipulations, so the original normalized data is retained.\n- **Line 31:** Subsets the AnnData object to only include the highly variable genes selected in the previous step by accessing the `highly_variable_features` annotation from the `.var` (variables) attribute.\n- **Line 32:** Prints the subset AnnData object.\n- **Line 34:** Saves the current AnnData object to an H5AD file using gzip compression.\n- **Line 36:** Scales the expression data in the AnnData object using `ov.pp.scale`.\n- **Line 37:** Performs PCA on the scaled expression data. `mask_var='highly_variable_features'` specifies that the PCA should only be done on the highly variable genes.\n- **Line 39:** Calculates an MDE embedding of the PCA results. 
MDE is a non-linear dimensionality reduction technique.\n- **Line 41:** Generates a scatter plot embedding of the MDE-transformed PCA data, coloring points by 'batch' and 'cell_type', without displaying the plot, so it is saved.\n- **Lines 43-44, 46-48, 50-51, 53-55, 57-58, 60-62, 64-65, 67-69:** These are similar sets of lines, performing batch correction using different methods (Harmony, ComBat, Scanorama, scVI), applying MDE, and visualizing the results. The batch corrected results are stored as `.obsm` embeddings.\n- **Lines 71-73, 75:** Sets up, executes, and plots the results of the LDA topic model.\n- **Line 77-78:** Calculates MDE embeddings for the LDA topic compositions and UMAP features and stores the results in `.obsm`.\n- **Lines 80-83:** Creates embedding plots of MDE-transformed topic and feature embeddings color coded by batch and cell type.\n- **Line 85:** Saves the final AnnData object containing the results of all batch correction methods to an H5AD file.\n- **Line 87:** Reads the saved AnnData object from file.\n- **Lines 89-91:** Copies the PCA, topic composition and UMAP feature embeddings to new `.obsm` keys for ease of access.\n- **Lines 93-94:** Imports and initializes the `Benchmarker` object from the `scib_metrics` library. This object is used for evaluating and comparing the different batch correction methods. 
It will measure the integration and conservation of the data after the application of each method based on the given parameters.\n- **Line 99:** Runs the benchmark, which computes metrics to assess the quality of each batch correction method.\n- **Line 101:** Generates and displays a table summarizing the benchmark results without min max scaling.\n\n**Key Concepts:**\n\n* **AnnData:** A data structure used in `scanpy` to store single-cell gene expression data along with annotations.\n* **Batch Effects:** Systematic biases in data that arise from technical differences between experiments or batches.\n* **Highly Variable Genes (HVGs):** Genes that show the most variation across cells. These genes are often used to reduce the dimensionality of the data.\n* **PCA (Principal Component Analysis):** A dimensionality reduction technique that finds the principal axes of variation in the data.\n* **MDE (Minimum-Distortion Embedding):** A non-linear dimensionality reduction method.\n* **Batch Correction:** Techniques that are used to remove batch effects from the data.\n* **LDA (Latent Dirichlet Allocation):** A topic modeling algorithm that can be used to identify patterns in gene expression data.\n* **scib_metrics:** A library providing metrics to evaluate the quality of batch correction methods.\n\n**In summary, this script is a complete workflow for analyzing single-cell RNA-seq data by merging multiple datasets, performing quality control, batch correction, dimensionality reduction, and evaluation. 
It provides a comprehensive example of how to leverage the capabilities of `omicverse`, `scanpy` and `scib_metrics` for single-cell data analysis.**", + "file": "t_single_batch_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_tcga_annotated.json b/OvStudent/Converted_Jsons/t_tcga_annotated.json new file mode 100644 index 00000000..ce6518ca --- /dev/null +++ b/OvStudent/Converted_Jsons/t_tcga_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script processes gene expression and clinical data from the Cancer Genome Atlas (TCGA), specifically for Ovarian Cancer (OV). It uses the `omicverse` and `scanpy` libraries for data handling and analysis. Here's a breakdown of its functionality and structure:\n\n**Functionality:**\n\n1. **Data Loading & Initialization:**\n - The script starts by importing the necessary libraries: `omicverse` (as `ov`) and `scanpy` (as `sc`). It sets plotting parameters using `ov.plot_set()`.\n - It defines paths to three input files:\n - `gdc_sample_sheep`: A sample sheet from the GDC (Genomic Data Commons). This likely contains metadata about the samples.\n - `gdc_download_files`: The directory containing the downloaded raw data files from the GDC. This would hold gene expression data.\n - `clinical_cart`: A clinical data file.\n - It then creates an `omicverse.bulk.pyTCGA` object, which is presumably designed to handle TCGA data. This object is named `aml_tcga`.\n - It initializes the AnnData object within the `pyTCGA` object using `aml_tcga.adata_init()`. AnnData is a data structure from the `scanpy` library for storing and manipulating single-cell or bulk gene expression data.\n - The script saves this initial AnnData object to an HDF5 file (`ov_tcga_raw.h5ad`) for later use.\n\n2. 
**Re-loading and Metadata Initialization:**\n - The script repeats the assignment of file paths, possibly to ensure a clean execution or to show that paths are constants even if the variable is reused.\n - It re-creates the `aml_tcga` object using the same input files, indicating that the previous steps were primarily focused on file loading and preparation.\n - Instead of re-initializing the AnnData object, it reads the previously saved `ov_tcga_raw.h5ad` file into the `aml_tcga` object, demonstrating the load-save cycle.\n - It then initializes metadata in the AnnData object using `aml_tcga.adata_meta_init()`, which likely involves parsing and adding sample-level metadata from the `gdc_sample_sheet` and `clinical_cart` files.\n - Survival information is initialized within the `pyTCGA` object, likely preparing to perform survival analysis later.\n\n3. **Survival Analysis:**\n - The script accesses the AnnData object via `aml_tcga.adata` (although this line doesn't do anything by itself, it does demonstrate how to get the AnnData object).\n - A survival analysis is performed for a specific gene, `MYC`, using DESeq normalized data. The result is plotted using `plot=True`.\n - A survival analysis is conducted for **all** genes in the dataset, using the `aml_tcga.survial_analysis_all()`. This implies the analysis computes survival statistics for each gene independently and likely adds that to the AnnData.\n - Again the AnnData object is accessed.\n - Finally, the modified AnnData object, including the survival analysis results, is saved to a new HDF5 file (`ov_tcga_survial_all.h5ad`).\n\n**Structure:**\n\nThe script is structured in a linear, procedural way, with a logical flow:\n\n1. **Import and setup:** Libraries are imported, and basic parameters are set.\n2. **Data loading & initialization:** Raw data and metadata are loaded and combined into an AnnData object within a `pyTCGA` object.\n3. 
**Data persistence:** An initial version of the AnnData object is saved to disk.\n4. **Metadata integration:** Metadata is integrated into the AnnData object and the AnnData object is reloaded.\n5. **Survival analysis:** Survival analysis is performed for a specific gene and then for all genes.\n6. **Results persistence:** The final version of the AnnData object, including survival analysis results, is saved to disk.\n\n**Key takeaways:**\n\n- This script is designed to handle TCGA data.\n- It uses the `omicverse` library, which seems to provide a higher-level interface for working with genomic data, and leverages `scanpy` for its AnnData structure.\n- It performs a core set of operations: loading, pre-processing, and analysis, followed by saving of the result.\n- The main goal appears to be performing survival analysis on the gene expression data, a common task in cancer research.\n- The use of `h5ad` files allows for efficient storage and loading of the AnnData object, making it easy to resume analyses later.\n\n**In summary:** This script is a typical bioinformatics workflow for analyzing bulk gene expression data from the TCGA, focusing specifically on ovarian cancer. It leverages powerful Python libraries to handle data management, processing, and visualization.", + "file": "t_tcga_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_tosica_annotated.json b/OvStudent/Converted_Jsons/t_tosica_annotated.json new file mode 100644 index 00000000..ed70a062 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_tosica_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell data analysis workflow, primarily using the `omicverse` (likely for cell state prediction) and `scanpy` libraries. It loads reference and query datasets, trains a cell state prediction model, and applies it to the query dataset before performing downstream analyses. 
Let's break down the functionality step by step:\n\n**Data Loading and Preprocessing (Lines 1-17):**\n\n1. **Import Libraries:** Imports `omicverse` (as `ov`) and `scanpy` (as `sc`) for single-cell data handling and analysis.\n2. **Set Plotting Style:** Sets a specific plotting style for `omicverse`.\n3. **Load Reference Data:** Reads a reference dataset from `demo_train.h5ad` (an AnnData object) using `scanpy`.\n4. **Subset Reference Data (Var):** Ensures that all gene data in `ref_adata` is retained by explicitly indexing with its own `var_names`.\n5. **Print Reference Data:** Prints the contents of the `ref_adata` AnnData object.\n6. **Print Cell Type Counts (Reference):** Prints the number of cells belonging to each cell type in the reference data.\n7. **Load Query Data:** Reads a query dataset from `demo_test.h5ad` (an AnnData object) using `scanpy`.\n8. **Subset Query Data (Var):** Subsets `query_adata` to only contain genes that are also present in `ref_adata`. This ensures feature consistency between the two datasets.\n9. **Print Query Data:** Prints the contents of the `query_adata` AnnData object.\n10. **Print Cell Type Counts (Query):** Prints the number of cells belonging to each cell type in the query data.\n11. **Make Variable Names Unique (Reference & Query):** Ensures that gene names in `ref_adata` and `query_adata` are unique.\n12. **Find Common Genes:** Finds the intersection of genes present in both `query_adata` and `ref_adata` and stores it in `ret_gene`.\n13. **Subset Data by Common Genes:** Subsets `query_adata` and `ref_adata` to only contain the genes present in `ret_gene`. This ensures both datasets have the same gene set for downstream analysis.\n\n**TOSICA Model Training and Prediction (Lines 18-28):**\n\n14. **Print Data Max Value:** Prints the maximum values in the expression matrices of `ref_adata` and `query_adata`.\n15. **Download Gene Set:** Downloads the TOSICA gene set file for pathway analysis.\n16. 
**Initialize TOSICA Object:** Initializes a `pyTOSICA` object using the preprocessed `ref_adata`.\n * Specifies gene set path (`genesets/GO_bp.gmt`).\n * Sets the depth parameter for pathway analysis.\n * Defines the label name ('Celltype') for training the model.\n * Specifies the project path.\n * Sets a batch size.\n17. **Train TOSICA Model:** Trains the TOSICA model for 5 epochs.\n18. **Save TOSICA Model:** Saves the trained model.\n19. **Load TOSICA Model:** Loads the saved model.\n20. **Predict Cell States:** Predicts cell states for the `query_adata` using the trained TOSICA model and stores the results in a new AnnData object called `new_adata`.\n\n**Query Data Post-processing and Embedding (Lines 29-37):**\n\n21. **Scale Query Data:** Scales the gene expression values of the query data using a method from `omicverse`.\n22. **Perform PCA:** Performs Principal Component Analysis (PCA) on the scaled data of `query_adata`, keeping 50 principal components.\n23. **Compute Neighborhood Graph:** Calculates the nearest neighbor graph based on the scaled PCA data.\n24. **Compute Multidimensional Embedding (MDE):** Computes a multidimensional embedding (MDE) on the scaled PCA data and adds it to the `query_adata.obsm` as \"X_mde\".\n25. **Print Query Data:** Prints the modified `query_adata`.\n26. **Copy Embeddings:** Copies the `obsm` (embedding matrices) and `obsp` (sparse distance matrices) from the relevant observations in `query_adata` to `new_adata` based on matching indices in the obs slot. This links the prediction results to the dimensional reduction and neighbor graphs\n27. **Print New Data:** Prints the modified `new_adata` which now contains the embedding, neighbors, and prediction.\n\n**Visualization and Downstream Analysis (Lines 38-75):**\n\n28. **Import Numpy:** Imports the `numpy` library as `np`.\n29. **Define Color Array:** Creates a NumPy array of hex color codes to be used for plotting.\n30. 
**Define Cell Types Tuple:** Creates a tuple of cell type names in a desired order.\n31. **Convert Prediction to Categorical:** Converts the 'Prediction' column in `new_adata.obs` to a categorical type.\n32. **Reorder Prediction Categories:** Reorders the categories of 'Prediction' in `new_adata` based on the defined cell type tuple.\n33. **Set Prediction Colors:** Assigns a color palette to the 'Prediction' annotation in `new_adata.uns` using color codes from the `col` array.\n34. **Update Cell Types Tuple:** Creates a tuple of cell types including \"MHC class II\" in a desired order.\n35. **Convert Celltype to Categorical:** Converts the 'Celltype' column in `new_adata.obs` to a categorical type.\n36. **Reorder Celltype Categories:** Reorders the categories of 'Celltype' in `new_adata` based on the updated cell type tuple.\n37. **Set Celltype Colors:** Assigns a color palette to the 'Celltype' annotation in `new_adata.uns`.\n38. **Create Embedding Plot 1:** Generates an embedding plot of the \"X_mde\" visualization, colored by both 'Celltype' and 'Prediction' annotations using `scanpy.pl.embedding`.\n39. **Identify Rare Cell Types:** Identifies cell types in the 'Prediction' column that have fewer than 5 cells.\n40. **Remove Rare Cell Types:** Removes cells from `new_adata` that have a 'Prediction' that is in the list of rare cell types from previous step.\n41. **Perform Differential Gene Expression Analysis:** Performs differential gene expression analysis using the Wilcoxon rank-sum test for each predicted cell type in `new_adata`.\n42. **Create Dotplot:** Creates a dotplot to visualize the top ranked genes identified in the differential expression analysis for each predicted cell type.\n43. **Get DEGs for PP Group:** Gets the differentially expressed genes for the 'PP' group (specific cell type) from the results of `rank_genes_groups` that pass a p-value cutoff.\n44. **Print DEGs:** Prints the first 5 rows of the DEG dataframe.\n45. 
**Create Embedding Plot 2:** Generates an embedding plot of the \"X_mde\" visualization, colored by 'Prediction' and 'GOBP_REGULATION_OF_MUSCLE_SYSTEM_PROCESS' gene set score using `scanpy.pl.embedding`.\n\n**Summary:**\n\nIn summary, the script takes single-cell gene expression data, uses a trained model to predict cell states, performs some downstream analysis, and finally generates some visualizations. The steps of data loading, subsetting, dimensionality reduction, embedding, and cell type predictions can be commonly observed in single-cell analysis pipelines. The usage of TOSICA and GO gene sets suggests a specific focus on cell state analysis and functional pathway enrichment. The script showcases both `omicverse` and `scanpy` functionalities for analyzing and visualizing single-cell data.", + "file": "t_tosica_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_traj_annotated.json b/OvStudent/Converted_Jsons/t_traj_annotated.json new file mode 100644 index 00000000..6f605024 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_traj_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs trajectory inference on single-cell RNA sequencing data using a combination of libraries: `scanpy`, `scvelo`, `matplotlib`, and `omicverse`. 
It aims to identify developmental trajectories and gene expression changes along those trajectories.\n\nHere's a breakdown of the functionality and structure, line by line, with explanations:\n\n**Importing Libraries (Lines 1-4):**\n\n* **`import scanpy as sc`**: Imports the Scanpy library, a fundamental tool for single-cell analysis, aliased as `sc`.\n* **`import scvelo as scv`**: Imports the Scvelo library, which focuses on RNA velocity analysis, aliased as `scv`.\n* **`import matplotlib.pyplot as plt`**: Imports the pyplot module from Matplotlib, used for creating visualizations, aliased as `plt`.\n* **`import omicverse as ov`**: Imports the Omicverse library, which builds upon Scanpy for advanced single-cell analysis and offers a unified framework, aliased as `ov`.\n\n**Setting Plotting Style (Line 5):**\n\n* **`ov.plot_set()`**: Sets a consistent plotting style using the Omicverse library, ensuring uniformity in the appearance of plots.\n\n**Loading and Preprocessing Data (Lines 7-13):**\n\n* **`import scvelo as scv`**: This line is redundant, as `scvelo` was already imported on line 2.\n* **`adata=scv.datasets.dentategyrus()`**: Loads the dentategyrus dataset from the Scvelo library, which is a single-cell dataset for demonstrating RNA velocity. The data is stored in an `AnnData` object (a Scanpy data structure) called `adata`.\n* **`adata`**: Displays the loaded `AnnData` object. This likely prints its basic structure to the console.\n* **`adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=3000,)`**: Preprocesses the data using Omicverse's `preprocess` function. The `mode='shiftlog|pearson'` setting applies shift-log normalization and then selects highly variable genes using Pearson residuals. `n_HVGs=3000` specifies that the top 3000 most variable genes should be selected.\n* **`adata.raw = adata`**: Stores a copy of the processed data into the `.raw` attribute of the `AnnData` object.
This is a standard Scanpy practice to preserve the original processed data for later use.\n* **`adata = adata[:, adata.var.highly_variable_features]`**: Selects only the highly variable genes for further analysis. The `adata.var.highly_variable_features` attribute is a boolean mask indicating which genes are deemed highly variable during preprocessing.\n* **`ov.pp.scale(adata)`**: Scales the data using Omicverse's `scale` function, often by centering to 0 and scaling to unit variance. This is useful for methods that are sensitive to different scales.\n\n**Dimensionality Reduction and Visualization (Lines 14-16):**\n\n* **`ov.pp.pca(adata,layer='scaled',n_pcs=50)`**: Performs Principal Component Analysis (PCA) on the scaled data. `n_pcs=50` specifies that it should compute the first 50 principal components.\n* **`ov.utils.plot_pca_variance_ratio(adata)`**: Plots the variance ratio explained by each principal component. This helps to visualize the importance of each PC and informs on how many PCs are needed to retain information.\n\n**Trajectory Inference and Pseudotime Analysis (Lines 18-77):**\n\nThis section performs trajectory inference and analysis with different methods:\n\n* **Diffusion Maps:**\n * **`Traj=ov.single.TrajInfer(adata,basis='X_umap',groupby='clusters', use_rep='scaled|original|X_pca',n_comps=50,)`**: Initializes a Trajectory Inference object from Omicverse.\n * **`Traj.set_origin_cells('nIPC')`**: Sets the origin cells for the trajectory inference, in this case, a cell type/cluster called \"nIPC.\"\n * **`Traj.inference(method='diffusion_map')`**: Performs trajectory inference using the diffusion map algorithm.\n * **`ov.utils.embedding(adata,basis='X_umap',color=['clusters','dpt_pseudotime'],frameon='small',cmap='Reds')`**: Plots an embedding (likely UMAP in this context) colored by cluster and Diffusion Pseudotime.\n * **`ov.utils.cal_paga(adata,use_time_prior='dpt_pseudotime',vkey='paga',groups='clusters')`**: Computes PAGA (Partition-based 
graph abstraction) using Diffusion Pseudotime as a prior.\n * **`ov.utils.plot_paga(adata,basis='umap', size=50, alpha=.1,title='PAGA LTNN-graph',min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False)`**: Plots the PAGA graph.\n\n* **Slingshot:**\n * **`Traj=ov.single.TrajInfer(adata,basis='X_umap',groupby='clusters', use_rep='scaled|original|X_pca',n_comps=50)`**: Initializes a new Trajectory Inference object.\n * **`Traj.set_origin_cells('nIPC')`**: Sets origin cells.\n * **`#Traj.set_terminal_cells([\"Granule mature\",\"OL\",\"Astrocytes\"])`**: This line is commented out and terminal cells are defined on the Palantir trajectory inference, line 58.\n * **`Traj.inference(method='slingshot',num_epochs=1)`**: Performs trajectory inference using the Slingshot method.\n * **`fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))`**: Creates a grid of subplots using Matplotlib.\n * **`Traj.inference(method='slingshot',num_epochs=1,debug_axes=axes)`**: Runs slingshot again, passing debug axes to visualize the fitting process.\n * **`ov.utils.embedding(adata,basis='X_umap',color=['clusters','slingshot_pseudotime'], frameon='small',cmap='Reds')`**: Plots the embedding colored by cluster and Slingshot pseudotime.\n * **`sc.pp.neighbors(adata,use_rep='scaled|original|X_pca')`**: Computes the neighborhood graph for the cells based on the specified representation using scanpy.\n * **`ov.utils.cal_paga(adata,use_time_prior='slingshot_pseudotime',vkey='paga',groups='clusters')`**: Computes PAGA using Slingshot pseudotime as a prior.\n * **`ov.utils.plot_paga(adata,basis='umap', size=50, alpha=.1,title='PAGA Slingshot-graph',min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False)`**: Plots the PAGA graph.\n\n* **Palantir:**\n * **`Traj=ov.single.TrajInfer(adata,basis='X_umap',groupby='clusters', use_rep='scaled|original|X_pca',n_comps=50)`**: Creates another Trajectory inference object.\n * **`Traj.set_origin_cells('nIPC')`**: Sets origin cells.\n * 
**`Traj.set_terminal_cells([\"Granule mature\",\"OL\",\"Astrocytes\"])`**: Sets terminal cells\n * **`Traj.inference(method='palantir',num_waypoints=500)`**: Performs trajectory inference using the Palantir method, with 500 waypoints along the trajectory.\n * **`Traj.palantir_plot_pseudotime(embedding_basis='X_umap',cmap='RdBu_r',s=3)`**: Plots the Palantir pseudotime onto the embedding.\n * **`Traj.palantir_cal_branch(eps=0)`**: Calculates the branching structure of the Palantir trajectory.\n * **`ov.externel.palantir.plot.plot_trajectory(adata, \"Granule mature\",cell_color=\"palantir_entropy\", n_arrows=10, color=\"red\",scanpy_kwargs=dict(cmap=\"RdBu_r\"))`**: Plots the Palantir trajectory, highlighting cell states, branching points and trajectory directionality.\n * **`gene_trends = Traj.palantir_cal_gene_trends(layers=\"MAGIC_imputed_data\")`**: Calculates the trend of gene expression along the Palantir trajectory using a MAGIC imputed expression matrix.\n * **`genes = ['Cdca3','Rasl10a','Mog','Aqp4']`**: Defines a list of genes to plot.\n * **`Traj.palantir_plot_gene_trends(genes)`**: Plots the trends of the specified genes along the Palantir trajectory.\n\n**Final Visualization and Analysis (Lines 78-84):**\n\n* **`plt.show()`**: Displays all the generated plots.\n* **`ov.utils.cal_paga(adata,use_time_prior='palantir_pseudotime',vkey='paga', groups='clusters')`**: Calculates the PAGA graph using Palantir pseudotime as prior.\n* **`ov.utils.plot_paga(adata,basis='umap', size=50, alpha=.1,title='PAGA LTNN-graph',min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False)`**: Plots the PAGA graph based on Palantir pseudotime.\n\n**In Summary:**\n\nThis script performs a comprehensive analysis of single-cell data using multiple trajectory inference methods. It uses Omicverse as the unifying framework with Scanpy and other tools for data manipulation, visualization and advanced analysis such as trajectory inference. The analysis involves:\n\n1. 
**Data Loading and Preprocessing:** Loads data and performs filtering and transformations, scaling and PCA.\n2. **Trajectory Inference:** Applies diffusion maps, Slingshot, and Palantir methods to infer developmental trajectories.\n3. **Pseudotime and Visualization:** Visualizes the inferred trajectories, pseudotime gradients and gene expression along the trajectories.\n4. **PAGA Analysis:** Generates PAGA graphs to visualize cell connectivity and trajectory relationships based on different pseudotime measurements.\n5. **Gene trend analysis:** Uses Palantir to compute and visualize gene expression changes along the trajectory.\n\nThe script utilizes various plotting options to visualize the results effectively. The goal is to understand the dynamic changes within the single-cell dataset and the differentiation process.", + "file": "t_traj_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_via_annotated.json b/OvStudent/Converted_Jsons/t_via_annotated.json new file mode 100644 index 00000000..f6eb3d9a --- /dev/null +++ b/OvStudent/Converted_Jsons/t_via_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs trajectory inference and visualization on single-cell RNA sequencing (scRNA-seq) data using the `omicverse` and `scanpy` libraries. Here's a breakdown of its functionality and structure:\n\n**Overall Goal:**\n\nThe script aims to:\n\n1. **Load and preprocess scRNA-seq data:** Loads a hematopoiesis dataset, performs PCA for dimensionality reduction.\n2. **Perform trajectory inference:** Uses the `pyVIA` algorithm (from `omicverse`) to infer cell trajectories and pseudotime.\n3. **Visualize the results:** Generates various plots to visualize the inferred trajectories, pseudotime, gene expression trends, and lineage probabilities.\n\n**Step-by-Step Explanation:**\n\n1. 
**Import Libraries:**\n * `import omicverse as ov`: Imports the `omicverse` library, which provides functions for single-cell analysis, including data loading and trajectory inference. It's aliased as `ov`.\n * `import scanpy as sc`: Imports the `scanpy` library, which is a popular Python package for single-cell analysis, and is aliased as `sc`. It's used for data manipulation and some plotting functions.\n * `import matplotlib.pyplot as plt`: Imports the plotting library `matplotlib.pyplot`, which is aliased as `plt`, for creating and manipulating plots.\n\n2. **Set Plotting Style:**\n * `ov.utils.ov_plot_set()`: Sets the plotting style using `omicverse`'s utility, providing a consistent aesthetic for the generated figures.\n\n3. **Load Data:**\n * `adata = ov.single.scRNA_hematopoiesis()`: Loads the scRNA-seq hematopoiesis dataset from `omicverse` and stores it as an `AnnData` object named `adata`. `AnnData` is a common data structure for single-cell data in Python.\n\n4. **Principal Component Analysis (PCA):**\n * `sc.tl.pca(adata, svd_solver='arpack', n_comps=200)`: Performs PCA on the loaded data to reduce dimensionality. It uses the 'arpack' solver (efficient for sparse matrices) and computes 200 principal components, which are stored in `adata.obsm['X_pca']`.\n\n5. **Display AnnData:**\n * `adata`: Displays the `AnnData` object `adata`, showing basic information about the data.\n\n6. 
**Initialize pyVIA:**\n * `v0 = ov.single.pyVIA(adata=adata,adata_key='X_pca',adata_ncomps=80, basis='tsne', clusters='label',knn=30,random_seed=4,root_user=[4823])`: Initializes the `pyVIA` object (from `omicverse`) for trajectory inference:\n * `adata`: The `AnnData` object.\n * `adata_key='X_pca'`: Specifies that PCA results (`adata.obsm['X_pca']`) are used as the basis for trajectory inference.\n * `adata_ncomps=80`: Uses the first 80 principal components.\n * `basis='tsne'`: Specifies the t-SNE embedding used as the 2D basis for visualizing the inferred trajectory.\n * `clusters='label'`: Uses cell labels for cluster information.\n * `knn=30`: Sets the number of nearest neighbors for the KNN graph construction.\n * `random_seed=4`: Sets the random seed for reproducibility.\n * `root_user=[4823]`: Sets the root cell(s) for pseudotime calculation.\n\n7. **Run pyVIA:**\n * `v0.run()`: Executes the pyVIA trajectory inference algorithm on the initialized object. This computes pseudotime values, cell lineages and other trajectory-related data.\n\n8. **t-SNE Embedding Plot:**\n * `fig, ax = plt.subplots(1,1,figsize=(4,4))`: Creates a figure and axes for plotting with a specified size.\n * `sc.pl.embedding(...)`: Generates a 2D embedding plot using t-SNE coordinates (specified by `basis=\"tsne\"`) from the `adata` object. Cells are colored by their `'label'`. The plot's appearance is controlled by parameters such as `frameon`, `ncols`, `wspace`, and the `ax` object.\n * `fig.savefig('figures/via_fig1.png',dpi=300,bbox_inches = 'tight')`: Saves the t-SNE plot as a PNG file.\n\n9. **Pie Chart Graph:**\n * `fig, ax, ax1 = v0.plot_piechart_graph(...)`: Generates a pie chart graph representing the inferred lineages/clusters. The pie chart's colors are defined by the 'Reds' colormap and other visual parameters.\n * `fig.savefig('figures/via_fig2.png',dpi=300,bbox_inches = 'tight')`: Saves the pie chart graph as a PNG file.\n\n10.
**Extract Pseudotime:**\n * `v0.get_pseudotime(v0.adata)`: Extracts the computed pseudotime values from the pyVIA model, and attaches them to the `AnnData` object stored in `v0.adata`.\n\n11. **Display pyVIA AnnData:**\n * `v0.adata`: Displays the modified `AnnData` object stored within the `pyVIA` object, now including pseudotime and other pyVIA derived information.\n\n12. **Gene List:**\n * `gene_list_magic = [...]`: Defines a list of genes that will be used for subsequent analysis and plotting.\n\n13. **Cluster Graph with Gene Expression:**\n * `fig,axs=v0.plot_clustergraph(...)`: Generates a cluster graph (likely a UMAP or similar) showing the expression of the first 4 genes in the `gene_list_magic` across the clusters\n * `fig.savefig('figures/via_fig2_1.png',dpi=300,bbox_inches = 'tight')`: Saves the cluster graph plot as a PNG file.\n\n14. **Trajectory GAMs:**\n * `fig,ax1,ax2=v0.plot_trajectory_gams(...)`: Generates plots showing the expression of genes or pseudotime along the trajectory in GAMs.\n * `fig.savefig('figures/via_fig3.png',dpi=300,bbox_inches = 'tight')`: Saves the trajectory GAM plot as a PNG file.\n\n15. **Trajectory Stream Plot:**\n * `fig,ax=v0.plot_stream(...)`: Generates a stream plot on the t-SNE embedding, visualizing the direction and magnitude of the inferred trajectory. Cells are colored by clusters (`clusters='label'`).\n * `fig.savefig('figures/via_fig4.png',dpi=300,bbox_inches = 'tight')`: Saves the stream plot as a PNG file.\n\n16. **Trajectory Stream Plot (Time-Colored):**\n * `fig,ax=v0.plot_stream(...)`: Generates another stream plot on the t-SNE embedding, but this time cells are colored by their pseudotime values (`color_scheme='time'`).\n * `fig.savefig('figures/via_fig5.png',dpi=300,bbox_inches = 'tight')`: Saves the time-colored stream plot as a PNG file.\n\n17. 
**Lineage Probability Plot:**\n * `fig,axs=v0.plot_lineage_probability(...)`: Generates a plot showing the probabilities of cells belonging to different lineages.\n * `fig.savefig('figures/via_fig6.png',dpi=300,bbox_inches = 'tight')`: Saves the lineage probability plot as a PNG file.\n\n18. **Lineage Probability Plot (Marked Lineages):**\n * `fig,axs=v0.plot_lineage_probability(...)`: Generates another lineage probability plot, but this time highlighting the probability of the second and third lineages specified by `marker_lineages = [2,3]`.\n * `fig.savefig('figures/via_fig7.png',dpi=300,bbox_inches = 'tight')`: Saves the lineage probability plot with marked lineages as a PNG file.\n\n19. **Gene Trend Plots:**\n * `fig,axs=v0.plot_gene_trend(...)`: Generates a set of plots showing how expression of the genes in `gene_list_magic` varies along the inferred pseudotime trajectory.\n * `fig.savefig('figures/via_fig8.png',dpi=300,bbox_inches = 'tight')`: Saves the gene trend plot as a PNG file.\n\n20. **Gene Trend Heatmap:**\n * `fig,ax=v0.plot_gene_trend_heatmap(...)`: Generates a heatmap showing how the expression of genes in `gene_list_magic` varies along the pseudotime and highlights lineage 2.\n * `fig.savefig('figures/via_fig9.png',dpi=300,bbox_inches = 'tight')`: Saves the gene trend heatmap as a PNG file.\n\n**In summary,** this script demonstrates a common workflow for analyzing scRNA-seq data: loading the data, preprocessing it, performing trajectory inference, and visualizing the results through various informative plots. 
The use of `omicverse` and `scanpy` simplifies these steps and offers a wide range of customizable options for single-cell analysis.", + "file": "t_via_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_via_velo_annotated.json b/OvStudent/Converted_Jsons/t_via_velo_annotated.json new file mode 100644 index 00000000..73445c64 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_via_velo_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a single-cell RNA sequencing analysis using several popular libraries: `omicverse`, `scanpy`, `scvelo`, and `cellrank`. It aims to infer cell trajectories and lineage relationships within a pancreas dataset, incorporating both static gene expression and RNA velocity information. Let's break down the functionality and structure line by line:\n\n**Core Functionality:**\n\nThe script's overall goal is to:\n\n1. **Load and Preprocess Data:** Loads a pancreas dataset and prepares it for further analysis using scvelo and scanpy.\n2. **Compute RNA Velocities:** Calculates RNA velocity using a stochastic model.\n3. **Infer Trajectories with pyVIA:** Utilizes the `omicverse`'s `pyVIA` algorithm, which leverages both gene expression and velocity data to infer cell trajectories and lineage relationships.\n4. 
**Visualize Results:** Creates various visualizations, including:\n * Pie chart graph representing cell relationships and lineage proportions.\n * Trajectory GAMs showing gene expression changes along trajectories.\n * Stream plot visualizing velocity fields.\n * Lineage probabilities showing how likely each cell is to belong to a given lineage.\n\n**Line-by-Line Breakdown:**\n\n* **Lines 1-4: Library Imports:**\n * `import omicverse as ov`: Imports the `omicverse` library, which contains the `pyVIA` algorithm.\n * `import scanpy as sc`: Imports `scanpy`, a widely used library for single-cell analysis, for PCA and other preprocessing steps.\n * `import scvelo as scv`: Imports `scvelo`, a library specifically designed for RNA velocity analysis.\n * `import cellrank as cr`: Imports `cellrank`, which provides tools for lineage inference and includes example datasets, in this case the pancreas dataset.\n* **Line 5: Plotting Parameters:**\n * `ov.utils.ov_plot_set()`: Sets default plotting parameters for `omicverse` visualizations, ensuring a consistent visual style.\n* **Line 7: Dataset Loading:**\n * `adata = cr.datasets.pancreas()`: Loads a pre-processed pancreas single-cell dataset from the `cellrank` library and stores it in an AnnData object called `adata`. AnnData objects are the standard data containers in Scanpy, ScVelo, and CellRank.\n* **Line 8: Display AnnData:**\n * `adata`: Displays the `adata` object, which includes information like the gene expression matrix, cell annotations, and various analysis results. This is helpful for inspecting the initial data.\n* **Line 10: PCA Components:**\n * `n_pcs = 30`: Sets the number of principal components to 30 for dimensionality reduction.\n* **Line 11: Preprocessing for ScVelo:**\n * `scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=5000)`: Filters out genes with fewer than 20 shared (spliced and unspliced) counts, normalizes the gene expression data, and keeps the top 5000 highly variable genes.
This is a necessary preprocessing step before computing velocity.\n* **Line 12: PCA:**\n * `sc.tl.pca(adata, n_comps = n_pcs)`: Performs Principal Component Analysis (PCA) to reduce the dimensionality of the gene expression data to the specified number of components.\n* **Line 13: Moments Computation:**\n * `scv.pp.moments(adata, n_pcs=None, n_neighbors=None)`: Computes moments (first and second) for velocity analysis. This step calculates the necessary statistics of the spliced and unspliced RNA counts for each cell for velocity calculation.\n* **Line 14: Velocity Calculation:**\n * `scv.tl.velocity(adata, mode='stochastic')`: Calculates RNA velocity using a stochastic model. This infers the direction of cell transitions by modeling the ratio between unspliced and spliced mRNAs.\n* **Lines 17-22: pyVIA Initialization:**\n * `v0 = ov.single.pyVIA(...)`: Creates an instance of the `pyVIA` algorithm, passing several parameters:\n * `adata=adata`: The AnnData object containing the data.\n * `adata_key='X_pca'`: Specifies the key within `adata.obsm` to use as the base for trajectory inference (here the PCA reduced data).\n * `adata_ncomps=n_pcs`: Number of PCA components.\n * `basis='X_umap'`: Specifies where to store a UMAP representation of the data which is done by pyVIA.\n * `clusters='clusters'`: Specifies the key in adata.obs that contains cluster assignments.\n * `knn=20`: The number of nearest neighbors to use for the kNN graph construction.\n * Other parameters related to initialization and output settings of the pyVIA algorithm.\n * `velocity_matrix=adata.layers['velocity']`: Uses the calculated RNA velocity data from `adata.layers` for trajectory inference.\n * `gene_matrix=adata.X.todense()`: Uses the gene expression matrix in dense format for trajectory inference.\n * `velo_weight=0.5`: Sets the weight for RNA velocity in the algorithm.\n * `edgebundle_pruning_twice=False, edgebundle_pruning=0.15`: Parameters for pruning the edge bundle graph.\n * 
`pca_loadings = adata.varm['PCs']`: The PCA loadings.\n\n* **Line 24: Running pyVIA:**\n * `v0.run()`: Executes the `pyVIA` algorithm with the specified parameters to infer cell trajectories.\n* **Lines 27-29: Pie Chart Visualization:**\n * `fig, ax, ax1 = v0.plot_piechart_graph(...)`: Generates a pie chart graph to visualize cell relationships and lineage proportions. The parameters control the visual appearance of the plot.\n * `fig.set_size_inches(8,4)`: Sets the figure size.\n* **Line 31: Trajectory GAMS Plot:**\n * `v0.plot_trajectory_gams(...)`: Creates plots showing the expression changes of genes along inferred trajectories (using GAMs, generalized additive models)\n* **Lines 33-34: Stream Plot:**\n * `v0.plot_stream(...)`: Generates a stream plot to visualize the RNA velocity field. The parameters control the visual appearance of the stream plot, including density, size, and linewidth.\n* **Line 36: Lineage Probability Plot:**\n * `v0.plot_lineage_probability()`: Visualizes the lineage probability for each cell.\n\n**Structure:**\n\nThe script follows a clear sequential structure:\n\n1. **Setup:** Import libraries and set up plotting parameters.\n2. **Data Loading and Preprocessing:** Load and prepare the dataset for further analysis.\n3. **Velocity Calculation:** Calculate RNA velocities using `scvelo`.\n4. **Trajectory Inference:** Run the `pyVIA` algorithm for cell trajectory analysis.\n5. **Visualization:** Generate various plots to interpret the results.\n\n**In Summary:**\n\nThis script combines several single-cell analysis tools to perform a comprehensive trajectory inference analysis. It uses `scanpy` for preprocessing and PCA, `scvelo` for velocity calculation, and `omicverse`'s `pyVIA` for trajectory analysis, and then generates informative plots. 
It demonstrates how to integrate different data modalities (gene expression and velocity) to reconstruct cell developmental trajectories.", + "file": "t_via_velo_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_visualize_bulk_annotated.json b/OvStudent/Converted_Jsons/t_visualize_bulk_annotated.json new file mode 100644 index 00000000..1ef0b2e5 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_visualize_bulk_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script uses the `omicverse`, `scanpy`, `matplotlib.pyplot`, and `seaborn` libraries to perform data visualization and analysis tasks. Here's a breakdown of its functionality and structure:\n\n**1. Imports and Setup:**\n\n* **Lines 1-3:** Imports necessary libraries:\n * `omicverse` as `ov`: A library likely designed for omics data analysis and visualization.\n * `scanpy` as `sc`: A library often used for single-cell RNA sequencing data analysis.\n * `matplotlib.pyplot` as `plt`: The core plotting library in Python.\n* **Line 4:** `ov.plot_set()`: Calls a function from `omicverse`, likely setting default plotting styles and parameters for the session.\n* **Line 5:** `fig,ax=plt.subplots(figsize = (4,4))`: Creates a figure (`fig`) and an axes object (`ax`) using `matplotlib.pyplot`. Sets the figure size to 4x4 inches.\n\n**2. Generating Venn Diagrams:**\n\n* **Lines 7-11:** Defines a dictionary named `sets`. 
This dictionary stores sets of numerical values associated with different keys, which represent labels for Venn diagram sets.\n* **Lines 13-14:** `ov.pl.venn(sets=sets,palette=ov.pl.sc_color,fontsize=5.5,ax=ax)`:\n * Calls a `venn` function from `omicverse.pl` to create a Venn diagram.\n * The `sets` dictionary provides the data.\n * `palette=ov.pl.sc_color` sets the colors for each set (using the `sc_color` palette exposed by `omicverse.pl`).\n * `fontsize=5.5` specifies the font size for labels.\n * `ax=ax` makes the venn diagram use the axes object created in line 5.\n* **Lines 18-21:** `plt.annotate(...)`: Adds an annotation to the Venn diagram plot, likely to label a specific region or intersection. It specifies text, position, text alignment, appearance, and arrow properties.\n* **Line 23:** `plt.title('Venn4',fontsize=13)`: Adds a title to the plot with a specified font size.\n* **Line 25:** `fig.savefig(\"figures/bulk_venn4.png\",dpi=300,bbox_inches = 'tight')`: Saves the first Venn diagram to a PNG file. Sets the DPI (dots per inch) for high resolution and `bbox_inches` setting to remove excessive whitespace.\n* **Lines 27-32, 35-36:** Creates another Venn diagram using three sets of data and a custom color palette.\n\n**3. Data Loading and Volcano Plot:**\n\n* **Line 40:** `result=ov.read('data/dds_result.csv',index_col=0)`: Reads a CSV file named 'dds\\_result.csv' into a pandas DataFrame using `omicverse.read`.
The first column is used as the index.\n* **Line 41:** `result.head()`: Displays the first few rows of the loaded DataFrame.\n* **Lines 43-51:** `ov.pl.volcano(...)`:\n * Calls a function from `omicverse.pl` to generate a volcano plot.\n * `result` is the input DataFrame.\n * `pval_name`, `fc_name`: Specifies column names in the DataFrame that contain p-values and fold-change values.\n * `pval_threshold`, `fc_max`, `fc_min`: Sets thresholds to determine significant genes.\n * `pval_max`, `FC_max`: Sets axis limits.\n * `figsize`, `title`, `titlefont`: Sets plot figure size, title, and title font properties.\n * `up_color`, `down_color`, `normal_color`: Specifies colors for points based on upregulation, downregulation, or non-significance.\n * `up_fontcolor`, `down_fontcolor`, `normal_fontcolor`: Sets the font color for up/down/normal labels in the legend.\n * `legend_bbox`, `legend_ncol`, `legend_fontsize`: Sets legend properties.\n * `plot_genes`, `plot_genes_num`, `plot_genes_fontsize`: parameters for plotting specific genes or top number of genes along with labels.\n * `ticks_fontsize`: Sets the fontsize of axis labels.\n\n**4. 
Boxplot Generation and Annotation:**\n\n* **Line 53:** `import seaborn as sns`: Imports the seaborn library.\n* **Line 54:** `data = sns.load_dataset(\"tips\")`: Loads the \"tips\" dataset from `seaborn` for a boxplot example.\n* **Line 55:** `data.head()`: Displays the first few rows of the \"tips\" dataset.\n* **Lines 57-59:** `fig,ax=ov.pl.boxplot(...)`:\n * Calls a boxplot function from `omicverse.pl` to create a boxplot.\n * `data`, `hue`, `x_value`, `y_value`: Specifies the data and mapping of categories for the boxplot.\n * `palette=ov.pl.red_color`: Sets a color palette.\n * `figsize`, `fontsize`, `title`: Sets plot size, font size, and title.\n* **Lines 61-65:** `ov.pl.add_palue(...)`:\n * Adds a p-value annotation to the boxplot.\n * Specifies line coordinates and style for the p-value annotation.\n * Adds a text with the formatted p-value and specifies position, color, and font size.\n\n**In Summary:**\n\nThis script demonstrates several data visualization techniques using the `omicverse` library. It performs these tasks:\n\n1. **Venn Diagram Generation:** Creates Venn diagrams for visualizing set intersections.\n2. **Data Loading:** Reads data from a CSV file into a pandas DataFrame.\n3. **Volcano Plot:** Generates a volcano plot to visualize differential gene expression analysis results.\n4. 
**Boxplot:** Generates a boxplot to visualize data distributions across different categories and adds p-value annotations.\n\nThe script appears to be tailored for analyzing omics data, as suggested by the use of the `omicverse` library and typical data analysis techniques like volcano plots.", + "file": "t_visualize_bulk_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_visualize_colorsystem_annotated.json b/OvStudent/Converted_Jsons/t_visualize_colorsystem_annotated.json new file mode 100644 index 00000000..581440c2 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_visualize_colorsystem_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script utilizes the `omicverse` and `scanpy` libraries for single-cell data analysis and visualization, focusing on the analysis of a dataset related to the Dentate Gyrus. Here's a breakdown of its functionality and structure:\n\n**1. Import Libraries:**\n\n* **`import omicverse as ov`**: Imports the `omicverse` library and assigns it the alias `ov`. This library seems to be a custom package designed for single-cell data handling, visualization, and analysis, providing a high-level API.\n* **`import scanpy as sc`**: Imports the `scanpy` library, a popular package for single-cell analysis, often used as the backend for `omicverse`.\n* **`from IPython.display import HTML`**: Imports the `HTML` function from `IPython.display` which is used to display HTML content, suggesting the use of this script in a Jupyter Notebook or similar environment.\n* **`import matplotlib.pyplot as plt`**: Imports the `matplotlib.pyplot` module, the primary plotting library in Python, used for generating figures.\n\n**2. 
Setup & Data Loading:**\n\n* **`ov.plot_set()`**: Likely initializes or sets up default plotting configurations from `omicverse`.\n* **`adata = ov.read('data/DentateGyrus/10X43_1.h5ad')`**: Reads a single-cell data file in `.h5ad` format (AnnData format commonly used by `scanpy`) using the `ov.read()` function and loads it into an `AnnData` object named `adata`. This data seems to be related to a study of the Dentate Gyrus.\n* **`adata`**: Displays basic information about the `AnnData` object (likely including number of cells, genes and available metadata).\n\n**3. Custom Color Palette:**\n\n* **`fb=ov.pl.ForbiddenCity()`**: Creates an instance of a class named `ForbiddenCity` from the `omicverse.pl` module, likely a custom color palette library or module within `omicverse`. The alias `fb` is used to access its methods for dealing with colors.\n* **`HTML(fb.visual_color(loc_range=(0,384), num_per_row=24))`**: Generates an HTML visualization of the color palette from `ForbiddenCity`, showing its color map, likely as a table with 24 colors per row, with location indices ranging from 0 to 384. This is displayed using `HTML()` to show it in the notebook.\n* **`fb.get_color(name='\u51dd\u591c\u7d2b')`**: Retrieves the RGB color code for the color named '\u51dd\u591c\u7d2b' from the `ForbiddenCity` color palette. This suggests this class contains a list of colors with associated names, potentially using traditional Chinese colors.\n\n**4. Embedding Plotting and Comparison**\n\n* **`fig, axes = plt.subplots(1,3,figsize=(9,3))`**: Creates a figure with 3 subplots in a row, which we will use to display UMAP plots with different color schemes. The `figsize` controls the overall size of the figure.\n* **`ov.pl.embedding(...)`**: The core plotting function, called multiple times, to display UMAP embeddings using `omicverse`, likely based on scanpy's plotting. 
Each call creates a different plot based on different color palettes.\n * **`adata`**: the AnnData object is being passed to the function as the input\n * **`basis='X_umap'`**: specifies the UMAP coordinates are used for plotting.\n * **`frameon='small'`**: indicates that the bounding box/frame of the plot is small.\n * **`color=[\"clusters\"]`**: specify the metadata 'clusters' is used to color the cells.\n * **`palette=fb.red[:], palette=fb.pink1[:] and palette=fb.red1[:4]+fb.blue1`**: specify different color schemes taken from the `ForbiddenCity` module.\n * **`ncols=3, show=False, legend_loc=None`**: additional plot configurations such as number of columns of the color legend, to suppress the default show function and to disable color legend.\n * **`ax=axes[0]`**: Indicates that the plots generated in each call should be placed in the subplots defined earlier.\n\n The code plots three UMAP embeddings of the dataset, each colored based on the cluster labels but using different color palettes from `fb`: `red`, `pink1`, and a combination of `red1` and `blue1`. This is likely done to compare the visual appearance of the clusters using different palettes.\n\n* **`color_dict`**: Defines a dictionary that maps cell type (cluster) labels to specific hexadecimal color codes. These color codes represent the intended visual color of each cell type.\n* **`ov.pl.embedding(..., palette=color_dict)`**: Generates another UMAP plot of the data, coloring by the 'clusters' labels using the custom `color_dict`. This ensures each cell type has a designated color and facilitates visual identification of different clusters.\n\n**5. Custom Colormap Generation**\n\n* The code defines and uses custom color maps based on specific colors from the `ForbiddenCity` palette.\n* **`colors`**: Lists of colors (obtained from `fb.get_color_rgb()`) are defined and passed to `fb.get_cmap_seg()`. 
The function `get_cmap_seg` is used to generate a continuous colormap from a list of colors.\n* The script defines 3 different color schemes that consist of 2 or 3 color values.\n* **`ov.pl.embedding(..., cmap=fb.get_cmap_seg(colors))`**: Finally, a UMAP embedding plot is created that is colored by the gene expression of \"Sox7\", using the last colormap defined. The `cmap` argument takes the colormap generated earlier and applies it to the gene expression.\n\n**Overall Functionality:**\n\nThis script performs the following main actions:\n\n1. Loads single-cell RNA-seq data from an `.h5ad` file.\n2. Visualizes the data using UMAP dimensionality reduction plots.\n3. Leverages a custom color palette (`ForbiddenCity`) for coloring plots.\n4. Compares multiple UMAP embeddings using different color palettes to visually distinguish clusters.\n5. Creates a custom dictionary-based color palette for clusters.\n6. Generates custom colormaps based on selected colors for use with continuous gene expression data.\n7. Plots UMAPs colored based on the expression of the `Sox7` gene using a custom colormap.\n\n**Structure:**\n\nThe script is structured in a sequential manner:\n\n1. **Import:** necessary libraries.\n2. **Setup and loading:** loads the dataset and prepares the environment.\n3. **Custom Color Setup:** Instantiates the color object and demonstrates the custom color palette.\n4. **UMAP visualizations and analysis:** contains multiple `ov.pl.embedding` to generate a series of UMAP plots using different color palettes.\n5. **Custom Colormap Generation:** Defines colormaps by providing lists of `fb`-provided RGB colors.\n6. **Gene Expression Visualization:** Finally, a UMAP plot is colored by gene expression of Sox7, using the final colormap definition.\n\nThis structure is common in single-cell data analysis workflows, making it easy to follow the steps. 
The script also makes heavy use of `omicverse` to handle data loading, plotting, and color management, showing how `omicverse` handles high-level tasks.\n\nIn essence, this script is designed to perform a visual exploration of a single-cell dataset, using a custom color palette and tools for comparison, focusing on both discrete cell clusters and continuous gene expression patterns.", + "file": "t_visualize_colorsystem_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_visualize_single_annotated.json b/OvStudent/Converted_Jsons/t_visualize_single_annotated.json new file mode 100644 index 00000000..b79b4ef8 --- /dev/null +++ b/OvStudent/Converted_Jsons/t_visualize_single_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script is designed to perform single-cell RNA sequencing (scRNA-seq) data analysis and visualization using the `omicverse` and `scanpy` libraries, along with `matplotlib` and other helper libraries like `pandas` and `seaborn`. It focuses on exploring gene expression patterns and cell type distributions within a dataset.\n\nHere's a breakdown of the functionality and structure:\n\n**1. Setup and Data Loading:**\n\n* **Lines 1-2:** Imports necessary libraries: `omicverse` (as `ov`) and `scanpy` (as `sc`). These libraries provide the foundation for single-cell data handling and analysis.\n* **Line 4:** Sets the plotting style for `omicverse`.\n* **Line 6:** Reads an AnnData object (a standard data structure for single-cell data) from an `h5ad` file named \"10X43_1.h5ad\" located in the \"data/DentateGyrus\" directory and assigns it to the variable `adata`. This loads the expression data, along with metadata, from the file.\n* **Line 7:** Displays the loaded `AnnData` object, giving the user an overview of its structure and contents.\n\n**2. 
UMAP Embedding and Basic Plotting:**\n\n* **Line 9:** Generates an optimized color palette (`optim_palette`) using the `omicverse` library, based on the UMAP embedding (`X_umap`) and the 'clusters' annotation in the `adata` object. This ensures visually distinct colors for different cell types.\n* **Lines 11-20:**\n * Imports `matplotlib.pyplot` as `plt` for plot creation.\n * Creates a figure and axes object for a plot with a specified size.\n * Generates a UMAP embedding plot using `ov.pl.embedding`, colored by cell clusters ('clusters'), with an optimized palette, small frame, and a specified title.\n* **Lines 22-26:** Generates another UMAP embedding plot, this time colored by the 'age(days)' annotation.\n\n**3. Cell Proportion and Composition Visualization:**\n\n* **Lines 28-31:**\n * Imports `matplotlib.pyplot` again.\n * Creates another figure and axes object for a plot with a different size.\n * Generates a cell proportion plot using `ov.pl.cellproportion`. This plot shows the proportion of cells within each cluster grouped by 'age(days)'.\n* **Lines 33-36:** Generates another cell proportion plot, this time using 'age(days)' as cell types and grouping by 'clusters', filtering to 'nIPC', 'Granule immature', and 'Granule mature' clusters.\n* **Lines 38-41:** Generates a cell stacked area plot (`ov.pl.cellstackarea`), showing the relative abundance of cell types grouped by clusters (filtered to nIPC, Granule immature, and Granule mature).\n\n**4. Cell Type-Specific Embedding Visualization:**\n\n* **Lines 43-47:** Creates a cell type-specific embedding plot using `ov.pl.embedding_celltype`. This plot highlights individual cell types on the UMAP embedding with a separate visualization of the embedding and their distribution, with range customization.\n\n**5. 
Convex Hull and Contour Visualization:**\n\n* **Lines 49-57:**\n * Imports `matplotlib.pyplot` again\n * Creates a figure and axes for a plot.\n * Generates a basic UMAP embedding plot colored by 'clusters'.\n* **Lines 59-63:** Adds a convex hull outline to the UMAP plot using `ov.pl.ConvexHull` specifically for the \"Granule mature\" cluster.\n* **Lines 66-74:**\n * Imports `matplotlib.pyplot` again.\n * Creates a figure and axes.\n * Generates a basic UMAP embedding colored by 'clusters'.\n* **Lines 76-78:** Adds contour lines using `ov.pl.contour` to the UMAP plot for the \"Granule immature\" and \"Granule mature\" clusters.\n\n**6. Embedding Adjustment and Density Plot:**\n\n* **Lines 81-90:**\n * Imports `patheffects` from matplotlib.\n * Imports `matplotlib.pyplot`.\n * Creates a figure and axes.\n * Generates a UMAP plot coloured by `clusters`, without legend, outline, and frame.\n* **Lines 92-100:** Adjusts the UMAP embedding using `ov.pl.embedding_adjust` with arrows indicating cell developmental trajectories. Adjustments are based on 'clusters' annotation, excluding 'OL', and with stylized arrows and text.\n* **Lines 102-107:** Creates a density plot of the \"Granule mature\" cluster on the UMAP embedding using `ov.pl.embedding_density`.\n\n**7. 
Gene Set Analysis and Visualization:**\n\n* **Lines 109-111:** Computes AUC scores using `ov.single.geneset_aucell` for a defined gene set 'Sox' containing multiple Sox genes.\n* **Lines 113-117:** Generates a UMAP embedding plot colored by the expression of the gene 'Sox4'.\n* **Line 119:** Generates a violin plot of 'Sox4' expression across cell clusters using `ov.pl.violin`.\n* **Lines 121-126:**\n * Creates a figure and axes for a plot.\n * Generates a bar dot plot using `ov.pl.bardotplot`, with clusters on the x-axis, 'Sox_aucell' scores as the color, y-axis labelled \"Expression\" and styled bar/dot parameters.\n* **Lines 128-132:** Adds a p-value annotation to the plot at specified locations.\n* **Lines 134-139:**\n * Creates a figure and axes for a plot.\n * Generates a bar dot plot using `ov.pl.bardotplot`, with clusters on the x-axis, 'Sox17' scores as the color, with y-axis labelled as \"Expression\", x-axis labelled as \"Cell Type\", and styled bar/dot parameters.\n* **Lines 141-145:** Adds a p-value annotation to the plot at specified locations.\n\n**8. Boxplot Visualization of Gene Expression:**\n\n* **Lines 147-148:** Imports `pandas` and `seaborn`.\n* **Lines 150-164:** Generates a boxplot using `ov.pl.single_group_boxplot` for 'Sox_aucell' scores grouped by clusters. It includes a Kruskal-Wallis test, with further customization for color, axis labels, title and styling of dots.\n* **Lines 165-166:** Disables the grid and rotates the x axis ticks on the boxplot.\n\n**9. 
Heatmap Visualization of Marker Genes:**\n\n* **Lines 168-180:**\n * Imports `pandas`.\n * Defines a dictionary `marker_genes_dict` with marker genes for the 'Sox' cell type.\n * Defines a color dictionary `color_dict`.\n * Initializes empty gene-specific color dictionaries, `gene_color_dict` and `gene_color_dict_black`.\n * Assigns colors from `color_dict` and black as color to each gene in `marker_genes_dict`.\n* **Lines 182-215:** Generates a complex heatmap using `ov.pl.complexheatmap`, showing expression levels of selected marker genes grouped by clusters, with specified colors for left, right, column annotations. Also sets other various parameters for column split, row cluster, label and save preferences.\n\n**10. Preprocessing and Marker Heatmap:**\n\n* **Line 217:** Preprocesses the `adata` using `ov.pp.preprocess`, performing log transformation, scaling, and selecting a specified number of highly variable genes, preparing it for further analysis.\n* **Lines 219-232:** Defines a dictionary of marker genes for several cell types.\n* **Lines 234-243:** Generates a marker gene heatmap using `ov.pl.marker_heatmap` showing the expression of marker genes across different clusters.\n\n**In summary, this script loads scRNA-seq data, performs exploratory data analysis, generates various visualizations including UMAP embeddings, cell proportion plots, violin plots, boxplots, density plots, convex hulls, contour plots, and heatmaps, and performs basic gene set and marker gene analysis using omicverse and other standard data analysis tools in Python.** It provides a comprehensive look at cell type distributions, gene expression patterns and regulatory mechanisms within the given single cell data.", + "file": "t_visualize_single_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Jsons/t_wgcna_annotated.json b/OvStudent/Converted_Jsons/t_wgcna_annotated.json new file mode 100644 index 00000000..4b4ae3d6 --- /dev/null +++ 
b/OvStudent/Converted_Jsons/t_wgcna_annotated.json @@ -0,0 +1,4 @@ +{ + "description": "This Python script performs a Weighted Gene Co-expression Network Analysis (WGCNA) on gene expression data using the `omicverse` package, after some initial preprocessing with `pandas` and `statsmodels`. Let's break down the functionality and structure line by line:\n\n**1. Setup and Data Loading (Lines 1-9)**\n\n - **Imports:** The script begins by importing necessary libraries:\n - `scanpy`: For single-cell data analysis (though not directly used in this specific script, it is likely intended for use with `omicverse` in other contexts).\n - `omicverse`: The core library for omics data analysis, providing the `pyWGCNA` object and other functionalities.\n - `matplotlib.pyplot`: For general plotting.\n - `pandas`: For data manipulation using DataFrames.\n - `statsmodels.robust`: For robust statistical methods, specifically Median Absolute Deviation (MAD).\n - **Plotting Style:** `ov.plot_set()` sets a specific plotting style provided by `omicverse`.\n - **Data Loading:**\n - `ov.utils.read('data/5xFAD_paper/expressionList.csv', index_col=0)`: Reads a CSV file containing gene expression data into a pandas DataFrame. The first column is set as the index (likely containing gene names).\n - `data.head()`: Displays the first few rows of the loaded DataFrame for a quick preview.\n\n**2. Gene Preprocessing and Filtering (Lines 11-15)**\n\n - **MAD Calculation:** `gene_mad=data.apply(robust.mad)`: Calculates the Median Absolute Deviation (MAD) for each gene across all samples. This is a robust measure of variability less sensitive to outliers than standard deviation.\n - **DataFrame Transposition:** `data=data.T`: Transposes the DataFrame so that genes become columns and samples are rows. This is a required input format for WGCNA.\n - **Gene Filtering:**\n - `data=data.loc[gene_mad.sort_values(ascending=False).index[:2000]]`: Selects the top 2000 genes with the highest MAD values. 
This filters the data to focus on the most variable genes, which are likely to be biologically informative.\n - **Preview of Processed Data:** `data.head()`: Displays the first few rows of the filtered DataFrame.\n\n**3. WGCNA Initialization and Processing (Lines 17-38)**\n\n - **`pyWGCNA` Object Initialization:**\n - `pyWGCNA_5xFAD = ov.bulk.pyWGCNA(...)`: Creates an instance of the `pyWGCNA` class, providing key information such as:\n - `name`: \"5xFAD_2k\" (a name for the analysis).\n - `species`: \"mus musculus\" (mouse).\n - `geneExp`: The transposed, filtered expression data.\n - `outputPath`: The output path is set to ''. This means results will be saved in current working directory, with no separate folder being created.\n - `save`: True indicating that intermediate files should be saved.\n - **Preview of Expression Data:** `pyWGCNA_5xFAD.geneExpr.to_df().head(5)`: Displays the first five rows of the gene expression data within the `pyWGCNA` object.\n - **WGCNA Preprocessing:** `pyWGCNA_5xFAD.preprocess()`: Performs essential preprocessing steps internally within the WGCNA pipeline.\n - **Soft Threshold Calculation:** `pyWGCNA_5xFAD.calculate_soft_threshold()`: Determines an appropriate soft-thresholding power for the adjacency matrix.\n - **Adjacency Matrix Calculation:** `pyWGCNA_5xFAD.calculating_adjacency_matrix()`: Creates the adjacency matrix based on the expression data and the calculated soft threshold.\n - **TOM Similarity Matrix Calculation:** `pyWGCNA_5xFAD.calculating_TOM_similarity_matrix()`: Calculates the Topological Overlap Matrix (TOM), a more robust measure of network connectivity.\n - **Gene Tree Calculation:** `pyWGCNA_5xFAD.calculate_geneTree()`: Creates a hierarchical clustering dendrogram based on the TOM similarity.\n - **Dynamic Module Detection:**\n - `pyWGCNA_5xFAD.calculate_dynamicMods(...)`: Detects modules (groups of co-expressed genes) using the cutreeHybrid method with specific settings.\n - `deepSplit`: Controls how 
sensitive the clustering method is to splitting groups.\n - `pamRespectsDendro`: Determines if the Partitioning Around Medoids (PAM) algorithm should respect the dendrogram.\n - **Module Eigengene Calculation:**\n - `pyWGCNA_5xFAD.calculate_gene_module(...)`: Calculates module eigengenes, which represent the overall expression profile of each module. The 'softPower' parameter is passed as 8.\n - **Network Matrix Plot:** `pyWGCNA_5xFAD.plot_matrix(save=False)`: Generates a visualization of the network connections but does not save the plot to file.\n - **WGCNA Results Saving:** `pyWGCNA_5xFAD.saveWGCNA()`: Saves all the WGCNA results to files.\n\n**4. Loading and Examining Results (Lines 40-54)**\n\n - **Loading Saved WGCNA Object:** `pyWGCNA_5xFAD=ov.bulk.readWGCNA('5xFAD_2k.p')`: Loads a previously saved `pyWGCNA` object, allowing for continued analysis without re-running the computationally expensive steps.\n - **Module Information:** `pyWGCNA_5xFAD.mol.head()`: Displays the module assignment information for each gene.\n - **Variable Information:** `pyWGCNA_5xFAD.datExpr.var.head()`: Displays variable information (genes in this case) stored in the expression data object.\n - **Sub-Module Selection:**\n - `sub_mol=pyWGCNA_5xFAD.get_sub_module(...)`: Extracts genes belonging to specific modules (in this case, 'gold' and 'lightgreen').\n - `sub_mol.head(), sub_mol.shape`: Displays the first few rows of the sub-module information and its shape.\n - **Sub-Network Selection:**\n - `G_sub=pyWGCNA_5xFAD.get_sub_network(...)`: Extracts a sub-network of interactions within the 'lightgreen' module based on a correlation threshold of 0.2.\n - `G_sub`: Prints out the structure of the graph sub-network.\n - **Edge Counting:** `len(G_sub.edges())`: Determines the number of edges within the sub-network.\n\n**5. 
Network Visualization and Metadata Integration (Lines 56-71)**\n\n - **Sub-Network Plot:**\n - `pyWGCNA_5xFAD.plot_sub_network(...)`: Generates a visual representation of the sub-network with specified parameters:\n - `pos_type='kamada_kawai'`: Layout algorithm for network visualization.\n - `pos_scale`: Scaling factor for the layout.\n - `pos_dim`: Dimensionality of the layout (2D here).\n - `figsize`: Size of the plot.\n - `node_size`, `label_fontsize`: Control node size and font size for labels.\n - `label_bbox`: Adjusts the bounding box around labels.\n - `correlation_threshold`=0.2: Only display edges with correlation above 0.2.\n - **Metadata Integration:**\n - `pyWGCNA_5xFAD.updateSampleInfo(...)`: Loads sample metadata from the specified CSV file, updating the `pyWGCNA` object.\n - `pyWGCNA_5xFAD.setMetadataColor(...)`: Sets specific colors for categorical metadata columns (Sex, Genotype, Age, and Tissue). This color mapping is used for visualization later.\n\n**6. WGCNA Analysis and Output (Lines 73-81)**\n\n - **Full WGCNA Analysis:** `pyWGCNA_5xFAD.analyseWGCNA()`: Performs a suite of WGCNA analysis tasks internally within the `omicverse` library, such as differential expression analysis related to the sample metadata.\n - **Metadata Extraction:** `metadata = pyWGCNA_5xFAD.datExpr.obs.columns.tolist()`: Retrieves a list of the metadata columns that can be used for plotting.\n - **Module Eigengene Plot:** `pyWGCNA_5xFAD.plotModuleEigenGene('lightgreen', metadata, show=True)`: Visualizes the expression of the 'lightgreen' module eigengene across samples, colored by metadata information, and displays the plot.\n - **Module Eigengene Barplot:** `pyWGCNA_5xFAD.barplotModuleEigenGene('lightgreen', metadata, show=True)`: Presents a barplot representation of the module eigengene expression across samples with metadata colored bars, and displays the plot.\n - **Hub Gene Identification:** `pyWGCNA_5xFAD.top_n_hub_genes(moduleName=\"lightgreen\", n=10)`: Identifies 
and displays the top 10 hub genes (genes with highest connectivity) within the 'lightgreen' module.\n\n**Summary:**\n\nThis script performs a complete WGCNA analysis workflow using the `omicverse` package. It loads gene expression data, preprocesses it (filtering by MAD), constructs a gene co-expression network, identifies modules of co-expressed genes, visualizes subnetworks, integrates metadata, and provides tools for exploring module eigengene expression, and hub genes. This workflow is common in systems biology to understand relationships between genes and phenotypes by identifying groups of genes that behave similarly across samples.", + "file": "t_wgcna_annotated.py" +} \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_anno_trans_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_anno_trans_annotated.py new file mode 100644 index 00000000..120df7a1 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_anno_trans_annotated.py @@ -0,0 +1,25 @@ +``` +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt +# Line 3: Import the scanpy library as sc. -- import scanpy as sc +# Line 4: Set plot styling using ov_plot_set() from the omicverse library. -- ov.ov_plot_set() +# Line 6: Read RNA data from an h5ad file into an AnnData object named rna. -- rna=sc.read("data/analysis_lymph/rna-emb.h5ad") +# Line 7: Read ATAC data from an h5ad file into an AnnData object named atac. -- atac=sc.read("data/analysis_lymph/atac-emb.h5ad") +# Line 9: Import the scanpy library as sc again (redundant). -- import scanpy as sc +# Line 10: Concatenate the rna and atac AnnData objects into a new AnnData object called combined, merging overlapping data. -- combined=sc.concat([rna,atac],merge='same') +# Line 11: Output the combined AnnData object. 
-- combined +# Line 13: Calculate a minimum-distortion embedding (MDE) and store it in 'X_mde' within the combined AnnData object. -- combined.obsm['X_mde']=ov.utils.mde(combined.obsm['X_glue']) +# Line 15: Generate and display an embedding plot using 'X_mde' as basis, color by 'domain', titled 'Layers', using a red palette. -- ov.utils.embedding(combined, +# Line 23: Generate and display an embedding plot using 'X_mde' as basis, color by 'major_celltype', titled 'Cell type', without a defined palette. -- ov.utils.embedding(rna, +# Line 31: Create a weighted k-nearest neighbor trainer object using RNA data and 'X_glue' as the embedding. -- knn_transformer=ov.utils.weighted_knn_trainer( +# Line 37: Transfer cell type labels from the RNA data to the ATAC data using weighted KNN, and store associated uncertainty values. -- labels,uncert=ov.utils.weighted_knn_transfer( +# Line 44: Assign transferred cell type labels to a new 'transf_celltype' column in the ATAC data's observation dataframe. -- atac.obs["transf_celltype"]=labels.loc[atac.obs.index,"major_celltype"] +# Line 45: Assign transferred cell type label uncertainties to a new 'transf_celltype_unc' column in the ATAC data's observation dataframe. -- atac.obs["transf_celltype_unc"]=uncert.loc[atac.obs.index,"major_celltype"] +# Line 47: Copy the transferred cell type labels to the 'major_celltype' column in the ATAC data's observation dataframe. -- atac.obs["major_celltype"]=atac.obs["transf_celltype"].copy() +# Line 49: Generate and display a UMAP embedding plot of ATAC data, colored by uncertainty and transferred cell type, without a title. -- ov.utils.embedding(atac, +# Line 57: Import the scanpy library as sc again (redundant). -- import scanpy as sc +# Line 58: Concatenate the rna and atac AnnData objects into a new AnnData object called combined1, merging overlapping data. -- combined1=sc.concat([rna,atac],merge='same') +# Line 59: Output the combined1 AnnData object. 
-- combined1 +# Line 61: Calculate a minimum-distortion embedding (MDE) and store it in 'X_mde' within the combined1 AnnData object. -- combined1.obsm['X_mde']=ov.utils.mde(combined1.obsm['X_glue']) +# Line 63: Generate and display an embedding plot using 'X_mde' as basis, colored by 'domain' and 'major_celltype', titled 'Layers' and 'Cell type'. -- ov.utils.embedding(combined1, +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py new file mode 100644 index 00000000..ddd4a44f --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py @@ -0,0 +1,42 @@ +```python +# Line 1: Imports the omicverse library as ov. -- import omicverse as ov +# Line 2: Imports the scanpy library as sc. -- import scanpy as sc +# Line 3: Imports the scvelo library as scv. -- import scvelo as scv +# Line 5: Sets the plotting style for omicverse. -- ov.utils.ov_plot_set() +# Line 7: Downloads the pathway database for omicverse. -- ov.utils.download_pathway_database() +# Line 8: Downloads the gene ID annotation pair for omicverse. -- ov.utils.download_geneid_annotation_pair() +# Line 10: Loads the pancreas dataset from scvelo. -- adata = scv.datasets.pancreas() +# Line 11: Displays the loaded AnnData object. -- adata +# Line 13: Finds the maximum value in the adata.X matrix. -- adata.X.max() +# Line 15: Normalizes the total counts per cell to 1e4. -- sc.pp.normalize_total(adata, target_sum=1e4) +# Line 16: Applies a log1p transformation to the adata.X matrix. -- sc.pp.log1p(adata) +# Line 18: Finds the maximum value in the adata.X matrix after processing. -- adata.X.max() +# Line 20: Prepares the pathway dictionary from a GO Biological Process file. -- pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2021.txt',organism='Mouse') +# Line 22: Defines a geneset name for the analysis. 
-- geneset_name='response to vitamin (GO:0033273)' +# Line 23: Performs AUCell analysis for a single geneset. -- ov.single.geneset_aucell(adata, +# Line 25: Plots the UMAP embedding with AUCell scores for the specified geneset. -- sc.pl.embedding(adata, +# Line 28: Defines multiple geneset names for the analysis. -- geneset_names=['response to vitamin (GO:0033273)','response to vitamin D (GO:0033280)'] +# Line 29: Performs AUCell analysis for multiple pathways. -- ov.single.pathway_aucell(adata, +# Line 31: Plots the UMAP embedding with AUCell scores for the specified pathways. -- sc.pl.embedding(adata, +# Line 34: Performs AUCell analysis for a test geneset. -- ov.single.geneset_aucell(adata, +# Line 36: Plots the UMAP embedding with AUCell scores for the test geneset. -- sc.pl.embedding(adata, +# Line 39: Calculates the pathway enrichment using AUCell for all pathways. -- adata_aucs=ov.single.pathway_aucell_enrichment(adata, +# Line 42: Copies the obs from adata to adata_aucs. -- adata_aucs.obs=adata[adata_aucs.obs.index].obs +# Line 43: Copies the obsm from adata to adata_aucs. -- adata_aucs.obsm=adata[adata_aucs.obs.index].obsm +# Line 44: Copies the obsp from adata to adata_aucs. -- adata_aucs.obsp=adata[adata_aucs.obs.index].obsp +# Line 45: Displays the adata_aucs AnnData object. -- adata_aucs +# Line 47: Writes the adata_aucs to an h5ad file with gzip compression. -- adata_aucs.write_h5ad('data/pancreas_auce.h5ad',compression='gzip') +# Line 49: Reads the adata_aucs from an h5ad file. -- adata_aucs=sc.read('data/pancreas_auce.h5ad') +# Line 51: Plots the UMAP embedding with AUCell scores for specified pathways from adata_aucs. -- sc.pl.embedding(adata_aucs, +# Line 53: Performs differential gene expression analysis with t-test. -- sc.tl.rank_genes_groups(adata_aucs, 'clusters', method='t-test',n_genes=100) +# Line 54: Plots dotplot of rank genes based on clusters. 
-- sc.pl.rank_genes_groups_dotplot(adata_aucs,groupby='clusters', +# Line 57: Gets a list of differentially expressed genes for the Beta cluster. -- degs = sc.get.rank_genes_groups_df(adata_aucs, group='Beta', key='rank_genes_groups', log2fc_min=2, +# Line 58: Displays the list of differentially expressed genes. -- degs +# Line 60: Imports the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt +# Line 62: Plots UMAP embedding with clusters and differentially expressed genes. -- axes=sc.pl.embedding(adata_aucs,ncols=3, +# Line 66: Adjusts the plot layout. -- axes.tight_layout() +# Line 68: Sets the base of log1p to None in adata.uns. -- adata.uns['log1p']['base']=None +# Line 69: Performs differential gene expression analysis with t-test for adata. -- sc.tl.rank_genes_groups(adata, 'clusters', method='t-test',n_genes=100) +# Line 71: Performs pathway enrichment analysis on adata. -- res=ov.single.pathway_enrichment(adata,pathways_dict=pathway_dict,organism='Mouse', +# Line 73: Plots the pathway enrichment analysis results. 
-- ax=ov.single.pathway_enrichment_plot(res,plot_title='Enrichment',cmap='Reds', +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py new file mode 100644 index 00000000..059c3954 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py @@ -0,0 +1,52 @@ +```python +# Line 1: Import the scanpy library for single-cell analysis -- import scanpy as sc +# Line 2: Import the omicverse library for omics data analysis -- import omicverse as ov +# Line 3: Import the matplotlib plotting library -- import matplotlib.pyplot as plt +# Line 4: Apply default plotting settings from omicverse -- ov.plot_set() +# Line 6: Read bulk RNA-seq data from a file, setting the first column as the index -- bulk_data=ov.read('data/GSE74985_mergedCount.txt.gz',index_col=0) +# Line 7: Map gene IDs in the bulk data using a provided gene mapping file -- bulk_data=ov.bulk.Matrix_ID_mapping(bulk_data,'genesets/pair_GRCm39.tsv') +# Line 8: Display the first few rows of the processed bulk data -- bulk_data.head() +# Line 10: Import the anndata library for handling annotated data -- import anndata +# Line 11: Import the scvelo library for RNA velocity analysis -- import scvelo as scv +# Line 12: Load single-cell RNA-seq data from the dentategyrus dataset -- single_data=scv.datasets.dentategyrus() +# Line 13: Display the loaded single-cell data -- single_data +# Line 15: Initialize a Bulk2Single model for deconvoluting bulk RNA-seq data using single-cell data -- model=ov.bulk2single.Bulk2Single(bulk_data=bulk_data,single_data=single_data, +# Line 16: Specify the cell type annotation key and bulk groups for the Bulk2Single model -- celltype_key='clusters',bulk_group=['dg_d_1','dg_d_2','dg_d_3'], +# Line 17: Define the number of top markers and ratio for the model and whether to use GPU -- top_marker_num=200,ratio_num=1,gpu=0) +# Line 19: Predict cell 
type fractions in the bulk samples using the trained model -- CellFractionPrediction=model.predicted_fraction() +# Line 21: Display the first few rows of the predicted cell fractions -- CellFractionPrediction.head() +# Line 23: Create a stacked bar plot of the cell fraction predictions -- ax = CellFractionPrediction.plot(kind='bar', stacked=True, figsize=(8, 4)) +# Line 24: Set the x-axis label of the plot -- ax.set_xlabel('Sample') +# Line 25: Set the y-axis label of the plot -- ax.set_ylabel('Cell Fraction') +# Line 26: Set the title of the plot -- ax.set_title('TAPE Cell fraction predicted') +# Line 27: Display the legend outside of the plot area -- plt.legend(bbox_to_anchor=(1.05, 1),ncol=1,) +# Line 28: Show the generated plot -- plt.show() +# Line 30: Preprocess the bulk data in a lazy manner -- model.bulk_preprocess_lazy() +# Line 31: Preprocess the single-cell data in a lazy manner -- model.single_preprocess_lazy() +# Line 32: Prepare the input data for the model -- model.prepare_input() +# Line 34: Train a variational autoencoder (VAE) model using the bulk and single-cell data -- vae_net=model.train( +# Line 35: Set the batch size for training -- batch_size=512, +# Line 36: Set the learning rate for the optimizer -- learning_rate=1e-4, +# Line 37: Set the hidden size of the VAE model -- hidden_size=256, +# Line 38: Set the number of training epochs -- epoch_num=3500, +# Line 39: Specify the directory to save the trained VAE model -- vae_save_dir='data/bulk2single/save_model', +# Line 40: Specify the name for the saved VAE model -- vae_save_name='dg_vae', +# Line 41: Set the directory to save the generated data -- generate_save_dir='data/bulk2single/output', +# Line 42: Set the name for the generated data -- generate_save_name='dg') +# Line 44: Plot the training loss of the VAE model -- model.plot_loss() +# Line 49: Load a pre-trained VAE model from a file -- vae_net=model.load('data/bulk2single/save_model/dg_vae.pth') +# Line 51: Generate single-cell 
expression data from the bulk data using the trained model -- generate_adata=model.generate() +# Line 52: Display the generated single-cell data -- generate_adata +# Line 54: Filter the generated data based on leiden cluster size -- generate_adata=model.filtered(generate_adata,leiden_size=25) +# Line 55: Display the filtered generated data -- generate_adata +# Line 57: Plot cell type proportions for the generated data -- ov.bulk2single.bulk2single_plot_cellprop(generate_adata,celltype_key='clusters') +# Line 58: Turn off the grid on the plot -- plt.grid(False) +# Line 60: Plot cell type proportions for the original single-cell data -- ov.bulk2single.bulk2single_plot_cellprop(single_data,celltype_key='clusters') +# Line 61: Turn off the grid on the plot -- plt.grid(False) +# Line 63: Plot correlation between cell type proportions in original and generated data -- ov.bulk2single.bulk2single_plot_correlation(single_data,generate_adata,celltype_key='clusters') +# Line 64: Turn off the grid on the plot -- plt.grid(False) +# Line 66: Import the scanpy library again (redundant since it was already imported) -- import scanpy as sc +# Line 67: Compute the MDE embedding using the PCA coordinates of the generated data -- generate_adata.obsm["X_mde"] = ov.utils.mde(generate_adata.obsm["X_pca"]) +# Line 68: Generate and display an embedding plot with specified color, palette and settings -- ov.utils.embedding(generate_adata,basis='X_mde',color=['clusters'],wspace=0.4, +# Line 69: Use a Pyomic color palette, and 'small' frame -- palette=ov.utils.pyomic_palette(),frameon='small') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py new file mode 100644 index 00000000..acbc4e87 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py @@ -0,0 +1,57 @@ +```python +# Line 1: Imports the anndata library for working with annotated data 
objects. -- import anndata +# Line 2: Imports the pandas library for data manipulation and analysis. -- import pandas as pd +# Line 3: Imports the omicverse library, likely for omics data analysis. -- import omicverse as ov +# Line 4: Sets plotting parameters for omicverse visualizations. -- ov.ov_plot_set() +# Line 6: Reads a pickled pandas DataFrame from a file path and assigns it to the variable `dataset_1`. -- dataset_1 = pd.read_pickle("data/combat/GSE18520.pickle") +# Line 7: Creates an AnnData object from the transpose of `dataset_1`. -- adata1=anndata.AnnData(dataset_1.T) +# Line 8: Adds a 'batch' column to the `obs` attribute of `adata1` and sets all values to '1'. -- adata1.obs['batch']='1' +# Line 9: Displays the `adata1` AnnData object. -- adata1 +# Line 11: Reads a pickled pandas DataFrame from a file path and assigns it to the variable `dataset_2`. -- dataset_2 = pd.read_pickle("data/combat/GSE66957.pickle") +# Line 12: Creates an AnnData object from the transpose of `dataset_2`. -- adata2=anndata.AnnData(dataset_2.T) +# Line 13: Adds a 'batch' column to the `obs` attribute of `adata2` and sets all values to '2'. -- adata2.obs['batch']='2' +# Line 14: Displays the `adata2` AnnData object. -- adata2 +# Line 16: Reads a pickled pandas DataFrame from a file path and assigns it to the variable `dataset_3`. -- dataset_3 = pd.read_pickle("data/combat/GSE69428.pickle") +# Line 17: Creates an AnnData object from the transpose of `dataset_3`. -- adata3=anndata.AnnData(dataset_3.T) +# Line 18: Adds a 'batch' column to the `obs` attribute of `adata3` and sets all values to '3'. -- adata3.obs['batch']='3' +# Line 19: Displays the `adata3` AnnData object. -- adata3 +# Line 21: Concatenates `adata1`, `adata2`, and `adata3` into a single AnnData object named `adata`; `merge='same'` keeps only the non-concatenated annotations that are identical in all three objects. -- adata=anndata.concat([adata1,adata2,adata3],merge='same') +# Line 22: Displays the `adata` AnnData object.
-- adata +# Line 24: Applies batch correction to the `adata` object using the 'batch' column. -- ov.bulk.batch_correction(adata,batch_key='batch') +# Line 26: Converts the raw data from the `adata` object to a pandas DataFrame and transposes it. -- raw_data=adata.to_df().T +# Line 27: Displays the first few rows of the `raw_data` DataFrame. -- raw_data.head() +# Line 29: Converts the batch-corrected data from the `adata` object to a pandas DataFrame and transposes it. -- removing_data=adata.to_df(layer='batch_correction').T +# Line 30: Displays the first few rows of the `removing_data` DataFrame. -- removing_data.head() +# Line 32: Saves the `raw_data` DataFrame to a CSV file named 'raw_data.csv'. -- raw_data.to_csv('raw_data.csv') +# Line 33: Saves the `removing_data` DataFrame to a CSV file named 'removing_data.csv'. -- removing_data.to_csv('removing_data.csv') +# Line 35: Writes the `adata` AnnData object to an H5AD file named 'adata_batch.h5ad' with gzip compression. -- adata.write_h5ad('adata_batch.h5ad',compression='gzip') +# Line 38: Creates a dictionary mapping batch identifiers to colors. -- color_dict={ +# Line 39: Maps batch '1' to the second red color from omicverse's utils. -- '1':ov.utils.red_color[1], +# Line 40: Maps batch '2' to the second blue color from omicverse's utils. -- '2':ov.utils.blue_color[1], +# Line 41: Maps batch '3' to the second green color from omicverse's utils. -- '3':ov.utils.green_color[1], +# Line 43: Creates a figure and an axes object for plotting, with a specified figure size. -- fig,ax=plt.subplots( figsize = (20,4)) +# Line 44: Creates a boxplot of the transposed raw data from the `adata` object, with filled boxes. -- bp=plt.boxplot(adata.to_df().T,patch_artist=True) +# Line 45: Iterates through the boxes and batch labels of the data. -- for i,batch in zip(range(adata.shape[0]),adata.obs['batch']): +# Line 46: Sets the fill color of each boxplot to a color determined by the batch. 
-- bp['boxes'][i].set_facecolor(color_dict[batch]) +# Line 47: Turns off the axis display for the boxplot. -- ax.axis(False) +# Line 48: Displays the plot. -- plt.show() +# Line 50: Creates a figure and an axes object for plotting, with a specified figure size. -- fig,ax=plt.subplots( figsize = (20,4)) +# Line 51: Creates a boxplot of the transposed batch-corrected data from the `adata` object, with filled boxes. -- bp=plt.boxplot(adata.to_df(layer='batch_correction').T,patch_artist=True) +# Line 52: Iterates through the boxes and batch labels of the data. -- for i,batch in zip(range(adata.shape[0]),adata.obs['batch']): +# Line 53: Sets the fill color of each boxplot to a color determined by the batch. -- bp['boxes'][i].set_facecolor(color_dict[batch]) +# Line 54: Turns off the axis display for the boxplot. -- ax.axis(False) +# Line 55: Displays the plot. -- plt.show() +# Line 57: Creates a 'raw' layer in the adata.layers, copying the original data from adata.X. -- adata.layers['raw']=adata.X.copy() +# Line 59: Performs Principal Component Analysis (PCA) on the 'raw' layer of the `adata` object, using 50 components. -- ov.pp.pca(adata,layer='raw',n_pcs=50) +# Line 60: Displays the modified `adata` object after PCA. -- adata +# Line 62: Performs Principal Component Analysis (PCA) on the 'batch_correction' layer of the `adata` object, using 50 components. -- ov.pp.pca(adata,layer='batch_correction',n_pcs=50) +# Line 63: Displays the modified `adata` object after PCA. -- adata +# Line 65: Creates an embedding plot using the raw data PCA results, colored by batch. -- ov.utils.embedding(adata, +# Line 66: Specifies the embedding basis as 'raw|original|X_pca'. -- basis='raw|original|X_pca', +# Line 67: Specifies that the points are colored by 'batch'. -- color='batch', +# Line 68: Specifies smaller frame around the plot. -- frameon='small') +# Line 70: Creates an embedding plot using the batch-corrected data PCA results, colored by batch.
-- ov.utils.embedding(adata, +# Line 71: Specifies the embedding basis as 'batch_correction|original|X_pca'. -- basis='batch_correction|original|X_pca', +# Line 72: Specifies that the points are colored by 'batch'. -- color='batch', +# Line 73: Specifies smaller frame around the plot. -- frameon='small') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py new file mode 100644 index 00000000..03f41109 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py @@ -0,0 +1,105 @@ +```python +# Line 1: Import the omicverse library as ov -- import omicverse as ov +# Line 2: Import the mde function from omicverse.utils -- from omicverse.utils import mde +# Line 3: Import the scanpy library as sc -- import scanpy as sc +# Line 4: Import the scvelo library as scv -- import scvelo as scv +# Line 5: Set plotting style using ov.plot_set() -- ov.plot_set() +# Line 8: Load the dentategyrus dataset from scvelo into adata -- adata=scv.datasets.dentategyrus() +# Line 9: Display the AnnData object -- adata +# Line 11: Import the numpy library as np -- import numpy as np +# Line 12: Read bulk RNA-seq data from a file using ov.utils.read -- bulk=ov.utils.read('data/GSE74985_mergedCount.txt.gz',index_col=0) +# Line 13: Map gene IDs in bulk data using ov.bulk.Matrix_ID_mapping -- bulk=ov.bulk.Matrix_ID_mapping(bulk,'genesets/pair_GRCm39.tsv') +# Line 14: Display the first few rows of the bulk data -- bulk.head() +# Line 16: Create a BulkTrajBlend object using bulk and single-cell data -- bulktb=ov.bulk2single.BulkTrajBlend(bulk_seq=bulk,single_seq=adata, +# Line 17: Specify the bulk groups for the BulkTrajBlend object -- bulk_group=['dg_d_1','dg_d_2','dg_d_3'], +# Line 18: Specify cell type key for the BulkTrajBlend object -- celltype_key='clusters',) +# Line 20: Configure the VAE model
within BulkTrajBlend with 100 target cells -- bulktb.vae_configure(cell_target_num=100) +# Line 23: Train the VAE model using specific parameters and save it -- vae_net=bulktb.vae_train( +# Line 24: Set batch size to 512 -- batch_size=512, +# Line 25: Set learning rate to 1e-4 -- learning_rate=1e-4, +# Line 26: Set hidden size to 256 -- hidden_size=256, +# Line 27: Set the number of training epochs to 3500 -- epoch_num=3500, +# Line 28: Set the directory to save the VAE model -- vae_save_dir='data/bulk2single/save_model', +# Line 29: Set the filename to save the VAE model -- vae_save_name='dg_btb_vae', +# Line 30: Set the directory to save generated data -- generate_save_dir='data/bulk2single/output', +# Line 31: Set the filename to save generated data -- generate_save_name='dg_btb') +# Line 33: Load the pretrained VAE model from specified path -- bulktb.vae_load('data/bulk2single/save_model/dg_btb_vae.pth') +# Line 35: Generate new data using the loaded VAE and specified leiden size -- generate_adata=bulktb.vae_generate(leiden_size=25) +# Line 37: Plot the cell proportion after bulk-to-single-cell mapping using generate_adata -- ov.bulk2single.bulk2single_plot_cellprop(generate_adata,celltype_key='clusters', +# Line 38: Close parenthesis -- ) +# Line 40: Configure the GNN model in BulkTrajBlend with specific parameters -- bulktb.gnn_configure(max_epochs=2000,use_rep='X', +# Line 41: Specify the neighbor representation for the GNN -- neighbor_rep='X_pca') +# Line 43: Train the GNN model -- bulktb.gnn_train() +# Line 45: Load the trained GNN model from the specified path -- bulktb.gnn_load('save_model/gnn.pth') +# Line 47: Generate the results from the GNN model -- res_pd=bulktb.gnn_generate() +# Line 48: Display the first few rows of the GNN results -- res_pd.head() +# Line 50: Compute and store MDE coordinates in the 'X_mde' slot of the AnnData object -- bulktb.nocd_obj.adata.obsm["X_mde"] = mde(bulktb.nocd_obj.adata.obsm["X_pca"]) +# Line 51: Plot the MDE 
embedding colored by clusters and nocd_n with specified parameters -- sc.pl.embedding(bulktb.nocd_obj.adata,basis='X_mde',color=['clusters','nocd_n'],wspace=0.4, +# Line 52: Specify palette -- palette=ov.utils.pyomic_palette()) +# Line 54: Plot the MDE embedding for cells without '-' in 'nocd_n', colored by clusters and nocd_n -- sc.pl.embedding(bulktb.nocd_obj.adata[~bulktb.nocd_obj.adata.obs['nocd_n'].str.contains('-')], +# Line 55: Specify basis, color, and spacing for plot -- basis='X_mde', +# Line 56: Specify color and spacing -- color=['clusters','nocd_n'], +# Line 57: Specify spacing and palette -- wspace=0.4,palette=sc.pl.palettes.default_102) +# Line 59: Print the number of raw cells -- print('raw cells: ',bulktb.single_seq.shape[0]) +# Line 61: Interpolate cells based on 'OPC' -- adata1=bulktb.interpolation('OPC') +# Line 62: Print the number of interpolated cells -- print('interpolation cells: ',adata1.shape[0]) +# Line 64: Store the raw data in the raw slot of adata1 -- adata1.raw = adata1 +# Line 65: Identify highly variable genes in adata1 -- sc.pp.highly_variable_genes(adata1, min_mean=0.0125, max_mean=3, min_disp=0.5) +# Line 66: Subset adata1 to keep only highly variable genes -- adata1 = adata1[:, adata1.var.highly_variable] +# Line 67: Scale the gene expression data in adata1 -- sc.pp.scale(adata1, max_value=10) +# Line 69: Perform PCA on adata1 with 100 components -- sc.tl.pca(adata1, n_comps=100, svd_solver="auto") +# Line 71: Normalize the total counts in the original AnnData object -- sc.pp.normalize_total(adata, target_sum=1e4) +# Line 72: Apply log1p transformation to the normalized counts -- sc.pp.log1p(adata) +# Line 73: Store the raw data in the raw slot of adata -- adata.raw = adata +# Line 74: Identify highly variable genes in the original AnnData object -- sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) +# Line 75: Subset adata to keep only highly variable genes -- adata = adata[:, 
adata.var.highly_variable] +# Line 76: Scale the gene expression data in the original AnnData object -- sc.pp.scale(adata, max_value=10) +# Line 78: Perform PCA on the original AnnData object with 100 components -- sc.tl.pca(adata, n_comps=100, svd_solver="auto") +# Line 80: Compute and store MDE coordinates in the 'X_mde' slot of the original AnnData object -- adata.obsm["X_mde"] = mde(adata.obsm["X_pca"]) +# Line 81: Compute and store MDE coordinates in the 'X_mde' slot of the interpolated AnnData object -- adata1.obsm["X_mde"] = mde(adata1.obsm["X_pca"]) +# Line 83: Generate and display an MDE embedding plot for the original data, colored by clusters -- ov.utils.embedding(adata, +# Line 84: Specify basis, color, frameon, spacing, and palette for plot -- basis='X_mde', +# Line 85: Specify color for plot -- color=['clusters'], +# Line 86: Specify frameon -- frameon='small', +# Line 87: Specify spacing and palette -- wspace=0.4,palette=sc.pl.palettes.default_102) +# Line 89: Generate and display an MDE embedding plot for the interpolated data, colored by clusters -- ov.utils.embedding(adata1, +# Line 90: Specify basis, color, frameon, spacing, and palette for plot -- basis='X_mde', +# Line 91: Specify color for plot -- color=['clusters'], +# Line 92: Specify frameon -- frameon='small', +# Line 93: Specify spacing and palette -- wspace=0.4,palette=sc.pl.palettes.default_102) +# Line 95: Create a pyVIA object for the original data with specified parameters -- v0 = ov.single.pyVIA(adata=adata,adata_key='X_pca',adata_ncomps=100, basis='X_mde', +# Line 96: Specify clusters, knn, random seed, root user, and dataset -- clusters='clusters',knn=20,random_seed=4,root_user=['nIPC'], +# Line 97: Specify dataset -- dataset='group') +# Line 98: Run the pyVIA analysis for the original data -- v0.run() +# Line 100: Create a pyVIA object for the interpolated data with specified parameters -- v1 = ov.single.pyVIA(adata=adata1,adata_key='X_pca',adata_ncomps=100, basis='X_mde', +# 
Line 101: Specify clusters, knn, random seed, root user and dataset -- clusters='clusters',knn=15,random_seed=4,root_user=['Neuroblast'], +# Line 103: Specify dataset -- dataset='group') +# Line 105: Run the pyVIA analysis for the interpolated data -- v1.run() +# Line 107: Import the matplotlib.pyplot module as plt -- import matplotlib.pyplot as plt +# Line 108: Create and display a stream plot for the original data -- fig,ax=v0.plot_stream(basis='X_mde',clusters='clusters', +# Line 109: Set plotting parameters for the stream plot -- density_grid=0.8, scatter_size=30, scatter_alpha=0.3, linewidth=0.5) +# Line 110: Set the title of the stream plot for the original data -- plt.title('Raw Dentategyrus',fontsize=12) +# Line 113: Create and display a stream plot for the interpolated data -- fig,ax=v1.plot_stream(basis='X_mde',clusters='clusters', +# Line 114: Set plotting parameters for the stream plot -- density_grid=0.8, scatter_size=30, scatter_alpha=0.3, linewidth=0.5) +# Line 115: Set the title of the stream plot for the interpolated data -- plt.title('Interpolation Dentategyrus',fontsize=12) +# Line 118: Create and display a stream plot of pseudo time for the original data -- fig,ax=v0.plot_stream(basis='X_mde',density_grid=0.8, scatter_size=30, color_scheme='time', linewidth=0.5, +# Line 119: Set plotting parameters for pseudo time stream plot -- min_mass = 1, cutoff_perc = 5, scatter_alpha=0.3, marker_edgewidth=0.1, +# Line 120: Set plotting parameters for pseudo time stream plot -- density_stream = 2, smooth_transition=1, smooth_grid=0.5) +# Line 121: Set the title of the pseudo time stream plot for the original data -- plt.title('Raw Dentategyrus\nPseudoTime',fontsize=12) +# Line 123: Create and display a stream plot of pseudo time for the interpolated data -- fig,ax=v1.plot_stream(basis='X_mde',density_grid=0.8, scatter_size=30, color_scheme='time', linewidth=0.5, +# Line 124: Set plotting parameters for pseudo time stream plot -- min_mass = 1, cutoff_perc = 
5, scatter_alpha=0.3, marker_edgewidth=0.1, +# Line 125: Set plotting parameters for pseudo time stream plot -- density_stream = 2, smooth_transition=1, smooth_grid=0.5) +# Line 126: Set the title of the pseudo time stream plot for the interpolated data -- plt.title('Interpolation Dentategyru\nPseudoTime',fontsize=12) +# Line 128: Compute pseudotime using pyVIA and store in the original AnnData -- v0.get_pseudotime(adata) +# Line 129: Compute neighbors using PCA embeddings for the original AnnData -- sc.pp.neighbors(adata,n_neighbors= 15,use_rep='X_pca') +# Line 130: Calculate PAGA graph using pseudotime as prior for the original AnnData -- ov.utils.cal_paga(adata,use_time_prior='pt_via',vkey='paga', +# Line 131: Specify group -- groups='clusters') +# Line 133: Generate and display a PAGA graph for the original AnnData -- ov.utils.plot_paga(adata,basis='mde', size=50, alpha=.1,title='PAGA LTNN-graph', +# Line 134: Set plotting parameters -- min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False) +# Line 136: Compute pseudotime using pyVIA and store in the interpolated AnnData -- v1.get_pseudotime(adata1) +# Line 137: Compute neighbors using PCA embeddings for the interpolated AnnData -- sc.pp.neighbors(adata1,n_neighbors= 15,use_rep='X_pca') +# Line 138: Calculate PAGA graph using pseudotime as prior for the interpolated AnnData -- ov.utils.cal_paga(adata1,use_time_prior='pt_via',vkey='paga', +# Line 139: Specify group -- groups='clusters') +# Line 141: Generate and display a PAGA graph for the interpolated AnnData -- ov.utils.plot_paga(adata1,basis='mde', size=50, alpha=.1,title='PAGA LTNN-graph', +# Line 142: Set plotting parameters -- min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py new file mode 100644 index 00000000..b52d8c00 --- /dev/null +++ 
b/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py @@ -0,0 +1,96 @@ +```python +# Line 1: Import the omicverse library as ov -- import omicverse as ov +# Line 2: Print the version of the omicverse library. -- print(f'omicverse version:{ov.__version__}') +# Line 3: Import the scanpy library as sc -- import scanpy as sc +# Line 4: Print the version of the scanpy library. -- print(f'scanpy version:{sc.__version__}') +# Line 5: Set plotting defaults for omicverse. -- ov.ov_plot_set() +# Line 11: Read 10X data into an AnnData object from the specified directory, using gene symbols as variable names, and enabling caching. -- adata = sc.read_10x_mtx( +# Line 12: This is a comment describing the directory with the .mtx file -- 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file +# Line 13: This is a comment indicating that gene symbols are used for variable names -- var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) +# Line 14: This is a comment indicating that a cache file is written for faster subsequent reading -- cache=True) # write a cache file for faster subsequent reading +# Line 18: Perform quality control on the AnnData object, filtering cells based on mitochondrial percentage, number of UMIs, and detected genes. -- adata=ov.pp.qc(adata, +# Line 19: This is a comment describing the quality control parameters -- tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250}) +# Line 21: Preprocess the AnnData object, including normalization and calculation of highly variable genes. -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 23: Save the original data in adata.raw and then filter the data to retain only highly variable genes. -- adata.raw = adata +# Line 24: Filter the AnnData object to keep only highly variable genes. -- adata = adata[:, adata.var.highly_variable_features] +# Line 27: Scale the expression data in adata.X. 
-- ov.pp.scale(adata) +# Line 30: Perform principal component analysis (PCA) on the scaled data, retaining the top 50 principal components. -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 33: Construct a neighborhood graph for the AnnData object using the specified number of neighbors, PCs, and representation. -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 34: This is a comment indicating the representation to use for the graph construction. -- use_rep='scaled|original|X_pca') +# Line 37: Perform Leiden clustering on the AnnData object. -- sc.tl.leiden(adata) +# Line 40: Reduce the dimensionality of the AnnData object using MDE for visualization and store it in adata.obsm. -- adata.obsm["X_mde"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) +# Line 41: Display the AnnData object. -- adata +# Line 43: Create a pySCSA object for cell annotation using cellmarker database. -- scsa=ov.single.pySCSA(adata=adata, +# Line 44: This is a comment describing the fold change parameter. -- foldchange=1.5, +# Line 45: This is a comment describing the p-value parameter. -- pvalue=0.01, +# Line 46: This is a comment describing the celltype parameter. -- celltype='normal', +# Line 47: This is a comment describing the target parameter. -- target='cellmarker', +# Line 48: This is a comment describing the tissue parameter. -- tissue='All', +# Line 49: This is a comment describing the model path parameter. -- model_path='temp/pySCSA_2023_v2_plus.db' +# Line 51: Annotate the cells based on leiden clusters using cellmarker database. -- anno=scsa.cell_anno(clustertype='leiden', +# Line 52: This is a comment describing the cluster parameter. -- cluster='all',rank_rep=True) +# Line 54: Automatically annotate cells based on the cellmarker annotations and store in adata. -- scsa.cell_auto_anno(adata,key='scsa_celltype_cellmarker') +# Line 56: Create a pySCSA object for cell annotation using panglaodb database. 
-- scsa=ov.single.pySCSA(adata=adata, +# Line 57: This is a comment describing the fold change parameter. -- foldchange=1.5, +# Line 58: This is a comment describing the p-value parameter. -- pvalue=0.01, +# Line 59: This is a comment describing the celltype parameter. -- celltype='normal', +# Line 60: This is a comment describing the target parameter. -- target='panglaodb', +# Line 61: This is a comment describing the tissue parameter. -- tissue='All', +# Line 62: This is a comment describing the model path parameter. -- model_path='temp/pySCSA_2023_v2_plus.db' +# Line 66: Annotate the cells based on leiden clusters using panglaodb database. -- res=scsa.cell_anno(clustertype='leiden', +# Line 67: This is a comment describing the cluster parameter. -- cluster='all',rank_rep=True) +# Line 69: Print the cell annotations. -- scsa.cell_anno_print() +# Line 71: Automatically annotate cells based on the panglaodb annotations and store in adata. -- scsa.cell_auto_anno(adata,key='scsa_celltype_panglaodb') +# Line 73: Generate and display an embedding plot of cells colored by Leiden clusters, cellmarker annotations, and panglaodb annotations. -- ov.utils.embedding(adata, +# Line 74: This is a comment describing the basis of the embedding. -- basis='X_mde', +# Line 75: This is a comment describing the colors of the plot. -- color=['leiden','scsa_celltype_cellmarker','scsa_celltype_panglaodb'], +# Line 76: This is a comment describing the legend location. -- legend_loc='on data', +# Line 77: This is a comment describing the frame and legend_fontoutline parameters. -- frameon='small', +# Line 78: This is a comment describing the legend font outline -- legend_fontoutline=2, +# Line 79: This is a comment describing the color palette -- palette=ov.utils.palette()[14:], +# Line 82: Add a 'group' column to adata.obs and set it to 'A' for all cells initially. -- adata.obs['group']='A' +# Line 83: Set the 'group' column to 'B' for the first 1000 cells. 
-- adata.obs.loc[adata.obs.index[:1000],'group']='B' +# Line 85: Generate and display an embedding plot of cells colored by group, using red color palette. -- ov.utils.embedding(adata, +# Line 86: This is a comment describing the basis of the embedding. -- basis='X_mde', +# Line 87: This is a comment describing the color of the plot. -- color=['group'], +# Line 88: This is a comment describing the frame and legend_fontoutline parameters. -- frameon='small',legend_fontoutline=2, +# Line 89: This is a comment describing the red color palette -- palette=ov.utils.red_color, +# Line 92: Generate and display a cell proportion plot based on cellmarker cell types and sample groups. -- ov.utils.plot_cellproportion(adata=adata,celltype_clusters='scsa_celltype_cellmarker', +# Line 93: This is a comment describing the visual clusters parameter. -- visual_clusters='group', +# Line 94: This is a comment describing the visual_name parameter. -- visual_name='group',figsize=(2,4)) +# Line 96: Generate and display an embedding plot showing the cell type annotations, and adjust title and cell type and embedding range parameters -- ov.utils.plot_embedding_celltype(adata,figsize=None,basis='X_mde', +# Line 97: This is a comment describing the celltype_key parameter -- celltype_key='scsa_celltype_cellmarker', +# Line 98: This is a comment describing the title parameter -- title=' Cell type', +# Line 99: This is a comment describing the celltype_range parameter -- celltype_range=(2,6), +# Line 100: This is a comment describing the embedding_range parameter -- embedding_range=(4,10),) +# Line 102: Calculate the ratio of observed to expected cell proportions for each cell type in each group. 
-- roe=ov.utils.roe(adata,sample_key='group',cell_type_key='scsa_celltype_cellmarker') +# Line 104: Import the seaborn plotting library as sns -- import seaborn as sns +# Line 105: Import the pyplot module from matplotlib library as plt -- import matplotlib.pyplot as plt +# Line 106: Create a new figure and an axes for the heatmap with a specified figsize. -- fig, ax = plt.subplots(figsize=(2,4)) +# Line 108: Copy the roe data for transformation. -- transformed_roe = roe.copy() +# Line 109: Transform the roe data based on different thresholds into a symbolic data. -- transformed_roe = transformed_roe.applymap( +# Line 110: This is a comment describing transformation of value >=2 to '+++', value >=1.5 to '++', value >=1 to '+', else to '+/-' -- lambda x: '+++' if x >= 2 else ('++' if x >= 1.5 else ('+' if x >= 1 else '+/-'))) +# Line 112: Create a heatmap with the transformed Ro/e data and add symbolic annotations. -- sns.heatmap(roe, annot=transformed_roe, cmap='RdBu_r', fmt='', +# Line 113: This is a comment describing cbar and ax parameters -- cbar=True, ax=ax,vmin=0.5,vmax=1.5,cbar_kws={'shrink':0.5}) +# Line 114: Adjust the size of the xtick labels. -- plt.xticks(fontsize=12) +# Line 115: Adjust the size of the ytick labels. -- plt.yticks(fontsize=12) +# Line 117: Label the x axis as 'Group'. -- plt.xlabel('Group',fontsize=13) +# Line 118: Label the y axis as 'Cell type'. -- plt.ylabel('Cell type',fontsize=13) +# Line 119: Set the title of heatmap as 'Ro/e'. -- plt.title('Ro/e',fontsize=13) +# Line 121: This is a dictionary defining marker genes for cell types. 
-- res_marker_dict={ +# Line 131: Compute and store a dendrogram for the leiden clusters -- sc.tl.dendrogram(adata,'leiden') +# Line 132: Create a dotplot of gene expression for the specified marker genes, grouped by Leiden cluster -- sc.pl.dotplot(adata, res_marker_dict, 'leiden', +# Line 133: Show the dendrogram alongside the dotplot and standardize expression per variable (standard_scale='var') -- dendrogram=True,standard_scale='var') +# Line 136: Create a dictionary mapping cluster ID to cell type labels. -- cluster2annotation = { +# Line 150: Annotate the AnnData object with cell types based on the cluster2annotation dictionary. -- ov.single.scanpy_cellanno_from_dict(adata,anno_dict=cluster2annotation, +# Line 151: Use the 'leiden' clustering as the cluster key for the annotation -- clustertype='leiden') +# Line 153: Generate and display an embedding plot with major cell types and scsa cellmarker annotations. -- ov.utils.embedding(adata, +# Line 154: Use 'X_mde' as the embedding basis. -- basis='X_mde', +# Line 155: Color the cells by 'major_celltype' and 'scsa_celltype_cellmarker'. -- color=['major_celltype','scsa_celltype_cellmarker'], +# Line 156: Place the legend on the data, use a 'small' frame, and set the legend font outline to 2. -- legend_loc='on data', frameon='small',legend_fontoutline=2, +# Line 157: Use the omicverse palette from its 15th color onward (index 14) -- palette=ov.utils.palette()[14:], +# Line 160: Get a dictionary of marker genes for each cell type based on scsa_celltype_cellmarker annotations. -- marker_dict=ov.single.get_celltype_marker(adata,clustertype='scsa_celltype_cellmarker') +# Line 161: Display the keys of the marker dictionary. -- marker_dict.keys() +# Line 163: Display the marker genes for the 'B cell' cell type. -- marker_dict['B cell'] +# Line 165: Get a list of tissues in the pySCSA database.
-- scsa.get_model_tissue() +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cellfate_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cellfate_annotated.py new file mode 100644 index 00000000..3eb69c92 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_cellfate_annotated.py @@ -0,0 +1,59 @@ +``` +# Line 1: import omicverse as ov -- import omicverse as ov +# Line 3: import scanpy as sc -- import scanpy as sc +# Line 5: import pandas as pd -- import pandas as pd +# Line 6: from tqdm.auto import tqdm -- from tqdm.auto import tqdm +# Line 7: ov.plot_set() -- ov.plot_set() +# Line 9: adata = ov.single.mouse_hsc_nestorowa16() -- adata = ov.single.mouse_hsc_nestorowa16() +# Line 10: adata -- adata +# Line 12: prior_network = ov.single.load_human_prior_interaction_network(dataset='nichenet') -- prior_network = ov.single.load_human_prior_interaction_network(dataset='nichenet') +# Line 15: prior_network = ov.single.convert_human_to_mouse_network(prior_network,server_name='asia') -- prior_network = ov.single.convert_human_to_mouse_network(prior_network,server_name='asia') +# Line 16: prior_network -- prior_network +# Line 18: prior_network.to_csv('result/combined_network_Mouse.txt.gz',sep='\t') -- prior_network.to_csv('result/combined_network_Mouse.txt.gz',sep='\t') +# Line 20: prior_network=ov.read('result/combined_network_Mouse.txt.gz',index_col=0) -- prior_network=ov.read('result/combined_network_Mouse.txt.gz',index_col=0) +# Line 22: CEFCON_obj = ov.single.pyCEFCON(adata, prior_network, repeats=5, solver='GUROBI') -- CEFCON_obj = ov.single.pyCEFCON(adata, prior_network, repeats=5, solver='GUROBI') +# Line 23: CEFCON_obj -- CEFCON_obj +# Line 25: CEFCON_obj.preprocess() -- CEFCON_obj.preprocess() +# Line 27: CEFCON_obj.train() -- CEFCON_obj.train() +# Line 29: CEFCON_obj.predicted_driver_regulators() -- CEFCON_obj.predicted_driver_regulators() +# Line 31: 
CEFCON_obj.cefcon_results_dict['E_pseudotime'].driver_regulator.head() -- CEFCON_obj.cefcon_results_dict['E_pseudotime'].driver_regulator.head() +# Line 33: CEFCON_obj.predicted_RGM() -- CEFCON_obj.predicted_RGM() +# Line 35: CEFCON_obj.cefcon_results_dict['E_pseudotime'] -- CEFCON_obj.cefcon_results_dict['E_pseudotime'] +# Line 37: lineage = 'E_pseudotime' -- lineage = 'E_pseudotime' +# Line 38: result = CEFCON_obj.cefcon_results_dict[lineage] -- result = CEFCON_obj.cefcon_results_dict[lineage] +# Line 40: gene_ad=sc.AnnData(result.gene_embedding) -- gene_ad=sc.AnnData(result.gene_embedding) +# Line 41: sc.pp.neighbors(gene_ad, n_neighbors=30, use_rep='X') -- sc.pp.neighbors(gene_ad, n_neighbors=30, use_rep='X') +# Line 43: sc.tl.leiden(gene_ad, resolution=1) -- sc.tl.leiden(gene_ad, resolution=1) +# Line 44: sc.tl.umap(gene_ad, n_components=2, min_dist=0.3) -- sc.tl.umap(gene_ad, n_components=2, min_dist=0.3) +# Line 46: ov.utils.embedding(gene_ad,basis='X_umap',legend_loc='on data', -- ov.utils.embedding(gene_ad,basis='X_umap',legend_loc='on data', +# Line 47: legend_fontsize=8, legend_fontoutline=2, -- legend_fontsize=8, legend_fontoutline=2, +# Line 48: color='leiden',frameon='small',title='Leiden clustering using CEFCON\nderived gene embeddings') -- color='leiden',frameon='small',title='Leiden clustering using CEFCON\nderived gene embeddings') +# Line 50: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 51: import seaborn as sns -- import seaborn as sns +# Line 52: data_for_plot = result.driver_regulator[result.driver_regulator['is_driver_regulator']] -- data_for_plot = result.driver_regulator[result.driver_regulator['is_driver_regulator']] +# Line 53: data_for_plot = data_for_plot[0:20] -- data_for_plot = data_for_plot[0:20] +# Line 55: plt.figure(figsize=(2, 20 * 0.2)) -- plt.figure(figsize=(2, 20 * 0.2)) +# Line 56: sns.set_theme(style='ticks', font_scale=0.5) -- sns.set_theme(style='ticks', font_scale=0.5) +# Line 58: ax = 
sns.barplot(x='influence_score', y=data_for_plot.index, data=data_for_plot, orient='h', -- ax = sns.barplot(x='influence_score', y=data_for_plot.index, data=data_for_plot, orient='h', +# Line 59: palette=sns.color_palette(f"ch:start=.5,rot=-.5,reverse=1,dark=0.4", n_colors=20)) -- palette=sns.color_palette(f"ch:start=.5,rot=-.5,reverse=1,dark=0.4", n_colors=20)) +# Line 60: ax.set_title(result.name) -- ax.set_title(result.name) +# Line 61: ax.set_xlabel('Influence score') -- ax.set_xlabel('Influence score') +# Line 62: ax.set_ylabel('Driver regulators') -- ax.set_ylabel('Driver regulators') +# Line 64: ax.spines['left'].set_position(('outward', 10)) -- ax.spines['left'].set_position(('outward', 10)) +# Line 65: ax.spines['bottom'].set_position(('outward', 10)) -- ax.spines['bottom'].set_position(('outward', 10)) +# Line 66: plt.xticks(fontsize=12) -- plt.xticks(fontsize=12) +# Line 67: plt.yticks(fontsize=12) -- plt.yticks(fontsize=12) +# Line 69: plt.grid(False) -- plt.grid(False) +# Line 70: ax.spines['top'].set_visible(False) -- ax.spines['top'].set_visible(False) +# Line 71: ax.spines['right'].set_visible(False) -- ax.spines['right'].set_visible(False) +# Line 72: ax.spines['bottom'].set_visible(True) -- ax.spines['bottom'].set_visible(True) +# Line 73: ax.spines['left'].set_visible(True) -- ax.spines['left'].set_visible(True) +# Line 75: plt.title('E_pseudotime',fontsize=12) -- plt.title('E_pseudotime',fontsize=12) +# Line 76: plt.xlabel('Influence score',fontsize=12) -- plt.xlabel('Influence score',fontsize=12) +# Line 77: plt.ylabel('Driver regulon',fontsize=12) -- plt.ylabel('Driver regulon',fontsize=12) +# Line 79: sns.despine() -- sns.despine() +# Line 81: result.plot_driver_genes_Venn() -- result.plot_driver_genes_Venn() +# Line 83: adata_lineage = adata[adata.obs_names[adata.obs[result.name].notna()],:] -- adata_lineage = adata[adata.obs_names[adata.obs[result.name].notna()],:] +# Line 85: 
result.plot_RGM_activity_heatmap(cell_label=adata_lineage.obs['cell_type_finely'], -- result.plot_RGM_activity_heatmap(cell_label=adata_lineage.obs['cell_type_finely'], +# Line 86: type='out',col_cluster=True,bbox_to_anchor=(1.48, 0.25)) -- type='out',col_cluster=True,bbox_to_anchor=(1.48, 0.25)) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py new file mode 100644 index 00000000..6e330f0c --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py @@ -0,0 +1,165 @@ +```python +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the scvelo library as scv. -- import scvelo as scv +# Line 3: Import the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt +# Line 4: Set the plotting parameters using omicverse's ov_plot_set function. -- ov.ov_plot_set() +# Line 5: Load the dentategyrus dataset from scvelo into an AnnData object named adata. -- adata = scv.datasets.dentategyrus() +# Line 6: Display the adata object. -- adata +# Line 7: Apply quality control filtering to the adata object using omicverse's qc function, with specified thresholds for mitochondrial percentage, number of UMIs, and number of detected genes. -- adata=ov.pp.qc(adata, +# Line 8: Store the 'counts' layer of the adata object using omicverse's store_layers function. -- tresh={'mito_perc': 0.15, 'nUMIs': 500, 'detected_genes': 250}, +# Line 9: -- ) +# Line 10: Store the 'counts' layer of the adata object using omicverse's store_layers function. -- ov.utils.store_layers(adata,layers='counts') +# Line 11: Display the adata object. -- adata +# Line 12: Preprocess the adata object using omicverse's preprocess function, applying shiftlog and pearson normalization, selecting 2000 highly variable genes. 
-- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson', +# Line 13: -- n_HVGs=2000) +# Line 14: Store the current state of adata into adata.raw before performing further operations. -- adata.raw = adata +# Line 15: Subset the adata object to only include highly variable genes. -- adata = adata[:, adata.var.highly_variable_features] +# Line 16: Display the adata object. -- adata +# Line 17: Scale the data in the adata object using omicverse's scale function. -- ov.pp.scale(adata) +# Line 18: Perform principal component analysis (PCA) on the scaled data in the adata object, keeping 50 PCs, using omicverse's pca function. -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 19: Compute the minimum distortion embedding (MDE) using omicverse's utils.mde function, using PCA embedding result, and stores it in the obsm slot of the adata. -- adata.obsm["X_mde_pca"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) +# Line 20: Convert the raw counts stored in adata.raw into an AnnData object and assigns it back to adata. -- adata=adata.raw.to_adata() +# Line 21: Create a figure and an axes object using matplotlib for plotting. -- fig, ax = plt.subplots(figsize=(3,3)) +# Line 22: Generate an embedding plot using omicverse's embedding function, with 'X_mde_pca' as basis, displaying clusters and setting plotting options. -- ov.utils.embedding(adata, +# Line 23: -- basis='X_mde_pca',frameon='small', +# Line 24: -- color=['clusters'],show=False,ax=ax) +# Line 25: Import the SEACells library. -- import SEACells +# Line 26: Subset the adata object to remove 'Endothelial' cells based on their cluster assignment. -- adata=adata[adata.obs['clusters']!='Endothelial'] +# Line 27: Initialize a SEACells model with specified parameters such as kernel building basis, number of SEACells, and number of waypoint eigenvectors. 
-- model = SEACells.core.SEACells(adata, +# Line 28: -- build_kernel_on='scaled|original|X_pca', +# Line 29: -- n_SEACells=200, +# Line 30: -- n_waypoint_eigs=10, +# Line 31: -- convergence_epsilon = 1e-5) +# Line 32: Construct the kernel matrix for the SEACells model. -- model.construct_kernel_matrix() +# Line 33: Store the kernel matrix in the variable M. -- M = model.kernel_matrix +# Line 34: Initialize archetypes for the SEACells model. -- # Initialize archetypes +# Line 35: Initialize archetypes for the SEACells model. -- model.initialize_archetypes() +# Line 36: Fit the SEACells model with minimum and maximum iteration limits. -- model.fit(min_iter=10, max_iter=50) +# Line 37: Enable inline plotting for matplotlib. -- # Check for convergence +# Line 38: Enable inline plotting for matplotlib. -- %matplotlib inline +# Line 39: Plot the convergence of the SEACells model using model.plot_convergence(). -- model.plot_convergence() +# Line 40: Print the number of iterations the model has run. -- # You can force the model to run additional iterations step-wise using the .step() function +# Line 41: Print the number of iterations the model has run. -- print(f'Run for {len(model.RSS_iters)} iterations') +# Line 42: Run the model for 10 additional steps. -- for _ in range(10): +# Line 43: Run the model for 10 additional steps. -- model.step() +# Line 44: Print the updated number of iterations. -- print(f'Run for {len(model.RSS_iters)} iterations') +# Line 45: Enable inline plotting for matplotlib. -- # Check for convergence +# Line 46: Enable inline plotting for matplotlib. -- %matplotlib inline +# Line 47: Plot the convergence of the SEACells model after additional iterations. -- model.plot_convergence() +# Line 48: Enable inline plotting for matplotlib. 
-- %matplotlib inline +# Line 49: Generate a 2D plot using SEACells.plot.plot_2D function, visualizing the mde_pca embedding with specified parameters, while disabling the plotting of meta cells -- SEACells.plot.plot_2D(adata, key='X_mde_pca', colour_metacells=False, +# Line 50: -- figsize=(4,4),cell_size=20,title='Dentategyrus Metacells', +# Line 51: -- ) +# Line 52: Store the current state of adata into adata.raw for later use. -- adata.raw=adata.copy() +# Line 53: Generate a soft SEACell representation of the data by summarizing the data according to the soft SEACell matrix and celltype labels. -- SEACell_soft_ad = SEACells.core.summarize_by_soft_SEACell(adata, model.A_, +# Line 54: -- celltype_label='clusters', +# Line 55: -- summarize_layer='raw', minimum_weight=0.05) +# Line 56: Display the resulting AnnData object, SEACell_soft_ad. -- SEACell_soft_ad +# Line 57: Import scanpy library as sc. -- import scanpy as sc +# Line 58: Store a copy of the SEACell_soft_ad into the raw attribute. -- SEACell_soft_ad.raw=SEACell_soft_ad.copy() +# Line 59: Calculate highly variable genes using Scanpy's function and stores the result inplace. -- sc.pp.highly_variable_genes(SEACell_soft_ad, n_top_genes=2000, inplace=True) +# Line 60: Subset the SEACell_soft_ad object to include only highly variable genes. -- SEACell_soft_ad=SEACell_soft_ad[:,SEACell_soft_ad.var.highly_variable] +# Line 61: Scale the data in SEACell_soft_ad using omicverse's scale function. -- ov.pp.scale(SEACell_soft_ad) +# Line 62: Perform PCA on the scaled data in SEACell_soft_ad, using omicverse's pca function. -- ov.pp.pca(SEACell_soft_ad,layer='scaled',n_pcs=50) +# Line 63: Compute a neighborhood graph of the SEACell_soft_ad using Scanpy's neighbor function, using the pca embedding. -- sc.pp.neighbors(SEACell_soft_ad, use_rep='scaled|original|X_pca') +# Line 64: Calculate UMAP embedding of the SEACell_soft_ad using Scanpy's umap function. 
-- sc.tl.umap(SEACell_soft_ad) +# Line 65: Convert the 'celltype' column in the obs attribute of the SEACell_soft_ad object to a categorical type. -- SEACell_soft_ad.obs['celltype']=SEACell_soft_ad.obs['celltype'].astype('category') +# Line 66: Reorder the categories in the 'celltype' column to match the order of categories from the original adata clusters. -- SEACell_soft_ad.obs['celltype']=SEACell_soft_ad.obs['celltype'].cat.reorder_categories(adata.obs['clusters'].cat.categories) +# Line 67: Copy the color mapping from the original adata's clusters to the celltype colors in the SEACell_soft_ad object. -- SEACell_soft_ad.uns['celltype_colors']=adata.uns['clusters_colors'] +# Line 68: Import matplotlib.pyplot as plt. -- import matplotlib.pyplot as plt +# Line 69: Create a figure and an axes object using matplotlib. -- fig, ax = plt.subplots(figsize=(3,3)) +# Line 70: Generate an embedding plot using omicverse's embedding function, visualizing the UMAP embedding and coloring by celltype. -- ov.utils.embedding(SEACell_soft_ad, +# Line 71: -- basis='X_umap', +# Line 72: -- color=["celltype"], +# Line 73: -- title='Meta Celltype', +# Line 74: -- frameon='small', +# Line 75: -- legend_fontsize=12, +# Line 76: -- #palette=ov.utils.palette()[11:], +# Line 77: -- ax=ax, +# Line 78: -- show=False) +# Line 79: Initialize a pyVIA object using omicverse's single.pyVIA function with specified parameters for trajectory inference. 
-- v0 = ov.single.pyVIA(adata=SEACell_soft_ad,adata_key='scaled|original|X_pca', +# Line 80: -- adata_ncomps=50, basis='X_umap', +# Line 81: -- clusters='celltype',knn=10, root_user=['nIPC','Neuroblast'], +# Line 82: -- dataset='group', +# Line 83: -- random_seed=112,is_coarse=True, +# Line 84: -- preserve_disconnected=True, +# Line 85: -- piegraph_arrow_head_width=0.05,piegraph_edgeweight_scalingfactor=2.5, +# Line 86: -- gene_matrix=SEACell_soft_ad.X,velo_weight=0.5, +# Line 87: -- edgebundle_pruning_twice=False, edgebundle_pruning=0.15, +# Line 88: -- jac_std_global=0.05,too_big_factor=0.05, +# Line 89: -- cluster_graph_pruning_std=1, +# Line 90: -- time_series=False, +# Line 91: -- ) +# Line 92: Run the pyVIA trajectory inference. -- v0.run() +# Line 93: Calculate and store pseudotime in the SEACell_soft_ad object using the pyVIA results. -- v0.get_pseudotime(SEACell_soft_ad) +# Line 94: Import matplotlib.pyplot as plt. -- #v0.get_pseudotime(SEACell_soft_ad) +# Line 95: Import matplotlib.pyplot as plt. -- import matplotlib.pyplot as plt +# Line 96: Create a figure and an axes object using matplotlib for plotting. -- fig, ax = plt.subplots(figsize=(3,3)) +# Line 97: Generate an embedding plot using omicverse's embedding function, visualizing the UMAP embedding and coloring by pseudotime. -- ov.utils.embedding(SEACell_soft_ad, +# Line 98: -- basis='X_umap', +# Line 99: -- color=["pt_via"], +# Line 100: -- title='Pseudotime', +# Line 101: -- frameon='small', +# Line 102: -- cmap='Reds', +# Line 103: -- #size=40, +# Line 104: -- legend_fontsize=12, +# Line 105: -- #palette=ov.utils.palette()[11:], +# Line 106: -- ax=ax, +# Line 107: -- show=False) +# Line 108: Write the SEACell_soft_ad object to an h5ad file with gzip compression. -- SEACell_soft_ad.write_h5ad('data/tutorial_meta_den.h5ad',compression='gzip') +# Line 109: Read the h5ad file into the SEACell_soft_ad object using omicverse's utils.read function. 
-- SEACell_soft_ad=ov.utils.read('data/tutorial_meta_den.h5ad') +# Line 110: Initialize a cellfategenie object using omicverse's single.cellfategenie function with pseudotime. -- cfg_obj=ov.single.cellfategenie(SEACell_soft_ad,pseudotime='pt_via') +# Line 111: Initialize the cellfategenie model. -- cfg_obj.model_init() +# Line 112: Run the ATR filtering method to filter the data with specified stop and flux parameters. -- cfg_obj.ATR(stop=500,flux=0.01) +# Line 113: Generate and display the filtering plot using the cellfategenie object. -- fig,ax=cfg_obj.plot_filtering(color='#5ca8dc') +# Line 114: Add a title to the filtering plot. -- ax.set_title('Dentategyrus Metacells\nCellFateGenie') +# Line 115: Fit the cellfategenie model. -- res=cfg_obj.model_fit() +# Line 116: Plot the gene fitting curves using the raw gene expression for each cell type. -- cfg_obj.plot_color_fitting(type='raw',cluster_key='celltype') +# Line 117: Plot the gene fitting curves using the filtered gene expression for each cell type. -- cfg_obj.plot_color_fitting(type='filter',cluster_key='celltype') +# Line 118: Calculate the Kendall Tau correlation for each gene after filtering and return results. -- kt_filter=cfg_obj.kendalltau_filter() +# Line 119: Display the top few rows of the Kendall Tau filtering results. -- kt_filter.head() +# Line 120: Select the variable names (genes) whose p-value is less than average p-value in the kendall tau filtered table. -- var_name=kt_filter.loc[kt_filter['pvalue']=0. +# Line 62: result_precision = 3, # Sets the rounding for the mean values in significan_means. +# Line 63: pvalue = 0.05, # P-value threshold to employ for significance. +# Line 64: subsampling = False, # To enable subsampling the data (geometri sketching). +# Line 65: subsampling_log = False, # (mandatory) enable subsampling log1p for non log-transformed data inputs. +# Line 66: subsampling_num_pc = 100, # Number of componets to subsample via geometric skectching (dafault: 100). 
+# Line 67: subsampling_num_cells = 1000, # Number of cells to subsample (integer) (default: 1/3 of the dataset). +# Line 68: separator = '|', # Sets the string to employ to separate cells in the results dataframes "cellA|CellB". +# Line 69: debug = False, # Saves all intermediate tables employed during the analysis in pkl format. +# Line 70: output_path = out_path, # Path to save results. +# Line 71: output_suffix = None # Replaces the timestamp in the output files by a user defined string in the (default: None). +# Line 73: Saves the CellphoneDB results to a pickle file. -- ov.utils.save(cpdb_results,'data/cpdb/gex_cpdb_test.pkl') +# Line 75: Loads CellphoneDB results from a pickle file. -- cpdb_results=ov.utils.load('data/cpdb/gex_cpdb_test.pkl') +# Line 77: Calculates cell interaction edges for the network visualization. -- interaction=ov.single.cpdb_network_cal(adata = adata, +# Line 78: pvals = cpdb_results['pvalues'], +# Line 79: celltype_key = "cell_labels",) +# Line 81: Displays the head of interaction edges dataframe. -- interaction['interaction_edges'].head() +# Line 83: Sets the plotting style using omicverse. -- ov.plot_set() +# Line 85: Creates a figure and axes for a heatmap plot. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 86: Generates a CellphoneDB heatmap using the interaction edges. -- ov.pl.cpdb_heatmap(adata,interaction['interaction_edges'],celltype_key='cell_labels', +# Line 87: fontsize=11, +# Line 88: ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',}) +# Line 90: Creates a figure and axes for a heatmap plot with specified source cells. -- fig, ax = plt.subplots(figsize=(2,4)) +# Line 91: Generates a CellphoneDB heatmap with source cell subset. 
-- ov.pl.cpdb_heatmap(adata,interaction['interaction_edges'],celltype_key='cell_labels', +# Line 92: source_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'], +# Line 93: ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',}) +# Line 95: Generates a CellphoneDB chord diagram. -- fig=ov.pl.cpdb_chord(adata,interaction['interaction_edges'],celltype_key='cell_labels', +# Line 96: count_min=60,fontsize=12,padding=50,radius=100,save=None,) +# Line 97: Displays the chord diagram figure. -- fig.show() +# Line 99: Creates a figure and axes for a network plot. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 100: Generates a CellphoneDB network graph with cell labels. -- ov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels', +# Line 101: counts_min=60, +# Line 102: nodesize_scale=5, +# Line 103: ax=ax) +# Line 105: Creates a figure and axes for a network plot with specified source cells. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 106: Generates a CellphoneDB network with a source cell subset. -- ov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels', +# Line 107: counts_min=60, +# Line 108: nodesize_scale=5, +# Line 109: source_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'], +# Line 110: ax=ax) +# Line 112: Creates a figure and axes for a network plot with specified target cells. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 113: Generates a CellphoneDB network with a target cell subset. -- ov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels', +# Line 114: counts_min=60, +# Line 115: nodesize_scale=5, +# Line 116: target_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'], +# Line 117: ax=ax) +# Line 119: Generates a CellphoneDB network plot with detailed customizations. 
-- ov.single.cpdb_plot_network(adata=adata, +# Line 120: interaction_edges=interaction['interaction_edges'], +# Line 121: celltype_key='cell_labels', +# Line 122: nodecolor_dict=None,title='EVT Network', +# Line 123: edgeswidth_scale=25,nodesize_scale=10, +# Line 124: pos_scale=1,pos_size=10,figsize=(6,6), +# Line 125: legend_ncol=3,legend_bbox=(0.8,0.2),legend_fontsize=10) +# Line 127: Assigns the interaction edges to a new variable sub_i. -- sub_i=interaction['interaction_edges'] +# Line 128: Filters the interaction edges for source cells. -- sub_i=sub_i.loc[sub_i['SOURCE'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])] +# Line 129: Filters the interaction edges for target cells. -- sub_i=sub_i.loc[sub_i['TARGET'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])] +# Line 131: Creates a subset AnnData object based on specified cell labels. -- sub_adata=adata[adata.obs['cell_labels'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])] +# Line 132: Displays the sub-AnnData object. -- sub_adata +# Line 134: Generates a CellphoneDB network plot for a subset of cells. -- ov.single.cpdb_plot_network(adata=sub_adata, +# Line 135: interaction_edges=sub_i, +# Line 136: celltype_key='cell_labels', +# Line 137: nodecolor_dict=None,title='Sub-EVT Network', +# Line 138: edgeswidth_scale=25,nodesize_scale=1, +# Line 139: pos_scale=1,pos_size=10,figsize=(5,5), +# Line 140: legend_ncol=3,legend_bbox=(0.8,0.2),legend_fontsize=10) +# Line 142: Generates a CellphoneDB chord diagram for the sub-AnnData object. -- fig=ov.pl.cpdb_chord(sub_adata,sub_i,celltype_key='cell_labels', +# Line 143: count_min=10,fontsize=12,padding=60,radius=100,save=None,) +# Line 144: Displays the chord diagram figure. -- fig.show() +# Line 146: Creates a figure and axes for a network plot for the sub-AnnData object. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 147: Generates a CellphoneDB network graph for the sub-AnnData object. 
-- ov.pl.cpdb_network(sub_adata,sub_i,celltype_key='cell_labels', +# Line 148: counts_min=10, +# Line 149: nodesize_scale=5, +# Line 150: ax=ax) +# Line 152: Creates a figure and axes for a heatmap plot for the sub-AnnData object. -- fig, ax = plt.subplots(figsize=(3,3)) +# Line 153: Generates a CellphoneDB heatmap for the sub-AnnData object. -- ov.pl.cpdb_heatmap(sub_adata,sub_i,celltype_key='cell_labels', +# Line 154: ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',}) +# Line 156: Extracts exact target interaction means from the CellphoneDB results. -- sub_means=ov.single.cpdb_exact_target(cpdb_results['means'],['eEVT','iEVT']) +# Line 157: Extracts exact source interaction means from the sub_means dataframe. -- sub_means=ov.single.cpdb_exact_source(sub_means,['dNK1','dNK2','dNK3']) +# Line 158: Displays the head of the sub_means DataFrame. -- sub_means.head() +# Line 160: Generates a CellphoneDB interacting heatmap with specific source and target cells. -- ov.pl.cpdb_interacting_heatmap(adata=adata, +# Line 161: celltype_key='cell_labels', +# Line 162: means=cpdb_results['means'], +# Line 163: pvalues=cpdb_results['pvalues'], +# Line 164: source_cells=['dNK1','dNK2','dNK3'], +# Line 165: target_cells=['eEVT','iEVT'], +# Line 166: plot_secret=True, +# Line 167: min_means=3, +# Line 168: nodecolor_dict=None, +# Line 169: ax=None, +# Line 170: figsize=(2,6), +# Line 171: fontsize=10,) +# Line 173: Generates a CellphoneDB group heatmap with specified source and target cells. 
-- ov.pl.cpdb_group_heatmap(adata=adata, +# Line 174: celltype_key='cell_labels', +# Line 175: means=cpdb_results['means'], +# Line 176: cmap={'Target':'Blues','Source':'Reds'}, +# Line 177: source_cells=['dNK1','dNK2','dNK3'], +# Line 178: target_cells=['eEVT','iEVT'], +# Line 179: plot_secret=True, +# Line 180: min_means=3, +# Line 181: nodecolor_dict=None, +# Line 182: ax=None, +# Line 183: figsize=(2,6), +# Line 184: fontsize=10,) +# Line 186: Generates a CellphoneDB interacting network with specified source and target cells. -- ov.pl.cpdb_interacting_network(adata=adata, +# Line 187: celltype_key='cell_labels', +# Line 188: means=cpdb_results['means'], +# Line 189: source_cells=['dNK1','dNK2','dNK3'], +# Line 190: target_cells=['eEVT','iEVT'], +# Line 191: means_min=1, +# Line 192: means_sum_min=1, +# Line 193: nodecolor_dict=None, +# Line 194: ax=None, +# Line 195: figsize=(6,6), +# Line 196: fontsize=10) +# Line 198: Filters the sub_means DataFrame to remove rows with null 'gene_a'. -- sub_means=sub_means.loc[~sub_means['gene_a'].isnull()] +# Line 199: Filters the sub_means DataFrame to remove rows with null 'gene_b'. -- sub_means=sub_means.loc[~sub_means['gene_b'].isnull()] +# Line 200: Creates a list of genes from 'gene_a' and 'gene_b' columns. -- enrichr_genes=sub_means['gene_a'].tolist()+sub_means['gene_b'].tolist() +# Line 202: Prepares a dictionary of pathway gene sets. -- pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Human') +# Line 205: Performs gene set enrichment analysis. -- enr=ov.bulk.geneset_enrichment(gene_list=enrichr_genes, +# Line 206: pathways_dict=pathway_dict, +# Line 207: pvalue_type='auto', +# Line 208: organism='human') +# Line 210: Sets the plotting style using omicverse. -- ov.plot_set() +# Line 211: Generates a gene set enrichment plot. 
-- ov.bulk.geneset_plot(enr,figsize=(2,4),fig_title='GO-Bio(EVT)', +# Line 212: cax_loc=[2, 0.45, 0.5, 0.02],num=8, +# Line 213: bbox_to_anchor_used=(-0.25, -13),custom_ticks=[10,100], +# Line 214: cmap='Greens') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cluster_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cluster_annotated.py new file mode 100644 index 00000000..c0fd615c --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_cluster_annotated.py @@ -0,0 +1,117 @@ +``` +# Line 1: Import the omicverse library and alias it as ov. -- import omicverse as ov +# Line 2: Import the scanpy library and alias it as sc. -- import scanpy as sc +# Line 3: Import the scvelo library and alias it as scv. -- import scvelo as scv +# Line 4: Set the plotting style using the omicverse library. -- ov.plot_set() +# Line 6: Import the scvelo library and alias it as scv (again, which is redundant). -- import scvelo as scv +# Line 7: Load the dentategyrus dataset from scvelo into an AnnData object called adata. -- adata=scv.datasets.dentategyrus() +# Line 8: Display the adata AnnData object. -- adata +# Line 10: Preprocess the adata AnnData object using a 'shiftlog|pearson' method and selecting the top 3000 highly variable genes using the omicverse library. -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=3000,) +# Line 11: Store the processed adata AnnData object in the .raw attribute. -- adata.raw = adata +# Line 12: Subset the adata AnnData object to only keep the highly variable genes. -- adata = adata[:, adata.var.highly_variable_features] +# Line 13: Scale the adata AnnData object using the omicverse library. -- ov.pp.scale(adata) +# Line 14: Perform Principal Component Analysis (PCA) on the scaled data with 50 principal components using the omicverse library. -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 16: Plot the variance ratio explained by each principal component using the omicverse library. 
-- ov.utils.plot_pca_variance_ratio(adata) +# Line 18: Compute the neighborhood graph with 15 neighbors using the top 50 PCs of the scaled, original or X_pca representations, using the scanpy library. -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 19: Cluster the data using the Leiden algorithm with resolution 1, using the omicverse library. -- use_rep='scaled|original|X_pca') +# Line 20: Generate a UMAP embedding colored by the 'clusters' and 'leiden' columns, using the omicverse library. -- ov.utils.cluster(adata,method='leiden',resolution=1) +# Line 21: Compute the neighborhood graph with 15 neighbors using the top 50 PCs of the scaled, original or X_pca representations, using the scanpy library (repeated). -- ov.utils.embedding(adata,basis='X_umap', +# Line 22: Cluster the data using the Louvain algorithm with resolution 1, using the omicverse library. -- color=['clusters','leiden'], +# Line 23: Generate a UMAP embedding colored by the 'clusters' and 'louvain' columns, using the omicverse library. -- frameon='small',wspace=0.5) +# Line 25: Compute the neighborhood graph with 15 neighbors using the top 50 PCs of the scaled, original or X_pca representations, using the scanpy library (repeated). -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 26: Cluster the data using the Louvain algorithm with resolution 1, using the omicverse library (repeated). -- use_rep='scaled|original|X_pca') +# Line 27: Generate a UMAP embedding colored by the 'clusters' and 'louvain' columns, using the omicverse library (repeated). -- ov.utils.cluster(adata,method='louvain',resolution=1) +# Line 29: Cluster the data using Gaussian Mixture Model (GMM) with 21 components on the scaled, original or X_pca representations, using the omicverse library. -- ov.utils.embedding(adata,basis='X_umap', +# Line 30: Generate a UMAP embedding colored by the 'clusters' and 'gmm_cluster' columns, using the omicverse library. 
-- color=['clusters','louvain'], +# Line 31: Perform Latent Dirichlet Allocation (LDA) topic modeling on the gene expression, using the omicverse library. -- frameon='small',wspace=0.5) +# Line 33: Generate a plot of the topic contributions from the LDA model using the omicverse library. -- ov.utils.cluster(adata,use_rep='scaled|original|X_pca', +# Line 35: Generate a plot of the topic contributions from the LDA model using the omicverse library (for topic 13). -- method='GMM',n_components=21, +# Line 37: Set the plot style using the omicverse library again (redundant). -- covariance_type='full',tol=1e-9, max_iter=1000, ) +# Line 38: Generate a UMAP embedding colored by the LDA topic columns, using the omicverse library. -- ov.utils.embedding(adata,basis='X_umap', +# Line 41: Generate a UMAP embedding colored by the 'clusters' and 'LDA_cluster' columns, using the omicverse library. -- color=['clusters','gmm_cluster'], +# Line 43: Run a Random Forest Classifier (RFC) based on LDA topic results using the omicverse library. -- frameon='small',wspace=0.5) +# Line 45: Generate a UMAP embedding colored by the Random Forest classification and cluster assignments from LDA results, using the omicverse library. -- LDA_obj=ov.utils.LDA_topic(adata,feature_type='expression', +# Line 47: Convert the sparse matrix of the AnnData object to a dense array using numpy -- highly_variable_key='highly_variable_features', +# Line 49: Import the numpy library. -- layers='counts',batch_key=None,learning_rate=1e-3) +# Line 50: Initialize a cNMF object to perform consensus non-negative matrix factorization. -- LDA_obj.plot_topic_contributions(6) +# Line 51: Launch a cNMF worker with id 0, among 4 total workers, using the omicverse library. -- LDA_obj.predicted(13) +# Line 52: Combine the results of all the cNMF workers skipping any missing file using the omicverse library. 
-- ov.plot_set() +# Line 53: Generate a plot of the cNMF K-selection, closing the figure after generation using the omicverse library. -- ov.utils.embedding(adata, basis='X_umap',color = LDA_obj.model.topic_cols, cmap='BuPu', ncols=4, +# Line 55: Set the number of selected components to 7. -- add_outline=True, frameon='small',) +# Line 56: Set the density threshold for the consensus cNMF step. -- ov.utils.embedding(adata,basis='X_umap', +# Line 57: Perform the consensus step for cNMF, and show clustering on the generated heatmap, using the omicverse library. -- color=['clusters','LDA_cluster'], +# Line 58: Load the cNMF results for the specified k and density threshold using the omicverse library. -- frameon='small',wspace=0.5) +# Line 59: Add the cNMF results into the adata AnnData object using the omicverse library. -- LDA_obj.get_results_rfc(adata,use_rep='scaled|original|X_pca', +# Line 61: Generate a UMAP embedding colored by the cNMF normalized usage matrix, using the omicverse library. -- LDA_threshold=0.4,num_topics=13) +# Line 63: Run a Random Forest Classifier (RFC) based on cNMF results using the omicverse library. -- ov.utils.embedding(adata,basis='X_umap', +# Line 65: Generate a UMAP embedding colored by the Random Forest classification and cluster assignments from cNMF results, using the omicverse library. -- color=['LDA_cluster_rfc','LDA_cluster_clf'], +# Line 66: Convert the sparse matrix of the AnnData object to a dense array using numpy (redundant since done in line 47). -- frameon='small',wspace=0.5) +# Line 68: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "leiden" labels, using sklearn. -- adata.X.toarray() +# Line 69: Print the Adjusted Rand Index (ARI) for "leiden". -- import numpy as np +# Line 71: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "louvain" labels, using sklearn. 
-- ## Initialize the cnmf object that will be used to run analyses +# Line 72: Print the Adjusted Rand Index (ARI) for "louvain". -- cnmf_obj = ov.single.cNMF(adata,components=np.arange(5,11), n_iter=20, seed=14, num_highvar_genes=2000, +# Line 74: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "gmm_cluster" labels, using sklearn. -- output_dir='example_dg1/cNMF', name='dg_cNMF') +# Line 75: Print the Adjusted Rand Index (ARI) for "GMM". -- ## Specify that the jobs are being distributed over a single worker (total_workers=1) and then launch that worker +# Line 77: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "LDA_cluster" labels, using sklearn. -- cnmf_obj.factorize(worker_i=0, total_workers=4) +# Line 78: Print the Adjusted Rand Index (ARI) for "LDA". -- cnmf_obj.combine(skip_missing_files=True) +# Line 80: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "LDA_cluster_rfc" labels, using sklearn. -- cnmf_obj.k_selection_plot(close_fig=False) +# Line 81: Print the Adjusted Rand Index (ARI) for "LDA_rfc". -- +# Line 83: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "LDA_cluster_clf" labels, using sklearn. -- selected_K = 7 +# Line 84: Print the Adjusted Rand Index (ARI) for "LDA_clf". -- density_threshold = 2.00 +# Line 86: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "cNMF_cluster_rfc" labels, using sklearn. -- cnmf_obj.consensus(k=selected_K, +# Line 87: Print the Adjusted Rand Index (ARI) for "cNMF_rfc". -- density_threshold=density_threshold, +# Line 89: Calculate the Adjusted Rand Index (ARI) comparing the "clusters" to the "cNMF_cluster_clf" labels, using sklearn. -- show_clustering=True, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- close_clustergram_fig=False) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". 
-- result_dict = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- cnmf_obj.get_results(adata,result_dict) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ov.pl.embedding(adata, basis='X_umap',color=result_dict['usage_norm'].columns, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- use_raw=False, ncols=3, vmin=0, vmax=1,frameon='small') +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- cnmf_obj.get_results_rfc(adata,result_dict, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- use_rep='scaled|original|X_pca', +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- cNMF_threshold=0.5) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ov.pl.embedding( +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- adata, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- basis="X_umap", +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- color=['cNMF_cluster_rfc','cNMF_cluster_clf'], +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- frameon='small', +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- #title="Celltypes", +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- #legend_loc='on data', +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- legend_fontsize=14, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- legend_fontoutline=2, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- #size=10, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- #legend_loc=True, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". 
-- add_outline=False, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- #add_outline=True, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- outline_color='black', +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- outline_width=1, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- show=False, +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- from sklearn.metrics.cluster import adjusted_rand_score +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['leiden']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('Leiden, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['louvain']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('Louvain, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['gmm_cluster']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('GMM, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('LDA, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". 
-- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster_rfc']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('LDA_rfc, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster_clf']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('LDA_clf, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['cNMF_cluster_rfc']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('cNMF_rfc, Adjusted rand index = %.2f' %ARI) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- ARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['cNMF_cluster_clf']) +# Line 90: Print the Adjusted Rand Index (ARI) for "cNMF_clf". -- print('cNMF_clf, Adjusted rand index = %.2f' %ARI) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py new file mode 100644 index 00000000..b3c4e884 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py @@ -0,0 +1,83 @@ +```python +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 3: Import the scanpy library as sc. -- import scanpy as sc +# Line 5: Set plotting parameters using omicverse. -- ov.plot_set() +# Line 7: Read Visium spatial data using scanpy. -- adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5') +# Line 8: Make variable names unique in the AnnData object. 
-- adata.var_names_make_unique() +# Line 10: Calculate quality control metrics using scanpy. -- sc.pp.calculate_qc_metrics(adata, inplace=True) +# Line 11: Filter out genes with total counts less than or equal to 100. -- adata = adata[:,adata.var['total_counts']>100] +# Line 12: Perform spatial variable gene selection using omicverse. -- adata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform="visium",) +# Line 13: Display the AnnData object. -- adata +# Line 15: Write the AnnData object to a file with gzip compression. -- adata.write('data/cluster_svg.h5ad',compression='gzip') +# Line 17: Read the AnnData object from a file with gzip compression. -- adata=ov.read('data/cluster_svg.h5ad',compression='gzip') +# Line 20: Import the pandas library as pd. -- import pandas as pd +# Line 21: Import the os library. -- import os +# Line 22: Read the ground truth annotations from a tab-separated file into a pandas DataFrame. -- Ann_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0) +# Line 23: Assign the column name 'Ground Truth' to the DataFrame. -- Ann_df.columns = ['Ground Truth'] +# Line 24: Add the ground truth annotation as an observation in the AnnData object. -- adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth'] +# Line 25: Create a spatial plot with annotations using scanpy. -- sc.pl.spatial(adata, img_key="hires", color=["Ground Truth"]) +# Line 27: Initialize a dictionary to store keyword arguments for methods. -- methods_kwargs={} +# Line 28: Set parameters for the GraphST method in the methods_kwargs dictionary. -- methods_kwargs['GraphST']={ +# Line 31: Perform clustering using the GraphST method using omicverse. -- adata=ov.space.clusters(adata, +# Line 35: Cluster data using mclust on the GraphST representation with omicverse. 
-- ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust',n_components=10, +# Line 38: Refine cluster labels using omicverse based on the mclust results. -- adata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust') +# Line 39: Convert the mclust_GraphST column to categorical data type. -- adata.obs['mclust_GraphST']=adata.obs['mclust_GraphST'].astype('category') +# Line 41: Merge clusters based on the mclust_GraphST labels using omicverse. -- res=ov.space.merge_cluster(adata,groupby='mclust_GraphST',use_rep='graphst|original|X_pca', +# Line 44: Create a spatial plot showing several cluster annotations using scanpy. -- sc.pl.spatial(adata, color=['mclust_GraphST','mclust_GraphST_tree','mclust','Ground Truth']) +# Line 46: Cluster data using mclust_R on the GraphST representation using omicverse. -- ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust_R',n_components=10, +# Line 49: Refine cluster labels using omicverse based on the mclust_R results. -- adata.obs['mclust_R_GraphST'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') +# Line 50: Convert the mclust_R_GraphST column to categorical data type. -- adata.obs['mclust_R_GraphST']=adata.obs['mclust_R_GraphST'].astype('category') +# Line 51: Merge clusters based on the mclust_R_GraphST labels using omicverse. -- res=ov.space.merge_cluster(adata,groupby='mclust_R_GraphST',use_rep='graphst|original|X_pca', +# Line 54: Create a spatial plot showing several cluster annotations using scanpy. -- sc.pl.spatial(adata, color=['mclust_R_GraphST','mclust_R_GraphST_tree','mclust','Ground Truth']) +# Line 56: Re-initialize the methods_kwargs dictionary. -- methods_kwargs={} +# Line 57: Set parameters for the BINARY method in the methods_kwargs dictionary. -- methods_kwargs['BINARY']={ +# Line 73: Perform clustering using the BINARY method using omicverse. 
-- adata=ov.space.clusters(adata, +# Line 77: Cluster data using mclust_R on the BINARY representation using omicverse. -- ov.utils.cluster(adata,use_rep='BINARY',method='mclust_R',n_components=10, +# Line 80: Refine cluster labels using omicverse based on the mclust_R results. -- adata.obs['mclust_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') +# Line 81: Convert the mclust_BINARY column to categorical data type. -- adata.obs['mclust_BINARY']=adata.obs['mclust_BINARY'].astype('category') +# Line 83: Merge clusters based on the mclust_BINARY labels using omicverse. -- res=ov.space.merge_cluster(adata,groupby='mclust_BINARY',use_rep='BINARY', +# Line 86: Create a spatial plot showing several cluster annotations using scanpy. -- sc.pl.spatial(adata, color=['mclust_BINARY','mclust_BINARY_tree','mclust','Ground Truth']) +# Line 88: Cluster data using mclust on the BINARY representation using omicverse. -- ov.utils.cluster(adata,use_rep='BINARY',method='mclust',n_components=10, +# Line 91: Refine cluster labels using omicverse based on the mclust results. -- adata.obs['mclustpy_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust') +# Line 92: Convert the mclustpy_BINARY column to categorical data type. -- adata.obs['mclustpy_BINARY']=adata.obs['mclustpy_BINARY'].astype('category') +# Line 94: Convert the mclustpy_BINARY column to categorical data type. -- adata.obs['mclustpy_BINARY']=adata.obs['mclustpy_BINARY'].astype('category') +# Line 95: Merge clusters based on the mclustpy_BINARY labels using omicverse. -- res=ov.space.merge_cluster(adata,groupby='mclustpy_BINARY',use_rep='BINARY', +# Line 98: Create a spatial plot showing several cluster annotations using scanpy. -- sc.pl.spatial(adata, color=['mclustpy_BINARY','mclustpy_BINARY_tree','mclust','Ground Truth']) +# Line 102: Re-initialize the methods_kwargs dictionary. -- methods_kwargs={} +# Line 103: Set parameters for the STAGATE method in the methods_kwargs dictionary. 
-- methods_kwargs['STAGATE']={ +# Line 110: Perform clustering using the STAGATE method using omicverse. -- adata=ov.space.clusters(adata, +# Line 114: Cluster data using mclust_R on the STAGATE representation using omicverse. -- ov.utils.cluster(adata,use_rep='STAGATE',method='mclust_R',n_components=10, +# Line 117: Refine cluster labels using omicverse based on the mclust_R results. -- adata.obs['mclust_R_STAGATE'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') +# Line 118: Convert the mclust_R_STAGATE column to categorical data type. -- adata.obs['mclust_R_STAGATE']=adata.obs['mclust_R_STAGATE'].astype('category') +# Line 119: Merge clusters based on the mclust_R_STAGATE labels using omicverse. -- res=ov.space.merge_cluster(adata,groupby='mclust_R_STAGATE',use_rep='STAGATE', +# Line 122: Create a spatial plot showing several cluster annotations using scanpy. -- sc.pl.spatial(adata, color=['mclust_R_STAGATE','mclust_R_STAGATE_tree','mclust_R','Ground Truth']) +# Line 124: Display the top 5 genes with highest PI values. -- adata.var.sort_values('PI',ascending=False).head(5) +# Line 126: Set the name of the gene to plot. -- plot_gene = 'MBP' +# Line 127: Import the matplotlib library as plt. -- import matplotlib.pyplot as plt +# Line 128: Create a figure and a set of subplots for spatial plotting. -- fig, axs = plt.subplots(1, 2, figsize=(8, 4)) +# Line 129: Create a spatial plot showing raw expression of the specified gene using scanpy. -- sc.pl.spatial(adata, img_key="hires", color=plot_gene, show=False, ax=axs[0], title='RAW_'+plot_gene, vmax='p99') +# Line 130: Create a spatial plot showing STAGATE-transformed expression of the specified gene using scanpy. -- sc.pl.spatial(adata, img_key="hires", color=plot_gene, show=False, ax=axs[1], title='STAGATE_'+plot_gene, layer='STAGATE_ReX', vmax='p99') +# Line 133: Re-initialize the methods_kwargs dictionary. 
-- methods_kwargs={} +# Line 134: Set parameters for the CAST method in the methods_kwargs dictionary. -- methods_kwargs['CAST']={ +# Line 139: Perform clustering using the CAST method using omicverse. -- adata=ov.space.clusters(adata, +# Line 142: Cluster data using mclust on the CAST representation using omicverse. -- ov.utils.cluster(adata,use_rep='X_cast',method='mclust',n_components=10, +# Line 145: Refine cluster labels using omicverse based on the mclust results. -- adata.obs['mclust_CAST'] = ov.utils.refine_label(adata, radius=50, key='mclust') +# Line 146: Convert the mclust_CAST column to categorical data type. -- adata.obs['mclust_CAST']=adata.obs['mclust_CAST'].astype('category') +# Line 148: Merge clusters based on the mclust_CAST labels using omicverse. -- res=ov.space.merge_cluster(adata,groupby='mclust_CAST',use_rep='X_cast', +# Line 151: Create a spatial plot showing several cluster annotations using scanpy. -- sc.pl.spatial(adata, color=['mclust_CAST','mclust_CAST_tree','mclust','Ground Truth']) +# Line 153: Display the AnnData object. -- adata +# Line 155: Import the adjusted rand score from sklearn. -- from sklearn.metrics.cluster import adjusted_rand_score +# Line 157: Create a subset of adata's obs dataframe that does not contain any NA values. -- obs_df = adata.obs.dropna() +# Line 159: Calculate the adjusted rand index for mclust_GraphST compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclust_GraphST'], obs_df['Ground Truth']) +# Line 160: Print the ARI for mclust_GraphST. -- print('mclust_GraphST: Adjusted rand index = %.2f' %ARI) +# Line 162: Calculate the adjusted rand index for mclust_R_GraphST compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclust_R_GraphST'], obs_df['Ground Truth']) +# Line 163: Print the ARI for mclust_R_GraphST. 
-- print('mclust_R_GraphST: Adjusted rand index = %.2f' %ARI) +# Line 165: Calculate the adjusted rand index for mclust_R_STAGATE compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclust_R_STAGATE'], obs_df['Ground Truth']) +# Line 166: Print the ARI for mclust_STAGATE. -- print('mclust_STAGATE: Adjusted rand index = %.2f' %ARI) +# Line 168: Calculate the adjusted rand index for mclust_BINARY compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclust_BINARY'], obs_df['Ground Truth']) +# Line 169: Print the ARI for mclust_BINARY. -- print('mclust_BINARY: Adjusted rand index = %.2f' %ARI) +# Line 171: Calculate the adjusted rand index for mclustpy_BINARY compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclustpy_BINARY'], obs_df['Ground Truth']) +# Line 172: Print the ARI for mclustpy_BINARY. -- print('mclustpy_BINARY: Adjusted rand index = %.2f' %ARI) +# Line 174: Calculate the adjusted rand index for mclust_CAST compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclust_CAST'], obs_df['Ground Truth']) +# Line 175: Print the ARI for mclust_CAST. 
-- print('mclust_CAST: Adjusted rand index = %.2f' %ARI) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py new file mode 100644 index 00000000..d9c2daef --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py @@ -0,0 +1,187 @@ +```python +# Line 1: Import the scanpy library for single-cell analysis -- import scanpy as sc +# Line 2: Import the omicverse library, likely for multi-omics analysis and visualization -- import omicverse as ov +# Line 3: Set plotting style using omicverse's plot_set function -- ov.plot_set() +# Line 4: -- +# Line 5: Import the scvelo library for RNA velocity analysis -- import scvelo as scv +# Line 6: Load the dentategyrus dataset from scvelo -- adata=scv.datasets.dentategyrus() +# Line 7: -- +# Line 8: Time the preprocessing step using ipython's magic command -- %%time +# Line 9: Preprocess the adata object using omicverse with shiftlog normalization and pearson scaling, keeping 2000 highly variable genes -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 10: Display the preprocessed adata object -- adata +# Line 11: -- +# Line 12: Scale the data using omicverse -- ov.pp.scale(adata) +# Line 13: Perform principal component analysis using omicverse -- ov.pp.pca(adata) +# Line 14: -- +# Line 15: Import the matplotlib.pyplot library for plotting -- import matplotlib.pyplot as plt +# Line 16: Import patheffects from matplotlib for adding outline to objects -- from matplotlib import patheffects +# Line 17: Create a matplotlib figure and axes object with a specified size -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 18: Generate an embedding plot using omicverse, with specified color, frame, title, legend, and other aesthetic settings -- ov.pl.embedding( +# Line 19: -- adata, +# Line 20: -- basis="X_umap", +# Line 21: -- color=['clusters'], +# Line 22: -- frameon='small', +# Line 
23: -- title="Celltypes", +# Line 24: -- #legend_loc='on data', +# Line 25: -- legend_fontsize=14, +# Line 26: -- legend_fontoutline=2, +# Line 27: -- #size=10, +# Line 28: -- ax=ax, +# Line 29: -- #legend_loc=True, +# Line 30: -- add_outline=False, +# Line 31: -- #add_outline=True, +# Line 32: -- outline_color='black', +# Line 33: -- outline_width=1, +# Line 34: -- show=False, +# Line 35: -- ) +# Line 36: -- +# Line 37: Import the numpy library for numerical operations -- import numpy as np +# Line 38: Initialize a cNMF object with specified components, iterations, seed, high variance genes, output directory and name -- cnmf_obj = ov.single.cNMF(adata,components=np.arange(5,11), n_iter=20, seed=14, num_highvar_genes=2000, +# Line 39: -- output_dir='example_dg/cNMF', name='dg_cNMF') +# Line 40: -- +# Line 41: Run the factorization for the cNMF object with a specified worker and number of workers -- cnmf_obj.factorize(worker_i=0, total_workers=2) +# Line 42: -- +# Line 43: Combine the results of the cNMF factorization, skipping missing files -- cnmf_obj.combine(skip_missing_files=True) +# Line 44: -- +# Line 45: Create a plot for k selection of the cNMF object -- cnmf_obj.k_selection_plot(close_fig=False) +# Line 46: -- +# Line 47: Set the selected K value for consensus clustering -- selected_K = 7 +# Line 48: Set the density threshold for filtering -- density_threshold = 2.00 +# Line 49: -- +# Line 50: Run consensus clustering with the specified k and density threshold, showing the clustering results -- cnmf_obj.consensus(k=selected_K, +# Line 51: -- density_threshold=density_threshold, +# Line 52: -- show_clustering=True, +# Line 53: -- close_clustergram_fig=False) +# Line 54: -- +# Line 55: Set the density threshold for filtering -- density_threshold = 0.10 +# Line 56: -- +# Line 57: Run consensus clustering with the specified k and density threshold, showing the clustering results -- cnmf_obj.consensus(k=selected_K, +# Line 58: -- 
density_threshold=density_threshold, +# Line 59: -- show_clustering=True, +# Line 60: -- close_clustergram_fig=False) +# Line 61: -- +# Line 62: Import the seaborn library for statistical data visualization -- import seaborn as sns +# Line 63: Import the matplotlib.pyplot library for plotting -- import matplotlib.pyplot as plt +# Line 64: Import patheffects from matplotlib for adding outline to objects -- from matplotlib import patheffects +# Line 65: -- +# Line 66: Import gridspec for creating more complex figure layouts -- from matplotlib import gridspec +# Line 67: Import matplotlib.pyplot library for plotting -- import matplotlib.pyplot as plt +# Line 68: -- +# Line 69: Define the width ratios for the subplots -- width_ratios = [0.2, 4, 0.5, 10, 1] +# Line 70: Define the height ratios for the subplots -- height_ratios = [0.2, 4] +# Line 71: Create a matplotlib figure object with specified size based on ratios -- fig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios))) +# Line 72: Create a gridspec object for defining the structure of the subplots -- gs = gridspec.GridSpec(len(height_ratios), len(width_ratios), fig, +# Line 73: -- 0.01, 0.01, 0.98, 0.98, +# Line 74: -- height_ratios=height_ratios, +# Line 75: -- width_ratios=width_ratios, +# Line 76: -- wspace=0, hspace=0) +# Line 77: -- +# Line 78: Extract topic distances from the cNMF object -- D = cnmf_obj.topic_dist[cnmf_obj.spectra_order, :][:, cnmf_obj.spectra_order] +# Line 79: Add a subplot for the distance matrix with no x or y ticks, labels or frame -- dist_ax = fig.add_subplot(gs[1,1], xscale='linear', yscale='linear', +# Line 80: -- xticks=[], yticks=[],xlabel='', ylabel='', +# Line 81: -- frameon=True) +# Line 82: Display the topic distance matrix using imshow -- dist_im = dist_ax.imshow(D, interpolation='none', cmap='viridis', +# Line 83: -- aspect='auto', rasterized=True) +# Line 84: -- +# Line 85: Add a subplot for the left cluster labels with no x or y ticks, labels or frame -- left_ax 
= fig.add_subplot(gs[1,0], xscale='linear', yscale='linear', xticks=[], yticks=[], +# Line 86: -- xlabel='', ylabel='', frameon=True) +# Line 87: Display the cluster labels on the left using imshow -- left_ax.imshow(cnmf_obj.kmeans_cluster_labels.values[cnmf_obj.spectra_order].reshape(-1, 1), +# Line 88: -- interpolation='none', cmap='Spectral', aspect='auto', +# Line 89: -- rasterized=True) +# Line 90: -- +# Line 91: Add a subplot for the top cluster labels with no x or y ticks, labels or frame -- top_ax = fig.add_subplot(gs[0,1], xscale='linear', yscale='linear', xticks=[], yticks=[], +# Line 92: -- xlabel='', ylabel='', frameon=True) +# Line 93: Display the cluster labels on the top using imshow -- top_ax.imshow(cnmf_obj.kmeans_cluster_labels.values[cnmf_obj.spectra_order].reshape(1, -1), +# Line 94: -- interpolation='none', cmap='Spectral', aspect='auto', +# Line 95: -- rasterized=True) +# Line 96: -- +# Line 97: Create a nested gridspec for the colorbar with no spacing -- cbar_gs = gridspec.GridSpecFromSubplotSpec(3, 3, subplot_spec=gs[1, 2], +# Line 98: -- wspace=0, hspace=0) +# Line 99: Add a subplot for the colorbar with no x or y labels, with a title -- cbar_ax = fig.add_subplot(cbar_gs[1,2], xscale='linear', yscale='linear', +# Line 100: -- xlabel='', ylabel='', frameon=True, title='Euclidean\nDistance') +# Line 101: Set the title of the colorbar subplot -- cbar_ax.set_title('Euclidean\nDistance',fontsize=12) +# Line 102: Find the minimum value in the distance matrix for color scaling -- vmin = D.min().min() +# Line 103: Find the maximum value in the distance matrix for color scaling -- vmax = D.max().max() +# Line 104: Add a colorbar to the figure based on the dist_im object, with specified ticks and formatting -- fig.colorbar(dist_im, cax=cbar_ax, +# Line 105: -- ticks=np.linspace(vmin, vmax, 3), +# Line 106: -- ) +# Line 107: Set the font size of the y-axis tick labels -- cbar_ax.set_yticklabels(cbar_ax.get_yticklabels(),fontsize=12) +# Line 108: -- +# 
Line 109: Filter local density values based on the specified threshold -- density_filter = cnmf_obj.local_density.iloc[:, 0] < density_threshold +# Line 110: Create a matplotlib figure and axes for histogram -- fig, hist_ax = plt.subplots(figsize=(4,4)) +# Line 111: -- +# Line 112: Create a histogram of the local density values, with specified bins -- hist_ax.hist(cnmf_obj.local_density.values, bins=np.linspace(0, 1, 50)) +# Line 113: Put y-axis ticks on the right side of the plot -- hist_ax.yaxis.tick_right() +# Line 114: -- +# Line 115: Get the current x limits of the hist_ax plot -- xlim = hist_ax.get_xlim() +# Line 116: Get the current y limits of the hist_ax plot -- ylim = hist_ax.get_ylim() +# Line 117: If the density threshold is within the x-axis limits, add a vertical line and a text indicating the threshold -- if density_threshold < xlim[1]: +# Line 118: -- hist_ax.axvline(density_threshold, linestyle='--', color='k') +# Line 119: -- hist_ax.text(density_threshold + 0.02, ylim[1] * 0.95, 'filtering\nthreshold\n\n', va='top') +# Line 120: Set the x-axis limits of the hist_ax plot -- hist_ax.set_xlim(xlim) +# Line 121: Set the x-axis label with information about filtering based on density threshold -- hist_ax.set_xlabel('Mean distance to k nearest neighbors\n\n%d/%d (%.0f%%) spectra above threshold\nwere removed prior to clustering'%(sum(~density_filter), len(density_filter), 100*(~density_filter).mean())) +# Line 122: Set the title of the histogram -- hist_ax.set_title('Local density histogram') +# Line 123: -- +# Line 124: Load the cNMF results with the specified K and density threshold -- result_dict = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold) +# Line 125: -- +# Line 126: -- +# Line 127: Display the head of the 'usage_norm' dataframe from the results -- result_dict['usage_norm'].head() +# Line 128: -- +# Line 129: Display the head of the 'gep_scores' dataframe from the results -- result_dict['gep_scores'].head() +# Line 
130: -- +# Line 131: Display the head of the 'gep_tpm' dataframe from the results -- result_dict['gep_tpm'].head() +# Line 132: -- +# Line 133: Display the head of the 'top_genes' dataframe from the results -- result_dict['top_genes'].head() +# Line 134: -- +# Line 135: Add the results of cNMF to the anndata object -- cnmf_obj.get_results(adata,result_dict) +# Line 136: -- +# Line 137: Generate an embedding plot of the usage_norm values from cNMF using omicverse -- ov.pl.embedding(adata, basis='X_umap',color=result_dict['usage_norm'].columns, +# Line 138: -- use_raw=False, ncols=3, vmin=0, vmax=1,frameon='small') +# Line 139: -- +# Line 140: Generate an embedding plot using omicverse with specified color, frame, title, legend, and other aesthetic settings -- ov.pl.embedding( +# Line 141: -- adata, +# Line 142: -- basis="X_umap", +# Line 143: -- color=['cNMF_cluster'], +# Line 144: -- frameon='small', +# Line 145: -- #title="Celltypes", +# Line 146: -- #legend_loc='on data', +# Line 147: -- legend_fontsize=14, +# Line 148: -- legend_fontoutline=2, +# Line 149: -- #size=10, +# Line 150: -- #legend_loc=True, +# Line 151: -- add_outline=False, +# Line 152: -- #add_outline=True, +# Line 153: -- outline_color='black', +# Line 154: -- outline_width=1, +# Line 155: -- show=False, +# Line 156: -- ) +# Line 157: -- +# Line 158: Add the random forest classifier results to the anndata object -- cnmf_obj.get_results_rfc(adata,result_dict, +# Line 159: -- use_rep='scaled|original|X_pca', +# Line 160: -- cNMF_threshold=0.5) +# Line 161: -- +# Line 162: Generate an embedding plot using omicverse with specified color, frame, title, legend, and other aesthetic settings -- ov.pl.embedding( +# Line 163: -- adata, +# Line 164: -- basis="X_umap", +# Line 165: -- color=['cNMF_cluster_rfc','cNMF_cluster_clf'], +# Line 166: -- frameon='small', +# Line 167: -- #title="Celltypes", +# Line 168: -- #legend_loc='on data', +# Line 169: -- legend_fontsize=14, +# Line 170: -- legend_fontoutline=2, 
+# Line 171: -- #size=10, +# Line 172: -- #legend_loc=True, +# Line 173: -- add_outline=False, +# Line 174: -- #add_outline=True, +# Line 175: -- outline_color='black', +# Line 176: -- outline_width=1, +# Line 177: -- show=False, +# Line 178: -- ) +# Line 179: -- +# Line 180: Initialize empty list to hold top genes -- plot_genes=[] +# Line 181: Loop through columns of top genes dataframe and add the top 3 genes from each column to plot_genes list -- for i in result_dict['top_genes'].columns: +# Line 182: -- plot_genes+=result_dict['top_genes'][i][:3].values.reshape(-1).tolist() +# Line 183: -- +# Line 184: Create a dotplot of top genes for each cNMF cluster using scanpy -- sc.pl.dotplot(adata,plot_genes, +# Line 185: -- "cNMF_cluster", dendrogram=False,standard_scale='var',) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_commot_flowsig_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_commot_flowsig_annotated.py new file mode 100644 index 00000000..f1238d9a --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_commot_flowsig_annotated.py @@ -0,0 +1,62 @@ +``` +# Line 1: Import the omicverse library as ov -- import omicverse as ov +# Line 3: Import the scanpy library as sc -- import scanpy as sc +# Line 5: Set the plotting style using ov.plot_set() -- ov.plot_set() +# Line 7: Read Visium spatial data into an AnnData object named adata -- adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5') +# Line 8: Make variable names unique in the adata object -- adata.var_names_make_unique() +# Line 10: Calculate quality control metrics for the adata object in place -- sc.pp.calculate_qc_metrics(adata, inplace=True) +# Line 11: Filter the adata object, keeping only variables with total counts greater than 100 -- adata = adata[:,adata.var['total_counts']>100] +# Line 12: Perform spatial variable gene selection using ov.space.svg with specified parameters, saving to adata -- 
adata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform="visium",) +# Line 13: Display the adata object -- adata +# Line 15: Write the adata object to a compressed h5ad file -- adata.write('data/cluster_svg.h5ad',compression='gzip') +# Line 19: Load a ligand-receptor database from CellChat using ov.externel.commot.pp.ligand_receptor_database with specific parameters -- df_cellchat = ov.externel.commot.pp.ligand_receptor_database(species='human', +# Line 21: Print the shape of the df_cellchat dataframe -- print(df_cellchat.shape) +# Line 23: Filter the ligand-receptor database based on gene presence in adata using ov.externel.commot.pp.filter_lr_database -- df_cellchat_filtered = ov.externel.commot.pp.filter_lr_database(df_cellchat, +# Line 26: Print the shape of the filtered dataframe df_cellchat_filtered -- print(df_cellchat_filtered.shape) +# Line 28: Perform spatial communication analysis using ov.externel.commot.tl.spatial_communication with specified parameters -- ov.externel.commot.tl.spatial_communication(adata, +# Line 36: Import the pandas library as pd -- import pandas as pd +# Line 37: Import the os library -- import os +# Line 38: Read the annotation file into a pandas DataFrame, setting the index and column names -- Ann_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0) +# Line 39: Assign a column name to the annotation DataFrame -- Ann_df.columns = ['Ground_Truth'] +# Line 40: Add the 'Ground_Truth' annotation data to the adata.obs DataFrame -- adata.obs['Ground_Truth'] = Ann_df.loc[adata.obs_names, 'Ground_Truth'] +# Line 41: Define a list of colors for plotting -- Layer_color=['#283b5c', '#d8e17b', '#838e44', '#4e8991', '#d08c35', '#511a3a', +# Line 43: Plot the spatial data with annotations using sc.pl.spatial with specified parameters -- sc.pl.spatial(adata, img_key="hires", color=["Ground_Truth"],palette=Layer_color) +# Line 45: Create a dictionary mapping ground truth categories to 
colors from the adata object -- ct_color_dict=dict(zip(adata.obs['Ground_Truth'].cat.categories, +# Line 47: Display the first few rows of the ligand-receptor information in adata -- adata.uns['commot-cellchat-info']['df_ligrec'].head() +# Line 49: Import the matplotlib plotting library as plt -- import matplotlib.pyplot as plt +# Line 50: Set a scaling factor for plotting -- scale=0.000008 +# Line 51: Set a neighborhood size parameter for spatial communication analysis -- k=5 +# Line 52: Set the target pathway for spatial communication analysis -- goal_pathway='FGF' +# Line 53: Perform communication direction analysis using ov.externel.commot.tl.communication_direction for the specified pathway -- ov.externel.commot.tl.communication_direction(adata, database_name='cellchat', pathway_name=goal_pathway, k=k) +# Line 54: Plot cell communication patterns using ov.externel.commot.pl.plot_cell_communication with specific parameters for the FGF pathway -- ov.externel.commot.pl.plot_cell_communication(adata, database_name='cellchat', +# Line 63: Set the title of the plot using the pathway name -- plt.title(f'Pathway:{goal_pathway}',fontsize=13) +# Line 67: Write the adata object to a compressed h5ad file with a new name -- adata.write('data/151676_commot.h5ad',compression='gzip') +# Line 69: Read a compressed h5ad file into adata -- adata=ov.read('data/151676_commot.h5ad') +# Line 70: Display the adata object -- adata +# Line 72: Create a new layer in adata called 'normalized' by copying the contents of adata.X -- adata.layers['normalized'] = adata.X.copy() +# Line 74: Construct gene expression modules using non-negative matrix factorization via ov.externel.flowsig.pp.construct_gems_using_nmf -- ov.externel.flowsig.pp.construct_gems_using_nmf(adata, +# Line 80: Set the target gene expression module for further analysis -- goal_gem='GEM-5' +# Line 81: Get the top genes for the selected GEM module using ov.externel.flowsig.ul.get_top_gem_genes -- 
gem_gene=ov.externel.flowsig.ul.get_top_gem_genes(adata=adata, +# Line 88: Display the top genes for the selected GEM module -- gem_gene.head() +# Line 90: Define the commot output key as 'commot-cellchat' -- commot_output_key = 'commot-cellchat' +# Line 91: Construct cellular flows from the commot output using ov.externel.flowsig.pp.construct_flows_from_commot -- ov.externel.flowsig.pp.construct_flows_from_commot(adata, +# Line 99: Determine informative variables in the flow data using ov.externel.flowsig.pp.determine_informative_variables with spatial information -- ov.externel.flowsig.pp.determine_informative_variables(adata, +# Line 109: Import the KMeans class from scikit-learn and the pandas library -- from sklearn.cluster import KMeans +# Line 111: Perform KMeans clustering on spatial coordinates of the data -- kmeans = KMeans(n_clusters=10, random_state=0).fit(adata.obsm['spatial']) +# Line 112: Add the spatial KMeans clustering labels to the adata.obs data -- adata.obs['spatial_kmeans'] = pd.Series(kmeans.labels_, dtype='category').values +# Line 115: Learn intercellular flows using ov.externel.flowsig.tl.learn_intercellular_flows with spatial data -- ov.externel.flowsig.tl.learn_intercellular_flows(adata, +# Line 123: Apply biological flow validation using ov.externel.flowsig.tl.apply_biological_flow -- ov.externel.flowsig.tl.apply_biological_flow(adata, +# Line 129: Set a threshold for edge filtering in the network -- edge_threshold = 0.7 +# Line 131: Filter low-confidence edges in the network using ov.externel.flowsig.tl.filter_low_confidence_edges -- ov.externel.flowsig.tl.filter_low_confidence_edges(adata, +# Line 137: Write the adata object to a compressed h5ad file with a new name -- adata.write('data/cortex_commot_flowsig.h5ad',compression='gzip') +# Line 141: Construct the intercellular flow network using ov.externel.flowsig.tl.construct_intercellular_flow_network -- flow_network = ov.externel.flowsig.tl.construct_intercellular_flow_network(adata, 
+# Line 144: Set the flow expression key for subsequent analysis -- flowsig_expr_key='X_gem' +# Line 145: Retrieve the expression data from adata using flowsig_expr_key -- X_flow = adata.obsm[flowsig_expr_key] +# Line 146: Create a new AnnData object called adata_subset using expression data from X_flow -- adata_subset = sc.AnnData(X=X_flow) +# Line 147: Assign the observations from adata to adata_subset -- adata_subset.obs = adata.obs +# Line 148: Rename variable names of adata_subset using the GEM naming convention -- adata_subset.var.index =[f'GEM-{i}' for i in range(1,len(adata_subset.var)+1)] +# Line 151: Import the matplotlib plotting library -- import matplotlib.pyplot as plt +# Line 152: Create a dotplot using scanpy.pl.dotplot on the subset of adata object, grouped by 'Ground_Truth', with specified parameters -- ax=sc.pl.dotplot(adata_subset, adata_subset.var.index, groupby='Ground_Truth', +# Line 154: Create a color dictionary from ground truth categories to colors -- color_dict=dict(zip(adata.obs['Ground_Truth'].cat.categories,adata.uns['Ground_Truth_colors'])) +# Line 156: Plot the flowsig network using ov.pl.plot_flowsig_network -- ov.pl.plot_flowsig_network(flow_network=flow_network, +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py new file mode 100644 index 00000000..b494b4cf --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py @@ -0,0 +1,25 @@ +```python +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Apply default plotting settings from omicverse. -- ov.plot_set() +# Line 4: Import the scvelo library as scv. -- import scvelo as scv +# Line 5: Load the dentategyrus dataset from scvelo into an AnnData object named adata. -- adata=scv.datasets.dentategyrus() +# Line 6: Display the AnnData object named adata. 
-- adata +# Line 8: Preprocess the AnnData object adata using omicverse's preprocess function with specified parameters. -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 9: Display the preprocessed AnnData object named adata. -- adata +# Line 11: Run the cytotrace2 algorithm on the AnnData object adata with specified parameters and store the results. -- results = ov.single.cytotrace2(adata, +# Line 12: Specify the directory containing the trained models for cytotrace2. -- use_model_dir="cymodels/5_models_weights", +# Line 13: Specify the species as "mouse" for cytotrace2. -- species="mouse", +# Line 14: Set the batch size for cytotrace2 to 10000. -- batch_size = 10000, +# Line 15: Set the smooth batch size for cytotrace2 to 1000. -- smooth_batch_size = 1000, +# Line 16: Keep parallelization enabled for cytotrace2 (disable_parallelization is set to False). -- disable_parallelization = False, +# Line 17: Set the maximum number of cores to None for cytotrace2, which will use all available. -- max_cores = None, +# Line 18: Set the maximum number of principal components to use for cytotrace2 to 200. -- max_pcs = 200, +# Line 19: Set the random seed for cytotrace2 to 14. -- seed = 14, +# Line 20: Set the output directory for cytotrace2 results. -- output_dir = 'cytotrace2_results' +# Line 23: Generate a UMAP embedding plot of adata, colored by clusters and CytoTRACE2_Score. -- ov.utils.embedding(adata,basis='X_umap', +# Line 24: Color the plot by the 'clusters' and 'CytoTRACE2_Score' columns. -- color=['clusters','CytoTRACE2_Score'], +# Line 25: Set plot frame to 'small', colormap to 'Reds' and horizontal spacing. -- frameon='small',cmap='Reds',wspace=0.55) +# Line 27: Generate another UMAP embedding plot of adata, colored by CytoTRACE2_Potency and CytoTRACE2_Relative. -- ov.utils.embedding(adata,basis='X_umap', +# Line 28: Color the plot by the 'CytoTRACE2_Potency' and 'CytoTRACE2_Relative' columns.
-- color=['CytoTRACE2_Potency','CytoTRACE2_Relative'], +# Line 29: Set plot frame to 'small', colormap to 'Reds' and horizontal spacing. -- frameon='small',cmap='Reds',wspace=0.55) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py new file mode 100644 index 00000000..f35fe220 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py @@ -0,0 +1,129 @@ +```python +# Line 1: Imports the omicverse library as ov -- import omicverse as ov +# Line 2: Imports the scanpy library as sc -- import scanpy as sc +# Line 3: Imports the matplotlib.pyplot library as plt -- import matplotlib.pyplot as plt +# Line 5: Sets the plotting style using omicverse's plot_set function -- ov.plot_set() +# Line 7: Downloads gene ID annotation pairs using omicverse's utility function -- ov.utils.download_geneid_annotation_pair() +# Line 9: Reads count data from a file using omicverse's read function -- data=ov.read('data/counts.txt',index_col=0,header=1) +# Line 11: Renames the columns of the data by extracting the file name and removing the '.bam' suffix -- data.columns=[i.split('/')[-1].replace('.bam','') for i in data.columns] +# Line 12: Displays the head of the data -- data.head() +# Line 14: Maps gene IDs in the data using omicverse's Matrix_ID_mapping function with a given gene pair file. -- data=ov.bulk.Matrix_ID_mapping(data,'genesets/pair_GRCm39.tsv') +# Line 15: Displays the head of the updated data after mapping. -- data.head() +# Line 17: Creates a DEG object from the data using omicverse's pyDEG function -- dds=ov.bulk.pyDEG(data) +# Line 19: Removes duplicate indices from the DEG object -- dds.drop_duplicates_index() +# Line 20: Prints a success message after removing duplicate indices -- print('... 
drop_duplicates_index success') +# Line 22: Normalizes the data in the DEG object -- dds.normalize() +# Line 23: Prints a success message after normalizing the data -- print('... estimateSizeFactors and normalize success') +# Line 25: Defines a list of treatment groups for DEG analysis -- treatment_groups=['4-3','4-4'] +# Line 26: Defines a list of control groups for DEG analysis -- control_groups=['1--1','1--2'] +# Line 27: Performs differential gene expression analysis using t-tests with given treatment and control groups -- result=dds.deg_analysis(treatment_groups,control_groups,method='ttest') +# Line 28: Displays the head of the DEG analysis result -- result.head() +# Line 30: Prints the shape of the DEG analysis result dataframe -- print(result.shape) +# Line 31: Filters the result dataframe, keeping only rows where 'log2(BaseMean)' is greater than 1 -- result=result.loc[result['log2(BaseMean)']>1] +# Line 32: Prints the shape of the filtered DEG analysis result dataframe -- print(result.shape) +# Line 35: Sets fold change and p-value thresholds for the DEG results -- dds.foldchange_set(fc_threshold=-1, +# Line 36: Sets the p-value threshold to 0.05 -- pval_threshold=0.05, +# Line 37: Sets the maximum log p-value to 6 -- logp_max=6) +# Line 39: Creates a volcano plot of the DEG analysis results -- dds.plot_volcano(title='DEG Analysis',figsize=(4,4), +# Line 40: Specifies the number of genes to label in the volcano plot and sets font size -- plot_genes_num=8,plot_genes_fontsize=12,) +# Line 42: Generates boxplots for specified genes, comparing treatment and control groups -- dds.plot_boxplot(genes=['Ckap2','Lef1'],treatment_groups=treatment_groups, +# Line 43: Specifies boxplot's figure size, font size, and legend position -- control_groups=control_groups,figsize=(2,3),fontsize=12, +# Line 44: Specifies boxplot's legend position -- legend_bbox=(2,0.55)) +# Line 46: Generates boxplot for gene 'Ckap2', comparing treatment and control groups -- 
dds.plot_boxplot(genes=['Ckap2'],treatment_groups=treatment_groups, +# Line 47: Specifies boxplot's figure size, font size, and legend position -- control_groups=control_groups,figsize=(2,3),fontsize=12, +# Line 48: Specifies boxplot's legend position -- legend_bbox=(2,0.55)) +# Line 50: Downloads pathway database using omicverse's utility function -- ov.utils.download_pathway_database() +# Line 52: Prepares pathway dictionary from the specified file using omicverse's geneset_prepare -- pathway_dict=ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt',organism='Mouse') +# Line 54: Extracts a list of differentially expressed genes from DEG analysis results -- deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist() +# Line 55: Performs gene set enrichment analysis using omicverse's geneset_enrichment function -- enr=ov.bulk.geneset_enrichment(gene_list=deg_genes, +# Line 56: Provides pathways dict and p-value type, and organism for geneset enrichment -- pathways_dict=pathway_dict, +# Line 57: Automatically determine the type of p-value to use in enrichment analysis -- pvalue_type='auto', +# Line 58: Provides the organism information for geneset enrichment -- organism='mouse') +# Line 60: Plots the gene set enrichment results using omicverse's geneset_plot function -- ov.bulk.geneset_plot(enr,figsize=(2,5),fig_title='Wiki Pathway enrichment', +# Line 61: Sets the color bar location and the bounding box for plot -- cax_loc=[2, 0.45, 0.5, 0.02], +# Line 62: Specifies the bounding box for plot and node diameter -- bbox_to_anchor_used=(-0.25, -13),node_diameter=10, +# Line 63: Sets custom ticks and text knockout for plot -- custom_ticks=[5,7],text_knock=3, +# Line 64: Sets the color map to 'Reds' -- cmap='Reds') +# Line 66: Prepares GO Biological Process pathway dictionary -- pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Mouse') +# Line 67: Performs GO Biological Process gene set enrichment analysis -- 
enr_go_bp=ov.bulk.geneset_enrichment(gene_list=deg_genes, +# Line 68: Provides pathways dict and p-value type, and organism for geneset enrichment -- pathways_dict=pathway_dict, +# Line 69: Automatically determine the type of p-value to use in enrichment analysis -- pvalue_type='auto', +# Line 70: Provides the organism information for geneset enrichment -- organism='mouse') +# Line 71: Prepares GO Molecular Function pathway dictionary -- pathway_dict=ov.utils.geneset_prepare('genesets/GO_Molecular_Function_2023.txt',organism='Mouse') +# Line 72: Performs GO Molecular Function gene set enrichment analysis -- enr_go_mf=ov.bulk.geneset_enrichment(gene_list=deg_genes, +# Line 73: Provides pathways dict and p-value type, and organism for geneset enrichment -- pathways_dict=pathway_dict, +# Line 74: Automatically determine the type of p-value to use in enrichment analysis -- pvalue_type='auto', +# Line 75: Provides the organism information for geneset enrichment -- organism='mouse') +# Line 76: Prepares GO Cellular Component pathway dictionary -- pathway_dict=ov.utils.geneset_prepare('genesets/GO_Cellular_Component_2023.txt',organism='Mouse') +# Line 77: Performs GO Cellular Component gene set enrichment analysis -- enr_go_cc=ov.bulk.geneset_enrichment(gene_list=deg_genes, +# Line 78: Provides pathways dict and p-value type, and organism for geneset enrichment -- pathways_dict=pathway_dict, +# Line 79: Automatically determine the type of p-value to use in enrichment analysis -- pvalue_type='auto', +# Line 80: Provides the organism information for geneset enrichment -- organism='mouse') +# Line 82: Creates a dictionary containing GO enrichment results -- enr_dict={'BP':enr_go_bp, +# Line 83: Adds Molecular Function GO enrichment results in the dict -- 'MF':enr_go_mf, +# Line 84: Adds Cellular Component GO enrichment results in the dict -- 'CC':enr_go_cc} +# Line 85: Defines color mapping for the GO categories -- colors_dict={ +# Line 86: Adds Red color for Biological 
Process GO term -- 'BP':ov.pl.red_color[1], +# Line 87: Adds Green color for Molecular Function GO term -- 'MF':ov.pl.green_color[1], +# Line 88: Adds Blue color for Cellular Component GO term -- 'CC':ov.pl.blue_color[1], +# Line 89: Closes color mapping dictionary -- } +# Line 91: Plots multiple gene set enrichment results using omicverse's function -- ov.bulk.geneset_plot_multi(enr_dict,colors_dict,num=3, +# Line 92: Specifies the figure size -- figsize=(2,5), +# Line 93: Sets the text knockout and fontsize -- text_knock=3,fontsize=8, +# Line 94: Sets the color map to 'Reds' -- cmap='Reds' +# Line 95: Closes function call -- ) +# Line 98: Defines a function `geneset_plot_multi` to plot multiple gene set enrichment results. -- def geneset_plot_multi(enr_dict,colors_dict,num:int=5,fontsize=10, +# Line 99: Specifies title, x label, figure size, color map, text knockout, max size, and axis -- fig_title:str='',fig_xlabel:str='Fractions of genes', +# Line 100: Specifies figure size, color map, text knock, max size, and axes -- figsize:tuple=(2,4),cmap:str='YlGnBu', +# Line 101: Specifies text knock, max size, and axes -- text_knock:int=5,text_maxsize:int=20,ax=None, +# Line 102: Closes function definition -- ): +# Line 103: Imports necessary classes from PyComplexHeatmap library -- from PyComplexHeatmap import HeatmapAnnotation,DotClustermapPlotter,anno_label,anno_simple,AnnotationBase +# Line 104: Iterates through the enrichment dictionaries and adds a 'Type' column -- for key in enr_dict.keys(): +# Line 105: Adds 'Type' column in each dictionary -- enr_dict[key]['Type']=key +# Line 106: Concatenates the top 'num' rows of all enrichment results into a single DataFrame -- enr_all=pd.concat([enr_dict[i].iloc[:num] for i in enr_dict.keys()],axis=0) +# Line 107: Shortens and sets text for plot term labels -- enr_all['Term']=[ov.utils.plot_text_set(i.split('(')[0],text_knock=text_knock,text_maxsize=text_maxsize) for i in enr_all.Term.tolist()] +# Line 108: Sets the index 
of the DataFrame to the modified term labels -- enr_all.index=enr_all.Term +# Line 109: Stores the index values in a new column named "Term1" -- enr_all['Term1']=[i for i in enr_all.index.tolist()] +# Line 110: Deletes the original term column -- del enr_all['Term'] +# Line 112: Assigns the defined colors for each type of GO analysis -- colors=colors_dict +# Line 114: Creates a HeatmapAnnotation for left side with labels, type mapping and axis specification -- left_ha = HeatmapAnnotation( +# Line 115: Configures label annotation with merging, rotation, colors, and rel position -- label=anno_label(enr_all.Type, merge=True,rotation=0,colors=colors,relpos=(1,0.8)), +# Line 116: Configures Category annotation with colors, legend, text etc -- Category=anno_simple(enr_all.Type,cmap='Set1', +# Line 117: Adds details for annotation with text, legend and colors -- add_text=False,legend=False,colors=colors), +# Line 118: Specifies the axis for left annotation and label properties -- axis=0,verbose=0,label_kws={'rotation':45,'horizontalalignment':'left','visible':False}) +# Line 119: Creates a HeatmapAnnotation for right side with labels, type mapping and axis specification -- right_ha = HeatmapAnnotation( +# Line 120: Configures label annotation with merging, rotation, colors, position, arrows etc -- label=anno_label(enr_all.Term1, merge=True,rotation=0,relpos=(0,0.5),arrowprops=dict(visible=True), +# Line 121: Sets colors to labels by mapping to dict with set colors by each type of annotation and sets font -- colors=enr_all.assign(color=enr_all.Type.map(colors)).set_index('Term1').color.to_dict(), +# Line 122: Sets font size and luminance values -- fontsize=fontsize,luminance=0.8,height=2), +# Line 123: Sets the axis and label keyword of annotation -- axis=0,verbose=0,#label_kws={'rotation':45,'horizontalalignment':'left'}, +# Line 124: Specifies the orientation -- orientation='right') +# Line 125: Creates subplots if no axes object given -- if ax==None: +# Line 126: 
Creates the figure and axes object with specific figure size -- fig, ax = plt.subplots(figsize=figsize) +# Line 127: Assigns the provided axis object if one provided -- else: +# Line 128: Sets the axes to the provided axes object -- ax=ax +# Line 130: Creates dotclustermap plot with data, x,y values and heatmap properties -- cm = DotClustermapPlotter(data=enr_all, x='fraction',y='Term1',value='logp',c='logp',s='num', +# Line 131: Sets the color map -- cmap=cmap, +# Line 132: Sets the row clustering -- row_cluster=True,#col_cluster=True,#hue='Group', +# Line 134: Sets the vmin and vmax value in the heatmap colorbar -- vmin=-1*np.log10(0.1),vmax=-1*np.log10(1e-10), +# Line 137: Sets row and column label properties -- show_rownames=True,show_colnames=False,row_dendrogram=False, +# Line 138: Specifies the side of the labels -- col_names_side='top',row_names_side='right', +# Line 139: Sets the label properties of the x-axis ticks -- xticklabels_kws={'labelrotation': 30, 'labelcolor': 'blue','labelsize':fontsize}, +# Line 141: Sets left and right annotation properties -- left_annotation=left_ha,right_annotation=right_ha, +# Line 142: Sets spines property of the plot -- spines=False, +# Line 143: Splits rows based on type of GO term -- row_split=enr_all.Type,# row_split_gap=1, +# Line 145: Sets the verbosity, legend properties -- verbose=1,legend_gap=10, +# Line 148: Sets x label for plot -- xlabel='Fractions of genes',xlabel_side="bottom", +# Line 149: Sets label padding, font weight, and font size of x axis -- xlabel_kws=dict(labelpad=8,fontweight='normal',fontsize=fontsize+2), +# Line 151: Gets axes from figure -- tesr=plt.gcf().axes +# Line 152: Iterates through each axis on figure -- for ax in plt.gcf().axes: +# Line 153: Check if each of the axes contains get_xlabel property -- if hasattr(ax, 'get_xlabel'): +# Line 154: Checks if xlabel property is Fractions of genes -- if ax.get_xlabel() == 'Fractions of genes': # 假设 colorbar 有一个特定的标签 +# Line 155: Sets the cbar 
object as the axes object -- cbar = ax +# Line 156: Disables grid in the colorbar -- cbar.grid(False) +# Line 157: Checks if y label is logp -- if ax.get_ylabel() == 'logp': # 假设 colorbar 有一个特定的标签 +# Line 158: Sets the cbar object as the axes object -- cbar = ax +# Line 159: Sets the label size of the axis -- cbar.tick_params(labelsize=fontsize+2) +# Line 160: Sets the y label of the colorbar -- cbar.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2) +# Line 161: Disables grid in the colorbar -- cbar.grid(False) +# Line 162: Returns the axis object -- return ax +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py new file mode 100644 index 00000000..9fdccd5e --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py @@ -0,0 +1,32 @@ +```python +# Line 1: Imports the omicverse library as ov. -- import omicverse as ov +# Line 2: Sets the plotting style for omicverse. -- ov.utils.ov_plot_set() +# Line 3: Reads data from a URL into a pandas DataFrame, using the first column as the index and the second row as the header. -- data=ov.utils.read('https://raw.githubusercontent.com/Starlitnightly/Pyomic/master/sample/counts.txt',index_col=0,header=1) +# Line 5: Replaces `.bam` and leading path info in column names of the DataFrame. -- data.columns=[i.split('/')[-1].replace('.bam','') for i in data.columns] +# Line 6: Displays the first few rows of the DataFrame. -- data.head() +# Line 8: Downloads a gene ID annotation pair file. -- ov.utils.download_geneid_annotation_pair() +# Line 9: Maps gene IDs in the DataFrame using a specified annotation file. -- data=ov.bulk.Matrix_ID_mapping(data,'genesets/pair_GRCm39.tsv') +# Line 10: Displays the first few rows of the updated DataFrame. -- data.head() +# Line 12: Initializes a pyDEG object from the DataFrame for differential expression analysis. 
-- dds=ov.bulk.pyDEG(data) +# Line 13: Removes duplicate index entries from the pyDEG object. -- dds.drop_duplicates_index() +# Line 14: Prints a success message after removing duplicate indices. -- print('... drop_duplicates_index success') +# Line 16: Defines a list of treatment group labels. -- treatment_groups=['4-3','4-4'] +# Line 17: Defines a list of control group labels. -- control_groups=['1--1','1--2'] +# Line 18: Performs differential expression analysis using DEseq2 method. -- result=dds.deg_analysis(treatment_groups,control_groups,method='DEseq2') +# Line 21: Prints the shape of the result DataFrame. -- print(result.shape) +# Line 22: Filters the results DataFrame to include only genes with a log2(BaseMean) greater than 1. -- result=result.loc[result['log2(BaseMean)']>1] +# Line 23: Prints the shape of the filtered result DataFrame. -- print(result.shape) +# Line 25: Sets fold change and p-value thresholds for differential gene expression analysis. -- dds.foldchange_set(fc_threshold=-1, +# Line 27: Plots a volcano plot of the differential expression analysis results. -- dds.plot_volcano(title='DEG Analysis',figsize=(4,4), +# Line 29: Plots boxplots for specified genes across treatment and control groups. -- dds.plot_boxplot(genes=['Ckap2','Lef1'],treatment_groups=treatment_groups, +# Line 32: Plots a boxplot for a single gene across treatment and control groups. -- dds.plot_boxplot(genes=['Ckap2'],treatment_groups=treatment_groups, +# Line 35: Downloads a pathway database. -- ov.utils.download_pathway_database() +# Line 37: Prepares a gene set dictionary from a specified file. -- pathway_dict=ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt',organism='Mouse') +# Line 39: Converts differential expression results into a ranked gene list for GSEA. -- rnk=dds.ranking2gsea() +# Line 40: Initializes a pyGSEA object for Gene Set Enrichment Analysis. -- gsea_obj=ov.bulk.pyGSEA(rnk,pathway_dict) +# Line 41: Performs gene set enrichment analysis. 
-- enrich_res=gsea_obj.enrichment() +# Line 42: Displays the first few rows of the enrichment results DataFrame. -- gsea_obj.enrich_res.head() +# Line 44: Plots the gene set enrichment results. -- gsea_obj.plot_enrichment(num=10,node_size=[10,20,30], +# Line 49: Displays the first 5 indices of the enrichment results. -- gsea_obj.enrich_res.index[:5] +# Line 51: Plots a Gene Set Enrichment Analysis plot for a specified gene set. -- fig=gsea_obj.plot_gsea(term_num=1, +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py new file mode 100644 index 00000000..f99afd6a --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py @@ -0,0 +1,176 @@ +```python +# Line 1: -- import omicverse as ov +# Line 1: Imports the omicverse library and assigns it the alias 'ov'. +# Line 2: -- print(f'omicverse version:{ov.__version__}') +# Line 2: Prints the version of the omicverse library. +# Line 3: -- import scanpy as sc +# Line 3: Imports the scanpy library and assigns it the alias 'sc'. +# Line 4: -- print(f'scanpy version:{sc.__version__}') +# Line 4: Prints the version of the scanpy library. +# Line 5: -- ov.ov_plot_set() +# Line 5: Sets the plotting style for omicverse. +# Line 10: -- adata = sc.read_10x_mtx( +# Line 10: Reads 10x Genomics data into an AnnData object. +# Line 11: -- 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file +# Line 11: Specifies the directory containing the matrix files. +# Line 12: -- var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) +# Line 12: Sets gene symbols as variable names. +# Line 13: -- cache=True) # write a cache file for faster subsequent reading +# Line 13: Enables caching for faster reading in subsequent executions. +# Line 17: -- adata=ov.pp.qc(adata, +# Line 17: Applies quality control filtering on the AnnData object. 
+# Line 18: -- tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250}) +# Line 18: Sets thresholds for mitochondrial percentage, number of UMIs, and detected genes for QC. +# Line 20: -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 20: Preprocesses the AnnData object, including normalization and HVG selection. +# Line 23: -- adata.raw = adata +# Line 23: Stores the current preprocessed AnnData object (before highly-variable-gene filtering) in its `.raw` attribute. +# Line 24: -- adata = adata[:, adata.var.highly_variable_features] +# Line 24: Filters the AnnData object to keep only highly variable genes. +# Line 27: -- ov.pp.scale(adata) +# Line 27: Scales the gene expression data in the AnnData object. +# Line 30: -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 30: Performs Principal Component Analysis (PCA) on the scaled data. +# Line 33: -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 33: Constructs a neighborhood graph for the AnnData object. +# Line 34: -- use_rep='scaled|original|X_pca') +# Line 34: Specifies the representations used for neighbor graph construction. +# Line 37: -- sc.tl.leiden(adata) +# Line 37: Performs Leiden clustering on the AnnData object. +# Line 40: -- sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca') +# Line 40: Computes a dendrogram based on the Leiden clusters. +# Line 41: -- sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', +# Line 41: Ranks genes based on differential expression between leiden clusters. +# Line 42: -- method='wilcoxon',use_raw=False,) +# Line 42: Uses the Wilcoxon test for gene ranking and does not use the raw data. +# Line 45: -- adata.obsm["X_mde"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) +# Line 45: Computes a Minimum-Distortion Embedding (MDE) from the PCA representation and stores it in the obsm attribute under "X_mde". +# Line 46: -- adata +# Line 46: Displays the AnnData object. +# Line 48: -- ov.pl.embedding(adata, +# Line 48: Creates an embedding plot.
+# Line 49: -- basis='X_mde', +# Line 49: Specifies the embedding basis for plotting. +# Line 50: -- color=['leiden'], +# Line 50: Sets the colors based on the 'leiden' clustering. +# Line 51: -- legend_loc='on data', +# Line 51: Places the legend on the plot. +# Line 52: -- frameon='small', +# Line 52: Sets a smaller frame size for plot. +# Line 53: -- legend_fontoutline=2, +# Line 53: Sets the outline width of legend text. +# Line 54: -- palette=ov.utils.palette()[14:], +# Line 54: Sets the color palette for plotting. +# Line 56: -- import os +# Line 56: Imports the os library for environment variable manipulation. +# Line 57: -- all_markers={'cluster1':['CD3D','CD3E'], +# Line 57: Defines a dictionary of marker genes for specific clusters. +# Line 58: -- 'cluster2':['MS4A1']} +# Line 58: Defines additional marker genes for a specific cluster. +# Line 60: -- os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key +# Line 60: Sets an environment variable for API key. +# Line 61: -- result = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human', +# Line 61: Uses a function to predict cell types using a large language model based on the markers. +# Line 62: -- model='qwen-plus', provider='qwen', +# Line 62: Specifies the large language model and provider. +# Line 63: -- topgenenumber=5) +# Line 63: Specifies the top number of genes to consider. +# Line 64: -- result +# Line 64: Displays the result. +# Line 66: -- all_markers=ov.single.get_celltype_marker(adata,clustertype='leiden',rank=True, +# Line 66: Gets marker genes based on the Leiden clusters. +# Line 67: -- key='rank_genes_groups', +# Line 67: Specifies the key for retrieving the ranked genes. +# Line 68: -- foldchange=2,topgenenumber=5) +# Line 68: Sets the fold change threshold and number of top genes. +# Line 69: -- all_markers +# Line 69: Displays all markers. +# Line 71: -- import os +# Line 71: Imports the os library for environment variable manipulation. 
+# Line 72: -- os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key +# Line 72: Sets an environment variable for API key. +# Line 73: -- result = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human', +# Line 73: Predicts cell types based on the identified markers using a large language model. +# Line 74: -- model='qwen-plus', provider='qwen', +# Line 74: Specifies the large language model and provider. +# Line 75: -- topgenenumber=5) +# Line 75: Specifies the top number of genes to consider. +# Line 76: -- result +# Line 76: Displays the result. +# Line 78: -- new_result={} +# Line 78: Initializes an empty dictionary. +# Line 79: -- for key in result.keys(): +# Line 79: Iterates through the keys in the result dictionary. +# Line 80: -- new_result[key]=result[key].split(': ')[-1].split(' (')[0].split('. ')[1] +# Line 80: Processes the result strings to extract the cell type name. +# Line 81: -- new_result +# Line 81: Displays the processed results. +# Line 83: -- adata.obs['gpt_celltype'] = adata.obs['leiden'].map(new_result).astype('category') +# Line 83: Maps the cell types from new_result to the obs attribute of the AnnData object. +# Line 85: -- ov.pl.embedding(adata, +# Line 85: Creates an embedding plot with cell type annotation. +# Line 86: -- basis='X_mde', +# Line 86: Specifies the embedding basis for plotting. +# Line 87: -- color=['leiden','gpt_celltype'], +# Line 87: Specifies the colors for the embedding plot. +# Line 88: -- legend_loc='on data', +# Line 88: Sets the location of legend on the data points. +# Line 89: -- frameon='small', +# Line 89: Sets the frame to small size. +# Line 90: -- legend_fontoutline=2, +# Line 90: Sets the font outline width for legend text. +# Line 91: -- palette=ov.utils.palette()[14:], +# Line 91: Sets the color palette for the plot. +# Line 93: -- all_markers={'cluster1':['CD3D','CD3E'], +# Line 93: Defines a dictionary of marker genes for specific clusters. 
+# Line 94: -- 'cluster2':['MS4A1']} +# Line 94: Defines additional marker genes for a specific cluster. +# Line 96: -- os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key +# Line 96: Sets an environment variable for API key. +# Line 97: -- result = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human', +# Line 97: Predicts cell types using a large language model, specifically gpt-4o. +# Line 98: -- model='gpt-4o', provider='openai', +# Line 98: Specifies the large language model and provider. +# Line 99: -- topgenenumber=5) +# Line 99: Specifies the top number of genes to consider. +# Line 100: -- result +# Line 100: Displays the result. +# Line 102: -- os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key +# Line 102: Sets an environment variable for API key. +# Line 103: -- result = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human', +# Line 103: Predicts cell types using a large language model, specifically qwen-plus. +# Line 104: -- model='qwen-plus', provider='qwen', +# Line 104: Specifies the large language model and provider. +# Line 105: -- topgenenumber=5) +# Line 105: Specifies the top number of genes to consider. +# Line 106: -- result +# Line 106: Displays the result. +# Line 108: -- os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key +# Line 108: Sets an environment variable for API key. +# Line 109: -- result = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human', +# Line 109: Predicts cell types using a large language model, specifically moonshot-v1-8k. +# Line 110: -- model='moonshot-v1-8k', provider='kimi', +# Line 110: Specifies the large language model and provider. +# Line 111: -- topgenenumber=5) +# Line 111: Specifies the top number of genes to consider. +# Line 112: -- result +# Line 112: Displays the result. 
+# Line 114: -- os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key +# Line 114: Sets the `AGI_API_KEY` environment variable to the API key. +# Line 115: -- result = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human', +# Line 115: Predicts cell types using a large language model, specifically moonshot-v1-8k, with a base URL. +# Line 116: -- model='moonshot-v1-8k', base_url="https://api.moonshot.cn/v1", +# Line 116: Specifies the large language model and the base URL of the API endpoint (no provider is given). +# Line 117: -- topgenenumber=5) +# Line 117: Specifies the number of top genes to consider. +# Line 118: -- result +# Line 118: Displays the result. +# Line 120: -- anno_model = 'path/to/your/local/LLM' # '~/models/Qwen2-7B-Instruct' +# Line 120: Defines a variable for a local LLM model path. +# Line 122: -- result = ov.single.gptcelltype_local(all_markers, tissuename='PBMC', speciename='human', +# Line 122: Predicts cell types using a locally hosted large language model. +# Line 123: -- model_name=anno_model, topgenenumber=5) +# Line 123: Specifies the local model and the number of top genes. +# Line 124: -- result +# Line 124: Displays the result. +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py new file mode 100644 index 00000000..c3a4bd91 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py @@ -0,0 +1,36 @@ +```python +# Line 1: Imports the omicverse library and assigns it the alias ov. -- import omicverse as ov +# Line 3: Imports the scanpy library and assigns it the alias sc. -- import scanpy as sc +# Line 5: Sets the plotting parameters for omicverse. -- ov.utils.ov_plot_set() +# Line 7: Reads a single-cell data file into an AnnData object using omicverse. -- adata_sc=ov.read('data/sc.h5ad') +# Line 8: Imports the matplotlib.pyplot module and assigns it the alias plt.
-- import matplotlib.pyplot as plt +# Line 9: Creates a matplotlib figure and axes with a specified size. -- fig, ax = plt.subplots(figsize=(3,3)) +# Line 10: Generates and displays an embedding plot using omicverse, visualizing the 'Subset' annotation on the UMAP coordinates. -- ov.utils.embedding( +# Line 20: Prints the maximum value of raw expression data before normalization. -- print("RAW",adata_sc.X.max()) +# Line 21: Preprocesses the single-cell data using omicverse, applying shifting and log transformation, selecting highly variable genes, and targeting a specific total sum. -- adata_sc=ov.pp.preprocess(adata_sc,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4) +# Line 22: Stores the preprocessed data as raw attribute. -- adata_sc.raw = adata_sc +# Line 23: Subsets the AnnData object to include only the highly variable genes. -- adata_sc = adata_sc[:, adata_sc.var.highly_variable_features] +# Line 24: Prints the maximum value of normalized expression data. -- print("Normalize",adata_sc.X.max()) +# Line 26: Loads a Visium spatial transcriptomics dataset from scanpy. -- adata = sc.datasets.visium_sge(sample_id="V1_Human_Lymph_Node") +# Line 27: Adds sample information to the obs attribute of the spatial transcriptomics data. -- adata.obs['sample'] = list(adata.uns['spatial'].keys())[0] +# Line 28: Makes the gene names unique in the spatial transcriptomics data. -- adata.var_names_make_unique() +# Line 30: Calculates quality control metrics for the spatial transcriptomics data using scanpy. -- sc.pp.calculate_qc_metrics(adata, inplace=True) +# Line 31: Filters the spatial transcriptomics data by total counts. -- adata = adata[:,adata.var['total_counts']>100] +# Line 32: Calculates spatial variable genes for the spatial transcriptomics data using omicverse. -- adata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform="visium",) +# Line 33: Stores the spatial data as raw attribute. 
-- adata.raw = adata +# Line 34: Subsets the spatial AnnData object to include only spatially variable genes. -- adata = adata[:, adata.var.space_variable_features] +# Line 35: Creates a copy of the processed spatial transcriptomics data. -- adata_sp=adata.copy() +# Line 36: Displays the spatial AnnData object. -- adata_sp +# Line 38: Initializes the Tangram object for spatial mapping using the single-cell and spatial data. -- tg=ov.space.Tangram(adata_sc,adata_sp,clusters='Subset') +# Line 40: Trains the Tangram model using the specified settings. -- tg.train(mode="clusters",num_epochs=500,device="cuda:0") +# Line 42: Performs cell-to-location mapping using the trained Tangram model. -- adata_plot=tg.cell2location() +# Line 43: Displays the column names of the observation data for the mapped object. -- adata_plot.obs.columns +# Line 45: Defines a list of cell type annotations for plotting. -- annotation_list=['B_Cycling', 'B_GC_LZ', 'T_CD4+_TfH_GC', 'FDC', +# Line 48: Generates and displays a spatial plot using scanpy, visualizing cell types. -- sc.pl.spatial(adata_plot, cmap='magma', +# Line 57: Creates a dictionary mapping the single-cell 'Subset' categories to colors. -- color_dict=dict(zip(adata_sc.obs['Subset'].cat.categories, +# Line 60: Imports the matplotlib module. -- import matplotlib as mpl +# Line 61: Selects the first five entries of the annotation list as the cluster labels to plot. -- clust_labels = annotation_list[:5] +# Line 62: Converts cell type labels to strings for column name compatibility. -- clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labels +# Line 64: Creates a context for matplotlib rc parameters for specific plot configurations. -- with mpl.rc_context({'figure.figsize': (8, 8),'axes.grid': False}): +# Line 65: Generates and displays a spatial plot using omicverse, visualizing cell types.
-- fig = ov.pl.plot_spatial( +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_metacells_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_metacells_annotated.py new file mode 100644 index 00000000..bf597968 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_metacells_annotated.py @@ -0,0 +1,88 @@ +``` +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the scanpy library as sc. -- import scanpy as sc +# Line 3: Import the scvelo library as scv. -- import scvelo as scv +# Line 5: Set plotting parameters using ov.plot_set(). -- ov.plot_set() +# Line 7: Load the pancreas dataset using scv and assign it to adata. -- adata = scv.datasets.pancreas() +# Line 8: Display the loaded AnnData object. -- adata +# Line 11: Perform quality control on the AnnData object using ov.pp.qc, filtering based on mito percentage, number of UMIs, and detected genes and filtering mitochondrial genes. -- adata=ov.pp.qc(adata, +# Line 12: Perform quality control on the AnnData object using ov.pp.qc, filtering based on mito percentage, number of UMIs, and detected genes and filtering mitochondrial genes. -- tresh={'mito_perc': 0.20, 'nUMIs': 500, 'detected_genes': 250}, +# Line 13: Perform quality control on the AnnData object using ov.pp.qc, filtering based on mito percentage, number of UMIs, and detected genes and filtering mitochondrial genes. -- mt_startswith='mt-') +# Line 15: Preprocess the AnnData object using ov.pp.preprocess with shiftlog normalization and Pearson residuals, calculating 2000 highly variable genes. -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 18: Store the original AnnData object in the raw attribute. -- adata.raw = adata +# Line 19: Filter the AnnData object to keep only the highly variable genes. -- adata = adata[:, adata.var.highly_variable_features] +# Line 22: Scale the expression data in adata.X using ov.pp.scale(). 
-- ov.pp.scale(adata) +# Line 25: Perform PCA dimensionality reduction using ov.pp.pca, using scaled data and 50 principal components. -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 27: Create a MetaCell object using ov.single.MetaCell with the scaled, original and pca data, no specific number of metacells, and using GPU if available. -- meta_obj=ov.single.MetaCell(adata,use_rep='scaled|original|X_pca', +# Line 28: Create a MetaCell object using ov.single.MetaCell with the scaled, original and pca data, no specific number of metacells, and using GPU if available. -- n_metacells=None, +# Line 29: Create a MetaCell object using ov.single.MetaCell with the scaled, original and pca data, no specific number of metacells, and using GPU if available. -- use_gpu='cuda:0') +# Line 31: Initialize the archetypes for the MetaCell object. -- meta_obj.initialize_archetypes() +# Line 33: Train the MetaCell model with a minimum of 10 and maximum of 50 iterations. -- meta_obj.train(min_iter=10, max_iter=50) +# Line 35: Save the trained MetaCell model to the specified file path. -- meta_obj.save('seacells/model.pkl') +# Line 37: Load the trained MetaCell model from the specified file path. -- meta_obj.load('seacells/model.pkl') +# Line 39: Predict cell assignments using the trained MetaCell model, assigning soft memberships, using cluster labels and summarizing normalized log data. -- ad=meta_obj.predicted(method='soft',celltype_label='clusters', +# Line 40: Predict cell assignments using the trained MetaCell model, assigning soft memberships, using cluster labels and summarizing normalized log data. -- summarize_layer='lognorm') +# Line 42: Compute cell type purity scores based on clusters labels. -- SEACell_purity = meta_obj.compute_celltype_purity('clusters') +# Line 43: Calculate separation scores using specified representations and nearest neighbor. 
-- separation = meta_obj.separation(use_rep='scaled|original|X_pca',nth_nbr=1) +# Line 44: Calculate compactness scores using specified representations. -- compactness = meta_obj.compactness(use_rep='scaled|original|X_pca') +# Line 46: Import the seaborn library as sns. -- import seaborn as sns +# Line 47: Import the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt +# Line 48: Set plot parameters with omicverse. -- ov.plot_set() +# Line 49: Create a figure and axes for subplots for evaluation metrics. -- fig, axes = plt.subplots(1,3,figsize=(4,4)) +# Line 50: Create a box plot of the SEACell purity data on the first subplot using a blue color from the ov.utils palette. -- sns.boxplot(data=SEACell_purity, y='clusters_purity',ax=axes[0], +# Line 51: Create a box plot of the SEACell purity data on the first subplot using a blue color from the ov.utils palette. -- color=ov.utils.blue_color[3]) +# Line 52: Create a box plot of the compactness data on the second subplot using a blue color from the ov.utils palette. -- sns.boxplot(data=compactness, y='compactness',ax=axes[1], +# Line 53: Create a box plot of the compactness data on the second subplot using a blue color from the ov.utils palette. -- color=ov.utils.blue_color[4]) +# Line 54: Create a box plot of the separation data on the third subplot using a blue color from the ov.utils palette. -- sns.boxplot(data=separation, y='separation',ax=axes[2], +# Line 55: Create a box plot of the separation data on the third subplot using a blue color from the ov.utils palette. -- color=ov.utils.blue_color[4]) +# Line 56: Adjust the spacing between subplots to avoid overlapping. -- plt.tight_layout() +# Line 57: Set the title of the entire figure and adjust vertical positioning. -- plt.suptitle('Evaluate of MetaCells',fontsize=13,y=1.05) +# Line 58: Iterate through each of the axes to customize the appearance of each plot. -- for ax in axes: +# Line 59: Disable grid lines for the current plot. 
-- ax.grid(False) +# Line 60: Make the top spine of the current plot invisible. -- ax.spines['top'].set_visible(False) +# Line 61: Make the right spine of the current plot invisible. -- ax.spines['right'].set_visible(False) +# Line 62: Make the bottom spine of the current plot visible. -- ax.spines['bottom'].set_visible(True) +# Line 63: Make the left spine of the current plot visible. -- ax.spines['left'].set_visible(True) +# Line 65: Import the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt +# Line 66: Create a single figure and axes object for embedding. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 67: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- ov.pl.embedding( +# Line 68: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- meta_obj.adata, +# Line 69: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- basis="X_umap", +# Line 70: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- color=['clusters'], +# Line 71: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- frameon='small', +# Line 72: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- title="Meta cells", +# Line 73: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. 
-- #legend_loc='on data', +# Line 74: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- legend_fontsize=14, +# Line 75: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- legend_fontoutline=2, +# Line 76: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- size=10, +# Line 77: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- ax=ax, +# Line 78: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- alpha=0.2, +# Line 79: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- #legend_loc='', +# Line 80: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- add_outline=False, +# Line 81: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- #add_outline=True, +# Line 82: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- outline_color='black', +# Line 83: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. 
-- outline_width=1, +# Line 84: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- show=False, +# Line 85: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- #palette=ov.utils.blue_color[:], +# Line 86: Generate an embedding plot using ov.pl.embedding using umap coordinates, colored by 'clusters' and customized legend, marker size, transparency, and outline. -- #legend_fontweight='normal' +# Line 87: Overlay meta cells using ov.single.plot_metacells on the existing plot with a specified color. -- ov.single.plot_metacells(ax,meta_obj.adata,color='#CB3E35', +# Line 89: Get mean S score values from meta cells using ov.single.get_obs_value function. -- ov.single.get_obs_value(ad,adata,groupby='S_score', +# Line 90: Get mean S score values from meta cells using ov.single.get_obs_value function. -- type='mean') +# Line 91: Show the head of the annotation data. -- ad.obs.head() +# Line 93: Import the scanpy library as sc. -- import scanpy as sc +# Line 94: Create a copy of the AnnData object to the raw attribute. -- ad.raw=ad.copy() +# Line 95: Calculate highly variable genes using scanpy's highly_variable_genes function, selecting the top 2000 genes. -- sc.pp.highly_variable_genes(ad, n_top_genes=2000, inplace=True) +# Line 96: Filter the AnnData object to keep only the highly variable genes. -- ad=ad[:,ad.var.highly_variable] +# Line 98: Scale the expression data in ad.X using ov.pp.scale(). -- ov.pp.scale(ad) +# Line 99: Perform PCA dimensionality reduction using ov.pp.pca, using scaled data and 30 principal components. -- ov.pp.pca(ad,layer='scaled',n_pcs=30) +# Line 100: Compute neighborhood graph using ov.pp.neighbors, for specified parameters. 
-- ov.pp.neighbors(ad, n_neighbors=15, n_pcs=20, +# Line 101: Compute neighborhood graph using ov.pp.neighbors, for specified parameters. -- use_rep='scaled|original|X_pca') +# Line 103: Compute UMAP embedding coordinates using ov.pp.umap(). -- ov.pp.umap(ad) +# Line 105: Cast the 'celltype' column in ad.obs as a category type. -- ad.obs['celltype']=ad.obs['celltype'].astype('category') +# Line 106: Reorder the categories in the 'celltype' column of ad.obs to match the order of clusters in adata.obs. -- ad.obs['celltype']=ad.obs['celltype'].cat.reorder_categories(adata.obs['clusters'].cat.categories) +# Line 107: Copy color palette associated with 'clusters' from adata to ad under the name 'celltype'. -- ad.uns['celltype_colors']=adata.uns['clusters_colors'] +# Line 109: Generate embedding plot using ov.pl.embedding using umap coordinates, colored by 'celltype' and 'S_score', with specified title and layout adjustments. -- ov.pl.embedding(ad, basis='X_umap', +# Line 110: Generate embedding plot using ov.pl.embedding using umap coordinates, colored by 'celltype' and 'S_score', with specified title and layout adjustments. -- color=["celltype","S_score"], +# Line 111: Generate embedding plot using ov.pl.embedding using umap coordinates, colored by 'celltype' and 'S_score', with specified title and layout adjustments. -- frameon='small',cmap='RdBu_r', +# Line 112: Generate embedding plot using ov.pl.embedding using umap coordinates, colored by 'celltype' and 'S_score', with specified title and layout adjustments. -- wspace=0.5) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py new file mode 100644 index 00000000..a8fd8782 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py @@ -0,0 +1,25 @@ +```python +# Line 1: Imports the omicverse library as ov. 
-- import omicverse as ov +# Line 2: Sets up the plotting configurations using the ov_plot_set function. -- ov.utils.ov_plot_set() +# Line 9: Imports the scanpy library as sc. -- import scanpy as sc +# Line 10: Reads an AnnData object from the file 'TiME_adata_scvi.h5ad' and stores it in the variable adata. -- adata=sc.read('TiME_adata_scvi.h5ad') +# Line 11: Displays the contents of the AnnData object. -- adata +# Line 15: Computes the neighborhood graph using the 'X_scVI' representation. -- sc.pp.neighbors(adata, use_rep="X_scVI") +# Line 17: Calculates the Minimum-Distortion Embedding (MDE) of the 'X_scVI' representation and stores it in 'X_mde'. -- adata.obsm["X_mde"] = ov.utils.mde(adata.obsm["X_scVI"]) +# Line 19: Generates and displays an embedding plot of the 'X_mde' using the "patient" column for coloring. -- sc.pl.embedding( +# Line 20: Passes the AnnData object to plot. -- adata, +# Line 21: Specifies the embedding basis as "X_mde". -- basis="X_mde", +# Line 22: Specifies the color mapping using the "patient" column. -- color=["patient"], +# Line 23: Turns the frame off for the plot. -- frameon=False, +# Line 24: Sets the number of columns for subplots to 1. -- ncols=1, +# Line 27: Creates a MetaTiME object from the AnnData object using table mode. -- TiME_object=ov.single.MetaTiME(adata,mode='table') +# Line 29: Performs overclustering on the MetaTiME object with resolution 8 and stores the results in 'overcluster'. -- TiME_object.overcluster(resolution=8,clustercol = 'overcluster',) +# Line 31: Predicts the MetaTiME categories for cells and saves the predictions into the 'MetaTiME' column of the AnnData object. -- TiME_object.predictTiME(save_obs_name='MetaTiME') +# Line 33: Generates an embedding plot colored by "MetaTiME" and stores the figure and axes in variables 'fig' and 'ax'. -- fig,ax=TiME_object.plot(cluster_key='MetaTiME',basis='X_mde',dpi=80) +# Line 37: Generates and displays an embedding plot of the 'X_mde' using the "Major_MetaTiME" column for coloring.
-- sc.pl.embedding( +# Line 38: Passes the AnnData object to plot. -- adata, +# Line 39: Specifies the embedding basis as "X_mde". -- basis="X_mde", +# Line 40: Specifies the color mapping using the "Major_MetaTiME" column. -- color=["Major_MetaTiME"], +# Line 41: Turns the frame off for the plot. -- frameon=False, +# Line 42: Sets the number of columns for subplots to 1. -- ncols=1, +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py new file mode 100644 index 00000000..adce6d70 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py @@ -0,0 +1,41 @@ +```python +# Line 1: Imports the omicverse library as ov. -- import omicverse as ov +# Line 2: Reads the RNA data from 'data/sample/rna_p_n_raw.h5ad'. -- rna=ov.utils.read('data/sample/rna_p_n_raw.h5ad') +# Line 3: Reads the ATAC data from 'data/sample/atac_p_n_raw.h5ad'. -- atac=ov.utils.read('data/sample/atac_p_n_raw.h5ad') +# Line 5: Displays the RNA and ATAC objects. -- rna,atac +# Line 7: Creates a pyMOFA object from the RNA and ATAC data. -- test_mofa=ov.single.pyMOFA(omics=[rna,atac], +# Line 8: Names the two omics layers 'RNA' and 'ATAC'. -- omics_name=['RNA','ATAC']) +# Line 10: Preprocesses the data for the MOFA model. -- test_mofa.mofa_preprocess() +# Line 11: Runs the MOFA model and saves the result to 'models/brac_rna_atac.hdf5'. -- test_mofa.mofa_run(outfile='models/brac_rna_atac.hdf5') +# Line 13: Imports the omicverse library as ov again. -- import omicverse as ov +# Line 14: Sets the plotting parameters for omicverse. -- ov.utils.ov_plot_set() +# Line 16: Reads the RNA data from 'data/sample/rna_test.h5ad'. -- rna=ov.utils.read('data/sample/rna_test.h5ad') +# Line 18: Extracts the factors from the MOFA model file 'data/sample/MOFA_POS.hdf5' for the RNA data and assigns the result back to rna. -- rna=ov.single.factor_exact(rna,hdf5_path='data/sample/MOFA_POS.hdf5') +# Line 19: Displays the RNA object. -- rna +# Line 21: Computes the correlation between factors 1 to 5 and the 'cell_type' annotation. -- ov.single.factor_correlation(adata=rna,cluster='cell_type',factor_list=[1,2,3,4,5]) +# Line 23:
Retrieves the RNA-view weights for factor 1 from the MOFA model file. -- ov.single.get_weights(hdf5_path='data/sample/MOFA_POS.hdf5',view='RNA',factor=1) +# Line 25: Creates a pyMOFAART object from the MOFA model file 'data/sample/MOFA_POS.hdf5'. -- pymofa_obj=ov.single.pyMOFAART(model_path='data/sample/MOFA_POS.hdf5') +# Line 27: Gets the factor values for the RNA data. -- pymofa_obj.get_factors(rna) +# Line 28: Displays the RNA object. -- rna +# Line 30: Plots the R-squared values of the MOFA model. -- pymofa_obj.plot_r2() +# Line 32: Gets the R-squared values of the MOFA model. -- pymofa_obj.get_r2() +# Line 34: Plots the correlation between factors and the 'cell_type' annotation. -- pymofa_obj.plot_cor(rna,'cell_type') +# Line 36: Plots two factors for the 'Epi' group of 'cell_type' with a figure size of (3,3). -- pymofa_obj.plot_factor(rna,'cell_type','Epi',figsize=(3,3), +# Line 37: Selects factors 6 and 10 for the factor plot. -- factor1=6,factor2=10,) +# Line 39: Imports the scanpy library as sc. -- import scanpy as sc +# Line 40: Computes the neighborhood graph of the RNA data. -- sc.pp.neighbors(rna) +# Line 41: Computes the UMAP embedding of the RNA data. -- sc.tl.umap(rna) +# Line 42: Starts an embedding plot using scanpy. -- sc.pl.embedding( +# Line 43: Passes the RNA AnnData object to plot. -- rna, +# Line 44: Specifies the embedding basis as "X_umap". -- basis="X_umap", +# Line 45: Colors the plot by "factor6" and "cell_type". -- color=["factor6","cell_type"], +# Line 46: Turns the frame off for the plot. -- frameon=False, +# Line 47: Sets the number of columns for subplots to 2. -- ncols=2, +# Line 49: Disables immediate display of the plot. -- show=False, +# Line 50: Sets the colormap to 'Greens'. -- cmap='Greens', +# Line 51: Sets the minimum of the color scale to 0. -- vmin=0, +# Line 54: Plots the gene weights of factors 6 and 10 for the RNA view. -- pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=6,factor2=10,) +# Line 56: Plots the weights of factor 6 for the RNA view using the color '#5de25d'. -- pymofa_obj.plot_weights(view='RNA',factor=6,color='#5de25d', +# Line 57: Sorts the plotted weights in ascending order. -- ascending=True) +# Line 59: Plots a heatmap of the top features for the RNA view. -- pymofa_obj.plot_top_feature_heatmap(view='RNA') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py new file mode 100644
index 00000000..43fc2f73 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py @@ -0,0 +1,50 @@ +```python +# Line 1: Import the omicverse library. -- import omicverse as ov +# Line 2: Set the plotting parameters for omicverse. -- ov.utils.ov_plot_set() +# Line 4: Read RNA data from an h5ad file. -- rna=ov.utils.read("chen_rna-emb.h5ad") +# Line 5: Read ATAC data from an h5ad file. -- atac=ov.utils.read("chen_atac-emb.h5ad") +# Line 7: Create a GLUE_pair object using the RNA and ATAC data. -- pair_obj=ov.single.GLUE_pair(rna,atac) +# Line 8: Calculate the correlation between the RNA and ATAC data within the pair_obj object. -- pair_obj.correlation() +# Line 10: Find neighboring cells based on the GLUE pair with a specified depth and store the results in res_pair. -- res_pair=pair_obj.find_neighbor_cell(depth=20) +# Line 11: Save the res_pair results to a CSV file. -- res_pair.to_csv('models/chen_pair_res.csv') +# Line 13: Select the RNA data corresponding to the first omic from res_pair. -- rna1=rna[res_pair['omic_1']] +# Line 14: Select the ATAC data corresponding to the second omic from res_pair. -- atac1=atac[res_pair['omic_2']] +# Line 15: Set the index of the RNA data to the index from res_pair. -- rna1.obs.index=res_pair.index +# Line 16: Set the index of the ATAC data to the index from res_pair. -- atac1.obs.index=res_pair.index +# Line 17: Return the modified RNA and ATAC data. -- rna1,atac1 +# Line 19: Import the MuData class from the mudata library. -- from mudata import MuData +# Line 21: Create a MuData object from the RNA and ATAC data. -- mdata = MuData({'rna': rna1, 'atac': atac1}) +# Line 22: Return the MuData object. -- mdata +# Line 24: Write the MuData object to an h5mu file with gzip compression. -- mdata.write("chen_mu.h5mu",compression='gzip') +# Line 26: Extract the RNA data from the MuData object. -- rna1=mdata['rna'] +# Line 27: Filter the RNA data to keep only highly variable features. 
-- rna1=rna1[:,rna1.var['highly_variable']==True] +# Line 28: Extract the ATAC data from the MuData object. -- atac1=mdata['atac'] +# Line 29: Filter the ATAC data to keep only highly variable features. -- atac1=atac1[:,atac1.var['highly_variable']==True] +# Line 30: Set the index of the RNA data to the index from res_pair. -- rna1.obs.index=res_pair.index +# Line 31: Set the index of the ATAC data to the index from res_pair. -- atac1.obs.index=res_pair.index +# Line 33: Import the random module. -- import random +# Line 34: Randomly sample 5000 indices from the RNA data's observation indices. -- random_obs_index=random.sample(list(rna1.obs.index),5000) +# Line 36: Import the adjusted_rand_score function from sklearn.metrics. -- from sklearn.metrics import adjusted_rand_score as ari +# Line 37: Calculate the adjusted Rand index (ARI) between cell types in the subsampled RNA and ATAC data. -- ari_random=ari(rna1[random_obs_index].obs['cell_type'], atac1[random_obs_index].obs['cell_type']) +# Line 38: Calculate the ARI between cell types in the full RNA and ATAC data. -- ari_raw=ari(rna1.obs['cell_type'], atac1.obs['cell_type']) +# Line 39: Print the raw and random ARI scores. -- print('raw ari:{}, random ari:{}'.format(ari_raw,ari_random)) +# Line 42: Create a pyMOFA object with the RNA and ATAC data and their names. -- test_mofa=ov.single.pyMOFA(omics=[rna1,atac1], omics_name=['RNA','ATAC']) +# Line 44: Preprocess the data for the MOFA model. -- test_mofa.mofa_preprocess() +# Line 45: Run the MOFA model and save the results to a file. -- test_mofa.mofa_run(outfile='models/chen_rna_atac.hdf5') +# Line 47: Create a pyMOFAART object by loading a pre-trained MOFA model. -- pymofa_obj=ov.single.pyMOFAART(model_path='models/chen_rna_atac.hdf5') +# Line 49: Get the factor values for the RNA data. -- pymofa_obj.get_factors(rna1) +# Line 50: Return the modified RNA data. -- rna1 +# Line 52: Plot the R-squared values for each factor in the MOFA model. 
-- pymofa_obj.plot_r2() +# Line 54: Get the R-squared values of the MOFA model. -- pymofa_obj.get_r2() +# Line 56: Plot the correlation between factors and the "cell_type" variable. -- pymofa_obj.plot_cor(rna1,'cell_type',figsize=(4,6)) +# Line 58: Get the correlation values between factors and the "cell_type" variable. -- pymofa_obj.get_cor(rna1,'cell_type') +# Line 60: Plot the relationship between specified factors and "cell_type" for the "Ast" cell type. -- pymofa_obj.plot_factor(rna1,'cell_type','Ast',figsize=(3,3), factor1=1,factor2=3,) +# Line 62: Import the mde utility from scvi. -- from scvi.model.utils import mde +# Line 63: Import scanpy. -- import scanpy as sc +# Line 64: Compute the neighborhood graph on the 'X_glue' representation. -- sc.pp.neighbors(rna1, use_rep="X_glue", metric="cosine") +# Line 65: Compute the minimum-distortion embedding (MDE) of the 'X_glue' representation and store it in 'X_mde'. -- rna1.obsm["X_mde"] = mde(rna1.obsm["X_glue"]) +# Line 67: Plot embeddings colored by specific factors and cell types. -- sc.pl.embedding( rna1, basis="X_mde", color=["factor1","factor3","cell_type"], frameon=False, ncols=3, show=False, cmap='Greens', vmin=0,) +# Line 76: Plot the weights of genes for specific factors for the RNA view. -- pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=1,factor2=3,) +# Line 78: Plot the weights for the specified factor in RNA data. -- pymofa_obj.plot_weights(view='RNA',factor=1, ascending=False) +# Line 81: Plot a heatmap of the top features for the RNA view. -- pymofa_obj.plot_top_feature_heatmap(view='RNA') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py new file mode 100644 index 00000000..b2b0210f --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py @@ -0,0 +1,15 @@ +```python +# Line 1: Import the omicverse library as ov.
-- import omicverse as ov +# Line 2: Set plot settings for omicverse. -- ov.utils.ov_plot_set() +# Line 4: Create a list of gene names. -- gene_list=['FAA4','POX1','FAT1','FAS2','FAS1','FAA1','OLE1','YJU3','TGL3','INA1','TGL5'] +# Line 6: Create a dictionary mapping genes to types, assigning the first 5 to 'Type1' and the rest to 'Type2'. -- gene_type_dict=dict(zip(gene_list,['Type1']*5+['Type2']*6)) +# Line 7: Create a dictionary mapping genes to colors, assigning the first 5 to '#F7828A' and the rest to '#9CCCA4'. -- gene_color_dict=dict(zip(gene_list,['#F7828A']*5+['#9CCCA4']*6)) +# Line 9: Retrieve string interaction data for the given gene list from species 4932. -- G_res=ov.bulk.string_interaction(gene_list,4932) +# Line 10: Display the first few rows of the string interaction result. -- G_res.head() +# Line 12: Initialize a pyPPI object with the gene list, gene type dictionary, gene color dictionary, and species. -- ppi=ov.bulk.pyPPI(gene=gene_list, +# Line 13: Set the gene type dictionary. -- gene_type_dict=gene_type_dict, +# Line 14: Set the gene color dictionary. -- gene_color_dict=gene_color_dict, +# Line 15: Set the species as 4932. -- species=4932) +# Line 18: Perform interaction analysis on the pyPPI object. -- ppi.interaction_analysis() +# Line 20: Plot the network of the pyPPI object. 
-- ppi.plot_network() +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py new file mode 100644 index 00000000..d6d7e7fd --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py @@ -0,0 +1,29 @@ +```python +# Line 1: Imports the omicverse library as ov -- import omicverse as ov +# Line 2: Imports the anndata library -- import anndata +# Line 3: Imports the scanpy library as sc -- import scanpy as sc +# Line 4: Imports the matplotlib.pyplot library as plt -- import matplotlib.pyplot as plt +# Line 5: Imports the numpy library as np -- import numpy as np +# Line 6: Imports the pandas library as pd -- import pandas as pd +# Line 8: Sets the matplotlib backend to inline for notebook display -- %matplotlib inline +# Line 11: Sets the verbosity level of scanpy to 3 (hints) -- sc.settings.verbosity = 3 +# Line 12: Sets the figure parameters for scanpy plots, including DPI and background color -- sc.settings.set_figure_params(dpi=80, facecolor='white') +# Line 14: Imports LinearSegmentedColormap from matplotlib.colors -- from matplotlib.colors import LinearSegmentedColormap +# Line 15: Defines a list of colors named sc_color -- sc_color=['#7CBB5F','#368650','#A499CC','#5E4D9A','#78C2ED','#866017', '#9F987F','#E0DFED', '#EF7B77', '#279AD7','#F0EEF0', '#1F577B', '#A56BA7', '#E0A7C8', '#E069A6', '#941456', '#FCBC10', '#EAEFC5', '#01A0A7', '#75C8CC', '#F0D7BC', '#D5B26C', '#D5DA48', '#B6B812', '#9DC3C3', '#A89C92', '#FEE00C', '#FEF2A1'] +# Line 16: Creates a custom colormap from the sc_color list using LinearSegmentedColormap -- sc_color_cmap = LinearSegmentedColormap.from_list('Custom', sc_color, len(sc_color)) +# Line 18: Reads an AnnData object from an h5ad file -- adata = anndata.read('sample/rna.h5ad') +# Line 19: Displays the AnnData object -- adata +# Line 21: Performs lazy preprocessing of the AnnData object using omicverse's single module 
and scanpy -- adata=ov.single.scanpy_lazy(adata) +# Line 23: Initializes a scNOCD object using the preprocessed AnnData object -- scbrca=ov.single.scnocd(adata) +# Line 24: Performs matrix transformation using the scNOCD object -- scbrca.matrix_transform() +# Line 25: Performs matrix normalization using the scNOCD object -- scbrca.matrix_normalize() +# Line 26: Configures the GNN model using the scNOCD object -- scbrca.GNN_configure() +# Line 27: Preprocesses the data for the GNN using the scNOCD object -- scbrca.GNN_preprocess() +# Line 28: Runs the GNN model using the scNOCD object -- scbrca.GNN_model() +# Line 29: Gets the GNN result using the scNOCD object -- scbrca.GNN_result() +# Line 30: Generates the GNN plots using the scNOCD object -- scbrca.GNN_plot() +# Line 32: Calculates the nocd scores using the scNOCD object -- scbrca.cal_nocd() +# Line 34: Calculates the nocd scores using the scNOCD object -- scbrca.calculate_nocd() +# Line 36: Generates a UMAP plot colored by 'leiden' and 'nocd', setting spacing and palette -- sc.pl.umap(scbrca.adata, color=['leiden','nocd'],wspace=0.4,palette=sc_color) +# Line 38: Generates a UMAP plot colored by 'leiden' and 'nocd_n', setting spacing and palette -- sc.pl.umap(scbrca.adata, color=['leiden','nocd_n'],wspace=0.4,palette=sc_color) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_preprocess_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_preprocess_annotated.py new file mode 100644 index 00000000..831ed436 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_preprocess_annotated.py @@ -0,0 +1,132 @@ +``` +# Line 1: # Line 1: Import the omicverse library as ov -- import omicverse as ov +# Line 2: # Line 2: Import the scanpy library as sc -- import scanpy as sc +# Line 3: # Line 3: Set the plotting style for omicverse. -- ov.ov_plot_set() +# Line 8: # Line 8: Read 10x matrix data into an AnnData object. 
-- adata = sc.read_10x_mtx( +# Line 9: # Line 9: Specify the directory containing the .mtx file. -- 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file +# Line 10: # Line 10: Use gene symbols for variable names. -- var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) +# Line 11: # Line 11: Enable caching for faster reading. -- cache=True) # write a cache file for faster subsequent reading +# Line 12: # Line 12: Display the AnnData object. -- adata +# Line 14: # Line 14: Make variable names unique. -- adata.var_names_make_unique() +# Line 15: # Line 15: Make observation names unique. -- adata.obs_names_make_unique() +# Line 17: # Line 17: Perform quality control on the AnnData object using specified thresholds. -- adata=ov.pp.qc(adata, +# Line 18: # Line 18: Set threshold parameters for mito_perc, nUMIs, and detected_genes. -- tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250}) +# Line 19: # Line 19: Display the AnnData object after QC. -- adata +# Line 21: # Line 21: Store counts layer in the AnnData object. -- ov.utils.store_layers(adata,layers='counts') +# Line 22: # Line 22: Display the AnnData object after storing the layer. -- adata +# Line 24: # Line 24: Preprocess the AnnData object using shiftlog and pearson mode, selecting 2000 HVGs. -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 25: # Line 25: Display the AnnData object after preprocessing. -- adata +# Line 27: # Line 27: Save a snapshot of the preprocessed AnnData object (all genes, before HVG filtering) in the `raw` attribute. -- adata.raw = adata +# Line 28: # Line 28: Filter AnnData object to keep only highly variable features. -- adata = adata[:, adata.var.highly_variable_features] +# Line 29: # Line 29: Display the filtered AnnData object. -- adata +# Line 31: # Line 31: Create a copy of the AnnData object named adata_counts. -- adata_counts=adata.copy() +# Line 32: # Line 32: Retrieve counts layer from adata_counts.
-- ov.utils.retrieve_layers(adata_counts,layers='counts') +# Line 33: # Line 33: Print the maximum value of normalized adata.X. -- print('normalize adata:',adata.X.max()) +# Line 34: # Line 34: Print the maximum value of raw count adata_counts.X. -- print('raw count adata:',adata_counts.X.max()) +# Line 36: # Line 36: Display the adata_counts object. -- adata_counts +# Line 38: # Line 38: Create a copy of the raw data as an AnnData object into adata_counts. -- adata_counts=adata.raw.to_adata().copy() +# Line 39: # Line 39: Retrieve the counts layer in adata_counts -- ov.utils.retrieve_layers(adata_counts,layers='counts') +# Line 40: # Line 40: Print the maximum value of normalized adata.X. -- print('normalize adata:',adata.X.max()) +# Line 41: # Line 41: Print the maximum value of raw count adata_counts.X. -- print('raw count adata:',adata_counts.X.max()) +# Line 42: # Line 42: Display the adata_counts object. -- adata_counts +# Line 44: # Line 44: Scale the AnnData object. -- ov.pp.scale(adata) +# Line 45: # Line 45: Display the AnnData object after scaling. -- adata +# Line 47: # Line 47: Perform PCA on scaled layer, using 50 principal components. -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 48: # Line 48: Display the AnnData object after PCA. -- adata +# Line 50: # Line 50: Assign the scaled pca to the X_pca embedding -- adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'] +# Line 51: # Line 51: Generate an embedding plot based on X_pca with CST3 coloring. -- ov.utils.embedding(adata, +# Line 52: # Line 52: Set the basis to X_pca for embedding. -- basis='X_pca', +# Line 53: # Line 53: Set color to CST3 gene. -- color='CST3', +# Line 54: # Line 54: Set the frame style of the plot to small. -- frameon='small') +# Line 56: # Line 56: Compute neighborhood graph, using scaled PCA representation. -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 57: # Line 57: Use the scaled PCA representation as the input for the neighborhood graph. 
-- use_rep='scaled|original|X_pca') +# Line 59: # Line 59: Compute a minimum-distortion embedding (MDE) from the scaled PCA representation and store it in X_mde. -- adata.obsm["X_mde"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) +# Line 60: # Line 60: Display the AnnData object with X_mde calculated. -- adata +# Line 62: # Line 62: Generate an embedding plot based on X_mde, with CST3 coloring. -- ov.utils.embedding(adata, +# Line 63: # Line 63: Set the basis to X_mde for the embedding plot. -- basis='X_mde', +# Line 64: # Line 64: Set color to CST3 gene. -- color='CST3', +# Line 65: # Line 65: Set the frame style to small for the plot. -- frameon='small') +# Line 67: # Line 67: Run UMAP dimensionality reduction. -- sc.tl.umap(adata) +# Line 69: # Line 69: Generate an embedding plot based on X_umap, with CST3 coloring. -- ov.utils.embedding(adata, +# Line 70: # Line 70: Set the basis to X_umap. -- basis='X_umap', +# Line 71: # Line 71: Set the color to CST3. -- color='CST3', +# Line 72: # Line 72: Set the frame style to small for the plot. -- frameon='small') +# Line 74: # Line 74: Run Leiden clustering. -- sc.tl.leiden(adata) +# Line 76: # Line 76: Generate an embedding plot based on X_mde, with Leiden, CST3, and NKG7 coloring. -- ov.utils.embedding(adata, +# Line 77: # Line 77: Set basis to X_mde. -- basis='X_mde', +# Line 78: # Line 78: Color by Leiden cluster, CST3 gene expression and NKG7 gene expression -- color=['leiden', 'CST3', 'NKG7'], +# Line 79: # Line 79: Set the frame style of the plot to small. -- frameon='small') +# Line 81: # Line 81: Import the matplotlib plotting library -- import matplotlib.pyplot as plt +# Line 82: # Line 82: Create a Matplotlib figure and an axes object. -- fig,ax=plt.subplots( figsize = (4,4)) +# Line 84: # Line 84: Generate an embedding plot based on X_mde with Leiden coloring. -- ov.utils.embedding(adata, +# Line 85: # Line 85: Set the basis to X_mde. -- basis='X_mde', +# Line 86: # Line 86: Color the embedding by Leiden clusters.
-- color=['leiden'], +# Line 87: # Line 87: Do not show the plot. -- show=False, +# Line 88: # Line 88: Set the axis of the embedding plot. -- ax=ax) +# Line 90: # Line 90: Generate a convex hull plot on top of the X_mde embedding based on leiden clusters. -- ov.utils.plot_ConvexHull(adata, +# Line 91: # Line 91: Set the basis for the convex hull plot to X_mde. -- basis='X_mde', +# Line 92: # Line 92: Set the cluster key to Leiden. -- cluster_key='leiden', +# Line 93: # Line 93: Generate a hull for cluster '0'. -- hull_cluster='0', +# Line 94: # Line 94: Set the axis of the convex hull plot. -- ax=ax) +# Line 97: # Line 97: Import patheffects module from matplotlib. -- from matplotlib import patheffects +# Line 98: # Line 98: Import the matplotlib plotting library. -- import matplotlib.pyplot as plt +# Line 99: # Line 99: Create a matplotlib figure and axes. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 101: # Line 101: Generate an embedding plot based on X_mde, with Leiden coloring. -- ov.utils.embedding(adata, +# Line 102: # Line 102: Set the embedding basis to X_mde. -- basis='X_mde', +# Line 103: # Line 103: Color the points according to leiden cluster. -- color=['leiden'], +# Line 104: # Line 104: Do not show the plot, hide the legend, and do not add an outline. -- show=False, legend_loc=None, add_outline=False, +# Line 105: # Line 105: Set the frame to small, set the legend font outline and set the axis of the plot. -- frameon='small',legend_fontoutline=2,ax=ax +# Line 106: # Line 106: close embedding function -- ) +# Line 108: # Line 108: Generate labels for the given clusters. -- ov.utils.gen_mpl_labels( +# Line 109: # Line 109: Pass the AnnData object. -- adata, +# Line 110: # Line 110: Use 'leiden' as the cluster key. -- 'leiden', +# Line 111: # Line 111: Exclude the "None" cluster. -- exclude=("None",), +# Line 112: # Line 112: Set the embedding basis to X_mde.
-- basis='X_mde', +# Line 113: # Line 113: Set the axis of the generated label plot -- ax=ax, +# Line 114: # Line 114: Set the label arrow props. -- adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')), +# Line 115: # Line 115: Set the text properties for the generated labels. -- text_kwargs=dict(fontsize= 12 ,weight='bold', +# Line 116: # Line 116: Set the path effect of the label. -- path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ), +# Line 118: # Line 118: Define a list of marker genes. -- marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14', +# Line 119: # Line 119: Define a list of marker genes. -- 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1', +# Line 120: # Line 120: Define a list of marker genes. -- 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP'] +# Line 122: # Line 122: Generate a dotplot of marker genes by leiden cluster. -- sc.pl.dotplot(adata, marker_genes, groupby='leiden', +# Line 123: # Line 123: Standard scale the dotplot by variable. -- standard_scale='var'); +# Line 125: # Line 125: Compute a dendrogram of leiden clusters. -- sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca') +# Line 126: # Line 126: Rank genes for each leiden cluster using t-test on the scaled PCA embeddings. -- sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', +# Line 127: # Line 127: Set the method, use_raw, and key_added for the ranked gene t-test analysis -- method='t-test',use_raw=False,key_added='leiden_ttest') +# Line 128: # Line 128: Generate a dotplot of top ranked genes by leiden cluster from t-test. -- sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', +# Line 129: # Line 129: Set color, the key for results, the standard_scale and number of genes to display for the dot plot. -- cmap='Spectral_r',key='leiden_ttest', +# Line 130: # Line 130: Set the standard scale for the dotplot by variable, display 3 genes. 
-- standard_scale='var',n_genes=3) +# Line 132: # Line 132: Rank genes for each leiden cluster using t-test on the scaled PCA embeddings. -- sc.tl.rank_genes_groups(adata, groupby='leiden', +# Line 133: # Line 133: Set the method and use_rep for the ranked genes t-test analysis. -- method='t-test',use_rep='scaled|original|X_pca',) +# Line 134: # Line 134: Identify marker genes for each leiden cluster with COSG (cosine similarity-based marker gene identification) and store them under 'leiden_cosg'. -- ov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden') +# Line 135: # Line 135: Generate a dotplot of top ranked genes from cosg by leiden cluster. -- sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', +# Line 136: # Line 136: Set the color map and select the COSG results key for the dot plot. -- cmap='Spectral_r',key='leiden_cosg', +# Line 137: # Line 137: Set the standard scaling by variable and number of genes to display for the dot plot. -- standard_scale='var',n_genes=3) +# Line 139: # Line 139: Create an empty dictionary to store rank genes group data. -- data_dict={} +# Line 140: # Line 140: Iterate over each leiden category. -- for i in adata.obs['leiden'].cat.categories: +# Line 141: # Line 141: Retrieve ranked genes for each cluster based on t-test pvalues and store it to the dictionary. -- data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest', +# Line 142: # Line 142: Disable the p-value and log2 fold-change cutoffs so that all genes are returned. -- pval_cutoff=None,log2fc_min=None) +# Line 144: # Line 144: Print the keys of the data dictionary. -- data_dict.keys() +# Line 146: # Line 146: Display the head of the data dictionary for the last category. -- data_dict[i].head() +# Line 148: # Line 148: Create a color dictionary using leiden categories and colors. -- type_color_dict=dict(zip(adata.obs['leiden'].cat.categories, +# Line 149: # Line 149: Use leiden color categories for the type_color_dict.
-- adata.uns['leiden_colors'])) +# Line 150: # Line 150: Print type_color_dict -- type_color_dict +# Line 152: # Line 152: Create a stacked volcano plot based on the ranked gene results. -- fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict, +# Line 153: # Line 153: Set the p-value threshold for significance. -- pval_threshold=0.01, +# Line 154: # Line 154: Set the log2 fold change threshold for significance. -- log2fc_threshold=2, +# Line 155: # Line 155: Set the figure size. -- figsize=(8,4), +# Line 156: # Line 156: Set the color for significant genes. -- sig_color='#a51616', +# Line 157: # Line 157: Set the color for non-significant genes. -- normal_color='#c7c7c7', +# Line 158: # Line 158: Set the number of genes to plot. -- plot_genes_num=2, +# Line 159: # Line 159: Set the fontsize of the plot genes. -- plot_genes_fontsize=6, +# Line 160: # Line 160: Set the font weight for the plotted genes. -- plot_genes_weight='bold', +# Line 161: # Line 161: close stacking vol function -- ) +# Line 163: # Line 163: Set initial y min and y max values for stacking plots. -- y_min,y_max=0,0 +# Line 164: # Line 164: Iterate over each cluster in the data dict. -- for i in data_dict.keys(): +# Line 165: # Line 165: Update y min by taking the min of current ymin and logfoldchanges minimum value. -- y_min=min(y_min,data_dict[i]['logfoldchanges'].min()) +# Line 166: # Line 166: Update y max by taking the max of current ymax and logfoldchanges maximum value. -- y_max=max(y_max,data_dict[i]['logfoldchanges'].max()) +# Line 167: # Line 167: Iterate over each leiden category. -- for i in adata.obs['leiden'].cat.categories: +# Line 168: # Line 168: Set the y axis limits for each subplot using calculated y min and max -- axes[i].set_ylim(y_min,y_max) +# Line 169: # Line 169: Set the suptitle for the whole plot figure. 
-- plt.suptitle('Stacking_vol',fontsize=12) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_preprocess_cpu_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_preprocess_cpu_annotated.py new file mode 100644 index 00000000..107ca176 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_preprocess_cpu_annotated.py @@ -0,0 +1,136 @@ +``` +# Line 1: import scanpy as sc -- import scanpy as sc +# Line 2: import omicverse as ov -- import omicverse as ov +# Line 3: ov.plot_set() -- ov.plot_set() +# Line 5: !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz -- !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz +# Line 6: !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz -- !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz +# Line 8: adata = sc.read_10x_mtx( -- adata = sc.read_10x_mtx( +# Line 9: 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file -- 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file +# Line 10: var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) -- var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) +# Line 11: cache=True) # write a cache file for faster subsequent reading -- cache=True) # write a cache file for faster subsequent reading +# Line 12: adata -- adata +# Line 14: adata.var_names_make_unique() -- adata.var_names_make_unique() +# Line 15: adata.obs_names_make_unique() -- adata.obs_names_make_unique() +# Line 17: %%time -- %%time +# Line 18: adata=ov.pp.qc(adata, -- adata=ov.pp.qc(adata, +# Line 19: tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}, -- tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}, +# Line 20: 
doublets_method='sccomposite', -- doublets_method='sccomposite', +# Line 21: batch_key=None) -- batch_key=None) +# Line 22: adata -- adata +# Line 24: %%time -- %%time +# Line 25: adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 26: adata -- adata +# Line 28: %%time -- %%time +# Line 29: adata.raw = adata -- adata.raw = adata +# Line 30: adata = adata[:, adata.var.highly_variable_features] -- adata = adata[:, adata.var.highly_variable_features] +# Line 31: adata -- adata +# Line 33: %%time -- %%time +# Line 34: ov.pp.scale(adata) -- ov.pp.scale(adata) +# Line 35: adata -- adata +# Line 37: %%time -- %%time +# Line 38: ov.pp.pca(adata,layer='scaled',n_pcs=50) -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 39: adata -- adata +# Line 41: adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'] -- adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'] +# Line 42: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 43: basis='X_pca', -- basis='X_pca', +# Line 44: color='CST3', -- color='CST3', +# Line 45: frameon='small') -- frameon='small') +# Line 47: %%time -- %%time +# Line 48: ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50, -- ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 49: use_rep='scaled|original|X_pca') -- use_rep='scaled|original|X_pca') +# Line 51: %%time -- %%time +# Line 52: ov.pp.umap(adata) -- ov.pp.umap(adata) +# Line 54: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 55: basis='X_umap', -- basis='X_umap', +# Line 56: color='CST3', -- color='CST3', +# Line 57: frameon='small') -- frameon='small') +# Line 59: ov.pp.mde(adata,embedding_dim=2,n_neighbors=15, basis='X_mde', -- ov.pp.mde(adata,embedding_dim=2,n_neighbors=15, basis='X_mde', +# Line 60: n_pcs=50, use_rep='scaled|original|X_pca',) -- n_pcs=50, use_rep='scaled|original|X_pca',) +# Line 62: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 63: basis='X_mde', -- 
basis='X_mde', +# Line 64: color='CST3', -- color='CST3', +# Line 65: frameon='small') -- frameon='small') +# Line 67: adata_raw=adata.raw.to_adata() -- adata_raw=adata.raw.to_adata() +# Line 68: ov.pp.score_genes_cell_cycle(adata_raw,species='human') -- ov.pp.score_genes_cell_cycle(adata_raw,species='human') +# Line 70: ov.pl.embedding(adata_raw, -- ov.pl.embedding(adata_raw, +# Line 71: basis='X_mde', -- basis='X_mde', +# Line 72: color='phase', -- color='phase', +# Line 73: frameon='small') -- frameon='small') +# Line 75: ov.pp.leiden(adata,resolution=1) -- ov.pp.leiden(adata,resolution=1) +# Line 77: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 78: basis='X_mde', -- basis='X_mde', +# Line 79: color=['leiden', 'CST3', 'NKG7'], -- color=['leiden', 'CST3', 'NKG7'], +# Line 80: frameon='small') -- frameon='small') +# Line 82: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 83: fig,ax=plt.subplots( figsize = (4,4)) -- fig,ax=plt.subplots( figsize = (4,4)) +# Line 85: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 86: basis='X_mde', -- basis='X_mde', +# Line 87: color=['leiden'], -- color=['leiden'], +# Line 88: frameon='small', -- frameon='small', +# Line 89: show=False, -- show=False, +# Line 90: ax=ax) -- ax=ax) +# Line 92: ov.pl.ConvexHull(adata, -- ov.pl.ConvexHull(adata, +# Line 93: basis='X_mde', -- basis='X_mde', +# Line 94: cluster_key='leiden', -- cluster_key='leiden', +# Line 95: hull_cluster='0', -- hull_cluster='0', +# Line 96: ax=ax) -- ax=ax) +# Line 99: from matplotlib import patheffects -- from matplotlib import patheffects +# Line 100: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 101: fig, ax = plt.subplots(figsize=(4,4)) -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 103: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 104: basis='X_mde', -- basis='X_mde', +# Line 105: color=['leiden'], -- color=['leiden'], +# Line 106: show=False, legend_loc=None, 
add_outline=False, -- show=False, legend_loc=None, add_outline=False, +# Line 107: frameon='small',legend_fontoutline=2,ax=ax -- frameon='small',legend_fontoutline=2,ax=ax +# Line 110: ov.utils.gen_mpl_labels( -- ov.utils.gen_mpl_labels( +# Line 111: adata, -- adata, +# Line 112: 'leiden', -- 'leiden', +# Line 113: exclude=("None",), -- exclude=("None",), +# Line 114: basis='X_mde', -- basis='X_mde', +# Line 115: ax=ax, -- ax=ax, +# Line 116: adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')), -- adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')), +# Line 117: text_kwargs=dict(fontsize= 12 ,weight='bold', -- text_kwargs=dict(fontsize= 12 ,weight='bold', +# Line 118: path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ), -- path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ), +# Line 121: marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14', -- marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14', +# Line 122: 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1', -- 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1', +# Line 123: 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP'] -- 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP'] +# Line 125: sc.pl.dotplot(adata, marker_genes, groupby='leiden', -- sc.pl.dotplot(adata, marker_genes, groupby='leiden', +# Line 126: standard_scale='var'); -- standard_scale='var'); +# Line 128: sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca') -- sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca') +# Line 129: sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', -- sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', +# Line 130: method='t-test',use_raw=False,key_added='leiden_ttest') -- method='t-test',use_raw=False,key_added='leiden_ttest') +# Line 131: sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', -- sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', +# Line 132: 
cmap='Spectral_r',key='leiden_ttest', -- cmap='Spectral_r',key='leiden_ttest', +# Line 133: standard_scale='var',n_genes=3) -- standard_scale='var',n_genes=3) +# Line 135: sc.tl.rank_genes_groups(adata, groupby='leiden', -- sc.tl.rank_genes_groups(adata, groupby='leiden', +# Line 136: method='t-test',use_rep='scaled|original|X_pca',) -- method='t-test',use_rep='scaled|original|X_pca',) +# Line 137: ov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden') -- ov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden') +# Line 138: sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', -- sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', +# Line 139: cmap='Spectral_r',key='leiden_cosg', -- cmap='Spectral_r',key='leiden_cosg', +# Line 140: standard_scale='var',n_genes=3) -- standard_scale='var',n_genes=3) +# Line 142: data_dict={} -- data_dict={} +# Line 143: for i in adata.obs['leiden'].cat.categories: -- for i in adata.obs['leiden'].cat.categories: +# Line 144: data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest', -- data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest', +# Line 145: pval_cutoff=None,log2fc_min=None) -- pval_cutoff=None,log2fc_min=None) +# Line 147: data_dict.keys() -- data_dict.keys() +# Line 149: data_dict[i].head() -- data_dict[i].head() +# Line 151: type_color_dict=dict(zip(adata.obs['leiden'].cat.categories, -- type_color_dict=dict(zip(adata.obs['leiden'].cat.categories, +# Line 152: adata.uns['leiden_colors'])) -- adata.uns['leiden_colors'])) +# Line 153: type_color_dict -- type_color_dict +# Line 155: fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict, -- fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict, +# Line 156: pval_threshold=0.01, -- pval_threshold=0.01, +# Line 157: log2fc_threshold=2, -- log2fc_threshold=2, +# Line 158: figsize=(8,4), -- figsize=(8,4), +# Line 159: sig_color='#a51616', -- sig_color='#a51616', +# Line 160: normal_color='#c7c7c7', -- 
normal_color='#c7c7c7', +# Line 161: plot_genes_num=2, -- plot_genes_num=2, +# Line 162: plot_genes_fontsize=6, -- plot_genes_fontsize=6, +# Line 163: plot_genes_weight='bold', -- plot_genes_weight='bold', +# Line 166: y_min,y_max=0,0 -- y_min,y_max=0,0 +# Line 167: for i in data_dict.keys(): -- for i in data_dict.keys(): +# Line 168: y_min=min(y_min,data_dict[i]['logfoldchanges'].min()) -- y_min=min(y_min,data_dict[i]['logfoldchanges'].min()) +# Line 169: y_max=max(y_max,data_dict[i]['logfoldchanges'].max()) -- y_max=max(y_max,data_dict[i]['logfoldchanges'].max()) +# Line 170: for i in adata.obs['leiden'].cat.categories: -- for i in adata.obs['leiden'].cat.categories: +# Line 171: axes[i].set_ylim(y_min,y_max) -- axes[i].set_ylim(y_min,y_max) +# Line 172: plt.suptitle('Stacking_vol',fontsize=12) -- plt.suptitle('Stacking_vol',fontsize=12) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py new file mode 100644 index 00000000..fe6e2b61 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py @@ -0,0 +1,122 @@ +```python +# Line 1: import omicverse as ov -- import omicverse as ov +# Line 2: import scanpy as sc -- import scanpy as sc +# Line 3: ov.plot_set() -- ov.plot_set() +# Line 4: ov.settings.gpu_init() -- ov.settings.gpu_init() +# Line 9: adata = sc.read_10x_mtx( -- adata = sc.read_10x_mtx( +# Line 10: 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file -- 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file +# Line 11: var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) -- var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index) +# Line 12: cache=True) # write a cache file for faster subsequent reading -- cache=True) # write a cache file for faster subsequent reading +# Line 13: 
adata -- adata +# Line 15: adata.var_names_make_unique() -- adata.var_names_make_unique() +# Line 16: adata.obs_names_make_unique() -- adata.obs_names_make_unique() +# Line 18: ov.pp.anndata_to_GPU(adata) -- ov.pp.anndata_to_GPU(adata) +# Line 20: adata=ov.pp.qc(adata, -- adata=ov.pp.qc(adata, +# Line 21: tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}, -- tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}, +# Line 22: batch_key=None) -- batch_key=None) +# Line 23: adata -- adata +# Line 25: adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 26: adata -- adata +# Line 28: adata.raw = adata -- adata.raw = adata +# Line 29: adata = adata[:, adata.var.highly_variable_features] -- adata = adata[:, adata.var.highly_variable_features] +# Line 30: adata -- adata +# Line 32: ov.pp.scale(adata) -- ov.pp.scale(adata) +# Line 33: adata -- adata +# Line 35: ov.pp.pca(adata,layer='scaled',n_pcs=50) -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 36: adata -- adata +# Line 38: adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'] -- adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'] +# Line 39: ov.utils.embedding(adata, -- ov.utils.embedding(adata, +# Line 40: basis='X_pca', -- basis='X_pca', +# Line 41: color='CST3', -- color='CST3', +# Line 42: frameon='small') -- frameon='small') +# Line 44: ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50, -- ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50, +# Line 45: use_rep='scaled|original|X_pca',method='cagra') -- use_rep='scaled|original|X_pca',method='cagra') +# Line 47: adata.obsm["X_mde"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) -- adata.obsm["X_mde"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) +# Line 48: adata -- adata +# Line 50: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 51: basis='X_mde', -- basis='X_mde', +# Line 52: color='CST3', -- color='CST3', +# Line 53: 
frameon='small') -- frameon='small') +# Line 55: ov.pp.umap(adata) -- ov.pp.umap(adata) +# Line 57: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 58: basis='X_umap', -- basis='X_umap', +# Line 59: color='CST3', -- color='CST3', +# Line 60: frameon='small') -- frameon='small') +# Line 62: ov.pp.leiden(adata) -- ov.pp.leiden(adata) +# Line 64: ov.pp.anndata_to_CPU(adata) -- ov.pp.anndata_to_CPU(adata) +# Line 66: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 67: basis='X_mde', -- basis='X_mde', +# Line 68: color=['leiden', 'CST3', 'NKG7'], -- color=['leiden', 'CST3', 'NKG7'], +# Line 69: frameon='small') -- frameon='small') +# Line 71: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 72: fig,ax=plt.subplots( figsize = (4,4)) -- fig,ax=plt.subplots( figsize = (4,4)) +# Line 74: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 75: basis='X_mde', -- basis='X_mde', +# Line 76: color=['leiden'], -- color=['leiden'], +# Line 77: frameon='small', -- frameon='small', +# Line 78: show=False, -- show=False, +# Line 79: ax=ax) -- ax=ax) +# Line 81: ov.pl.ConvexHull(adata, -- ov.pl.ConvexHull(adata, +# Line 82: basis='X_mde', -- basis='X_mde', +# Line 83: cluster_key='leiden', -- cluster_key='leiden', +# Line 84: hull_cluster='0', -- hull_cluster='0', +# Line 85: ax=ax) -- ax=ax) +# Line 88: from matplotlib import patheffects -- from matplotlib import patheffects +# Line 89: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 90: fig, ax = plt.subplots(figsize=(4,4)) -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 92: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 93: basis='X_mde', -- basis='X_mde', +# Line 94: color=['leiden'], -- color=['leiden'], +# Line 95: show=False, legend_loc=None, add_outline=False, -- show=False, legend_loc=None, add_outline=False, +# Line 96: frameon='small',legend_fontoutline=2,ax=ax -- frameon='small',legend_fontoutline=2,ax=ax +# Line 99: 
ov.utils.gen_mpl_labels( -- ov.utils.gen_mpl_labels( +# Line 100: adata, -- adata, +# Line 101: 'leiden', -- 'leiden', +# Line 102: exclude=("None",), -- exclude=("None",), +# Line 103: basis='X_mde', -- basis='X_mde', +# Line 104: ax=ax, -- ax=ax, +# Line 105: adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')), -- adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')), +# Line 106: text_kwargs=dict(fontsize= 12 ,weight='bold', -- text_kwargs=dict(fontsize= 12 ,weight='bold', +# Line 107: path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ), -- path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ), +# Line 109: marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14', -- marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14', +# Line 110: 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1', -- 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1', +# Line 111: 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP'] -- 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP'] +# Line 113: sc.pl.dotplot(adata, marker_genes, groupby='leiden', -- sc.pl.dotplot(adata, marker_genes, groupby='leiden', +# Line 114: standard_scale='var'); -- standard_scale='var'); +# Line 116: sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca') -- sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca') +# Line 117: sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', -- sc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca', +# Line 118: method='t-test',use_raw=False,key_added='leiden_ttest') -- method='t-test',use_raw=False,key_added='leiden_ttest') +# Line 119: sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', -- sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', +# Line 120: cmap='Spectral_r',key='leiden_ttest', -- cmap='Spectral_r',key='leiden_ttest', +# Line 121: standard_scale='var',n_genes=3) -- standard_scale='var',n_genes=3) +# Line 123: 
sc.tl.rank_genes_groups(adata, groupby='leiden', -- sc.tl.rank_genes_groups(adata, groupby='leiden', +# Line 124: method='t-test',use_rep='scaled|original|X_pca',) -- method='t-test',use_rep='scaled|original|X_pca',) +# Line 125: ov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden') -- ov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden') +# Line 126: sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', -- sc.pl.rank_genes_groups_dotplot(adata,groupby='leiden', +# Line 127: cmap='Spectral_r',key='leiden_cosg', -- cmap='Spectral_r',key='leiden_cosg', +# Line 128: standard_scale='var',n_genes=3) -- standard_scale='var',n_genes=3) +# Line 130: data_dict={} -- data_dict={} +# Line 131: for i in adata.obs['leiden'].cat.categories: -- for i in adata.obs['leiden'].cat.categories: +# Line 132: data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest', -- data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest', +# Line 133: pval_cutoff=None,log2fc_min=None) -- pval_cutoff=None,log2fc_min=None) +# Line 135: data_dict.keys() -- data_dict.keys() +# Line 137: data_dict[i].head() -- data_dict[i].head() +# Line 139: type_color_dict=dict(zip(adata.obs['leiden'].cat.categories, -- type_color_dict=dict(zip(adata.obs['leiden'].cat.categories, +# Line 140: adata.uns['leiden_colors'])) -- adata.uns['leiden_colors'])) +# Line 141: type_color_dict -- type_color_dict +# Line 143: fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict, -- fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict, +# Line 144: pval_threshold=0.01, -- pval_threshold=0.01, +# Line 145: log2fc_threshold=2, -- log2fc_threshold=2, +# Line 146: figsize=(8,4), -- figsize=(8,4), +# Line 147: sig_color='#a51616', -- sig_color='#a51616', +# Line 148: normal_color='#c7c7c7', -- normal_color='#c7c7c7', +# Line 149: plot_genes_num=2, -- plot_genes_num=2, +# Line 150: plot_genes_fontsize=6, -- plot_genes_fontsize=6, +# Line 151: plot_genes_weight='bold', -- 
plot_genes_weight='bold', +# Line 155: for i in data_dict.keys(): -- for i in data_dict.keys(): +# Line 156: y_min=min(y_min,data_dict[i]['logfoldchanges'].min()) -- y_min=min(y_min,data_dict[i]['logfoldchanges'].min()) +# Line 157: y_max=max(y_max,data_dict[i]['logfoldchanges'].max()) -- y_max=max(y_max,data_dict[i]['logfoldchanges'].max()) +# Line 158: for i in adata.obs['leiden'].cat.categories: -- for i in adata.obs['leiden'].cat.categories: +# Line 159: axes[i].set_ylim(y_min,y_max) -- axes[i].set_ylim(y_min,y_max) +# Line 160: plt.suptitle('Stacking_vol',fontsize=12) -- plt.suptitle('Stacking_vol',fontsize=12) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_scdeg_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_scdeg_annotated.py new file mode 100644 index 00000000..6388f103 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_scdeg_annotated.py @@ -0,0 +1,92 @@ +``` +# Line 1: Imports the omicverse library, aliased as ov. -- import omicverse as ov +# Line 2: Imports the scanpy library, aliased as sc. -- import scanpy as sc +# Line 3: Imports the scvelo library, aliased as scv. -- import scvelo as scv +# Line 5: Sets the plotting style for omicverse. -- ov.utils.ov_plot_set() +# Line 7: Loads the pancreas dataset from scvelo and stores it in adata. -- adata = scv.datasets.pancreas() +# Line 8: Displays the adata object. -- adata +# Line 10: Finds the maximum value in the adata.X matrix. -- adata.X.max() +# Line 12: Performs quality control on the adata object using the specified thresholds. -- adata=ov.pp.qc(adata, +# Line 13: tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250}) +# Line 14: Preprocesses the adata object using shiftlog normalization and Pearson residuals, selecting 2000 high variable genes. -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 17: Saves a copy of the original adata object to adata.raw. 
-- adata.raw = adata +# Line 18: Filters the adata object to keep only highly variable genes. -- adata = adata[:, adata.var.highly_variable_features] +# Line 20: Scales the data matrix in adata.X. -- ov.pp.scale(adata) +# Line 22: Performs PCA dimensionality reduction using the scaled layer with 50 components. -- ov.pp.pca(adata,layer='scaled',n_pcs=50) +# Line 24: Finds the maximum value in the adata.X matrix after scaling and PCA. -- adata.X.max() +# Line 26: Creates a new adata object containing only cells from Alpha and Beta clusters. -- test_adata=adata[adata.obs['clusters'].isin(['Alpha','Beta'])] +# Line 27: Displays the test_adata object. -- test_adata +# Line 29: Performs differential expression analysis using pyDEG on the log-normalized data. -- dds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T) +# Line 31: Removes duplicate indices from the dds object. -- dds.drop_duplicates_index() +# Line 32: Prints a success message after removing duplicate indices. -- print('... drop_duplicates_index success') +# Line 34: Creates a list of cell indices for the Alpha treatment group. -- treatment_groups=test_adata.obs[test_adata.obs['clusters']=='Alpha'].index.tolist() +# Line 35: Creates a list of cell indices for the Beta control group. -- control_groups=test_adata.obs[test_adata.obs['clusters']=='Beta'].index.tolist() +# Line 36: Performs differential expression analysis between treatment and control groups using a t-test. -- result=dds.deg_analysis(treatment_groups,control_groups,method='ttest') +# Line 39: Sorts the results by q-value and displays the top entries. -- result.sort_values('qvalue').head() +# Line 41: Sets the fold change, p-value, and log p-value thresholds for the dds object. -- dds.foldchange_set(fc_threshold=-1, +# Line 42: pval_threshold=0.05, +# Line 43: logp_max=10) +# Line 45: Generates a volcano plot for differential expression analysis results. 
-- dds.plot_volcano(title='DEG Analysis',figsize=(4,4), +# Line 46: plot_genes_num=8,plot_genes_fontsize=12,) +# Line 48: Generates a box plot of the expression of Irx1 and Adra2a genes for treatment and control groups. -- dds.plot_boxplot(genes=['Irx1','Adra2a'],treatment_groups=treatment_groups, +# Line 49: control_groups=control_groups,figsize=(2,3),fontsize=12, +# Line 50: legend_bbox=(2,0.55)) +# Line 52: Generates an embedding plot with cells colored by cluster, Irx1, and Adra2a expression. -- ov.utils.embedding(adata, +# Line 53: basis='X_umap', +# Line 54: frameon='small', +# Line 55: color=['clusters','Irx1','Adra2a']) +# Line 57: Creates a MetaCell object for single-cell analysis. -- meta_obj=ov.single.MetaCell(adata,use_rep='scaled|original|X_pca',n_metacells=150, +# Line 58: use_gpu=True) +# Line 60: Initializes the archetypes for the MetaCell object. -- meta_obj.initialize_archetypes() +# Line 62: Trains the MetaCell object. -- meta_obj.train(min_iter=10, max_iter=50) +# Line 64: Saves the trained MetaCell model to a file. -- meta_obj.save('seacells/model.pkl') +# Line 66: Loads the trained MetaCell model from a file. -- meta_obj.load('seacells/model.pkl') +# Line 68: Generates predicted cell type labels using the soft method. -- ad=meta_obj.predicted(method='soft',celltype_label='clusters', +# Line 69: summarize_layer='lognorm') +# Line 71: Prints the minimum and maximum values in the predicted cell type matrix. -- ad.X.min(),ad.X.max() +# Line 73: Imports the matplotlib.pyplot module, aliased as plt. -- import matplotlib.pyplot as plt +# Line 74: Creates a figure and an axes object for plotting. -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 75: Generates an embedding plot of meta-cells colored by cluster, with specified customizations. 
-- ov.utils.embedding( +# Line 76: meta_obj.adata, +# Line 77: basis="X_umap", +# Line 78: color=['clusters'], +# Line 79: frameon='small', +# Line 80: title="Meta cells", +# Line 81: #legend_loc='on data', +# Line 82: legend_fontsize=14, +# Line 83: legend_fontoutline=2, +# Line 84: size=10, +# Line 85: ax=ax, +# Line 86: alpha=0.2, +# Line 87: #legend_loc='', +# Line 88: add_outline=False, +# Line 89: #add_outline=True, +# Line 90: outline_color='black', +# Line 91: outline_width=1, +# Line 92: show=False, +# Line 93: #palette=ov.utils.blue_color[:], +# Line 94: #legend_fontweight='normal' +# Line 95: Plots the meta-cells with a red color. -- ov.single._metacell.plot_metacells(ax,meta_obj.adata,color='#CB3E35', +# Line 96: ) +# Line 98: Creates a new adata object containing only meta-cells with Alpha and Beta cell type labels. -- test_adata=ad[ad.obs['celltype'].isin(['Alpha','Beta'])] +# Line 99: Displays the test_adata object. -- test_adata +# Line 101: Performs differential expression analysis using pyDEG on the meta-cell data. -- dds_meta=ov.bulk.pyDEG(test_adata.to_df().T) +# Line 103: Removes duplicate indices from the dds_meta object. -- dds_meta.drop_duplicates_index() +# Line 104: Prints a success message after removing duplicate indices. -- print('... drop_duplicates_index success') +# Line 106: Creates a list of meta-cell indices for the Alpha treatment group. -- treatment_groups=test_adata.obs[test_adata.obs['celltype']=='Alpha'].index.tolist() +# Line 107: Creates a list of meta-cell indices for the Beta control group. -- control_groups=test_adata.obs[test_adata.obs['celltype']=='Beta'].index.tolist() +# Line 108: Performs differential expression analysis on meta-cells between treatment and control groups using a t-test. -- result=dds_meta.deg_analysis(treatment_groups,control_groups,method='ttest') +# Line 110: Sorts the meta-cell DEG results by q-value and displays the top entries. 
-- result.sort_values('qvalue').head() +# Line 112: Sets the fold change, p-value, and log p-value thresholds for the dds_meta object. -- dds_meta.foldchange_set(fc_threshold=-1, +# Line 113: pval_threshold=0.05, +# Line 114: logp_max=10) +# Line 116: Generates a volcano plot for meta-cell differential expression analysis results. -- dds_meta.plot_volcano(title='DEG Analysis',figsize=(4,4), +# Line 117: plot_genes_num=8,plot_genes_fontsize=12,) +# Line 119: Generates a box plot of the expression of Ctxn2 and Mnx1 genes for treatment and control groups. -- dds_meta.plot_boxplot(genes=['Ctxn2','Mnx1'],treatment_groups=treatment_groups, +# Line 120: control_groups=control_groups,figsize=(2,3),fontsize=12, +# Line 121: legend_bbox=(2,0.55)) +# Line 123: Generates an embedding plot with cells colored by cluster, Ctxn2, and Mnx1 expression. -- ov.utils.embedding(adata, +# Line 124: basis='X_umap', +# Line 125: frameon='small', +# Line 126: color=['clusters','Ctxn2','Mnx1']) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_scdrug_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_scdrug_annotated.py new file mode 100644 index 00000000..02589247 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_scdrug_annotated.py @@ -0,0 +1,79 @@ +``` +# Line 1: import omicverse as ov -- import omicverse as ov +# Line 2: import scanpy as sc -- import scanpy as sc +# Line 3: import infercnvpy as cnv -- import infercnvpy as cnv +# Line 4: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 5: import os -- import os +# Line 7: sc.settings.verbosity = 3 -- sc.settings.verbosity = 3 +# Line 8: sc.settings.set_figure_params(dpi=80, facecolor='white') -- sc.settings.set_figure_params(dpi=80, facecolor='white') +# Line 11: adata = cnv.datasets.maynard2020_3k() -- adata = cnv.datasets.maynard2020_3k() +# Line 13: ov.utils.get_gene_annotation( -- ov.utils.get_gene_annotation( +# Line 14: adata, 
gtf="gencode.v43.basic.annotation.gtf.gz", -- adata, gtf="gencode.v43.basic.annotation.gtf.gz", +# Line 15: gtf_by="gene_name" -- gtf_by="gene_name" +# Line 16: ) -- ) +# Line 19: adata=adata[:,~adata.var['chrom'].isnull()] -- adata=adata[:,~adata.var['chrom'].isnull()] +# Line 20: adata.var['chromosome']=adata.var['chrom'] -- adata.var['chromosome']=adata.var['chrom'] +# Line 21: adata.var['start']=adata.var['chromStart'] -- adata.var['start']=adata.var['chromStart'] +# Line 22: adata.var['end']=adata.var['chromEnd'] -- adata.var['end']=adata.var['chromEnd'] +# Line 23: adata.var['ensg']=adata.var['gene_id'] -- adata.var['ensg']=adata.var['gene_id'] +# Line 24: adata.var.loc[:, ["ensg", "chromosome", "start", "end"]].head() -- adata.var.loc[:, ["ensg", "chromosome", "start", "end"]].head() +# Line 26: adata -- adata +# Line 29: cnv.tl.infercnv( -- cnv.tl.infercnv( +# Line 30: adata, -- adata, +# Line 31: reference_key="cell_type", -- reference_key="cell_type", +# Line 32: reference_cat=[ -- reference_cat=[ +# Line 33: "B cell", -- "B cell", +# Line 34: "Macrophage", -- "Macrophage", +# Line 35: "Mast cell", -- "Mast cell", +# Line 36: "Monocyte", -- "Monocyte", +# Line 37: "NK cell", -- "NK cell", +# Line 38: "Plasma cell", -- "Plasma cell", +# Line 39: "T cell CD4", -- "T cell CD4", +# Line 40: "T cell CD8", -- "T cell CD8", +# Line 41: "T cell regulatory", -- "T cell regulatory", +# Line 42: "mDC", -- "mDC", +# Line 43: "pDC", -- "pDC", +# Line 44: ], -- ], +# Line 45: window_size=250, -- window_size=250, +# Line 46: ) -- ) +# Line 47: cnv.tl.pca(adata) -- cnv.tl.pca(adata) +# Line 48: cnv.pp.neighbors(adata) -- cnv.pp.neighbors(adata) +# Line 49: cnv.tl.leiden(adata) -- cnv.tl.leiden(adata) +# Line 50: cnv.tl.umap(adata) -- cnv.tl.umap(adata) +# Line 51: cnv.tl.cnv_score(adata) -- cnv.tl.cnv_score(adata) +# Line 53: sc.pl.umap(adata, color="cnv_score", show=False) -- sc.pl.umap(adata, color="cnv_score", show=False) +# Line 55: adata.obs["cnv_status"] = "normal" 
-- adata.obs["cnv_status"] = "normal" +# Line 56: adata.obs.loc[ -- adata.obs.loc[ +# Line 57: adata.obs["cnv_score"]>0.03, "cnv_status" -- adata.obs["cnv_score"]>0.03, "cnv_status" +# Line 58: ] = "tumor" -- ] = "tumor" +# Line 60: sc.pl.umap(adata, color="cnv_status", show=False) -- sc.pl.umap(adata, color="cnv_status", show=False) +# Line 62: tumor=adata[adata.obs['cnv_status']=='tumor'] -- tumor=adata[adata.obs['cnv_status']=='tumor'] +# Line 63: tumor.X.max() -- tumor.X.max() +# Line 65: adata=tumor -- adata=tumor +# Line 66: print('Preprocessing...') -- print('Preprocessing...') +# Line 67: sc.pp.filter_cells(adata, min_genes=200) -- sc.pp.filter_cells(adata, min_genes=200) +# Line 68: sc.pp.filter_genes(adata, min_cells=3) -- sc.pp.filter_genes(adata, min_cells=3) +# Line 69: adata.var['mt'] = adata.var_names.str.startswith('MT-') -- adata.var['mt'] = adata.var_names.str.startswith('MT-') +# Line 70: sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) -- sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) +# Line 71: if not (adata.obs.pct_counts_mt == 0).all(): -- if not (adata.obs.pct_counts_mt == 0).all(): +# Line 72: adata = adata[adata.obs.pct_counts_mt < 30, :] -- adata = adata[adata.obs.pct_counts_mt < 30, :] +# Line 74: adata.raw = adata.copy() -- adata.raw = adata.copy() +# Line 76: sc.pp.highly_variable_genes(adata) -- sc.pp.highly_variable_genes(adata) +# Line 77: adata = adata[:, adata.var.highly_variable] -- adata = adata[:, adata.var.highly_variable] +# Line 78: sc.pp.scale(adata) -- sc.pp.scale(adata) +# Line 79: sc.tl.pca(adata, svd_solver='arpack') -- sc.tl.pca(adata, svd_solver='arpack') +# Line 81: sc.pp.neighbors(adata, n_pcs=20) -- sc.pp.neighbors(adata, n_pcs=20) +# Line 82: sc.tl.umap(adata) -- sc.tl.umap(adata) +# Line 84: ov.utils.download_GDSC_data() -- ov.utils.download_GDSC_data() +# Line 85: ov.utils.download_CaDRReS_model() -- 
ov.utils.download_CaDRReS_model() +# Line 87: adata, res,plot_df = ov.single.autoResolution(adata,cpus=4) -- adata, res,plot_df = ov.single.autoResolution(adata,cpus=4) +# Line 89: results_file = os.path.join('./', 'scanpyobj.h5ad') -- results_file = os.path.join('./', 'scanpyobj.h5ad') +# Line 90: adata.write(results_file) -- adata.write(results_file) +# Line 92: results_file = os.path.join('./', 'scanpyobj.h5ad') -- results_file = os.path.join('./', 'scanpyobj.h5ad') +# Line 93: adata=sc.read(results_file) -- adata=sc.read(results_file) +# Line 96: !git clone https://github.com/CSB5/CaDRReS-Sc -- !git clone https://github.com/CSB5/CaDRReS-Sc +# Line 98: import ov -- import ov +# Line 99: job=ov.single.Drug_Response(adata,scriptpath='CaDRReS-Sc', -- job=ov.single.Drug_Response(adata,scriptpath='CaDRReS-Sc', +# Line 100: modelpath='models/', -- modelpath='models/', +# Line 101: output='result') -- output='result') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py new file mode 100644 index 00000000..c1cdd303 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py @@ -0,0 +1,39 @@ +```python +# Line 1: import os -- import os +# Line 3: import scanpy as sc -- import scanpy as sc +# Line 4: import omicverse as ov -- import omicverse as ov +# Line 5: ov.plot_set() -- ov.plot_set() +# Line 8: adata = sc.read('./data/liver_test.h5ad') -- adata = sc.read('./data/liver_test.h5ad') +# Line 10: adata -- adata +# Line 13: from scipy.sparse import csc_matrix -- from scipy.sparse import csc_matrix +# Line 14: adata.X = csc_matrix(adata.X) -- adata.X = csc_matrix(adata.X) +# Line 16: adata_GS_uniformed = ov.externel.scMulan.GeneSymbolUniform(input_adata=adata, -- adata_GS_uniformed = ov.externel.scMulan.GeneSymbolUniform(input_adata=adata, +# Line 17: output_dir="./data", -- output_dir="./data", +# Line 18: output_prefix='liver') -- 
output_prefix='liver') +# Line 22: adata_GS_uniformed=sc.read_h5ad('./data/liver_uniformed.h5ad') -- adata_GS_uniformed=sc.read_h5ad('./data/liver_uniformed.h5ad') +# Line 24: adata_GS_uniformed -- adata_GS_uniformed +# Line 28: if adata_GS_uniformed.X.max() > 10: -- if adata_GS_uniformed.X.max() > 10: +# Line 29: sc.pp.normalize_total(adata_GS_uniformed, target_sum=1e4) -- sc.pp.normalize_total(adata_GS_uniformed, target_sum=1e4) +# Line 30: sc.pp.log1p(adata_GS_uniformed) -- sc.pp.log1p(adata_GS_uniformed) +# Line 35: ckp_path = './ckpt/ckpt_scMulan.pt' -- ckp_path = './ckpt/ckpt_scMulan.pt' +# Line 37: scml = ov.externel.scMulan.model_inference(ckp_path, adata_GS_uniformed) -- scml = ov.externel.scMulan.model_inference(ckp_path, adata_GS_uniformed) +# Line 38: base_process = scml.cuda_count() -- base_process = scml.cuda_count() +# Line 40: scml.get_cell_types_and_embds_for_adata(parallel=True, n_process = 1) -- scml.get_cell_types_and_embds_for_adata(parallel=True, n_process = 1) +# Line 43: adata_mulan = scml.adata.copy() -- adata_mulan = scml.adata.copy() +# Line 46: ov.pp.scale(adata_mulan) -- ov.pp.scale(adata_mulan) +# Line 47: ov.pp.pca(adata_mulan) -- ov.pp.pca(adata_mulan) +# Line 50: ov.pp.mde(adata_mulan,embedding_dim=2,n_neighbors=15, basis='X_mde', -- ov.pp.mde(adata_mulan,embedding_dim=2,n_neighbors=15, basis='X_mde', +# Line 51: n_pcs=10, use_rep='scaled|original|X_pca',) -- n_pcs=10, use_rep='scaled|original|X_pca',) +# Line 54: ov.pl.embedding(adata_mulan,basis='X_mde', -- ov.pl.embedding(adata_mulan,basis='X_mde', +# Line 55: color=["cell_type_from_scMulan",], -- color=["cell_type_from_scMulan",], +# Line 56: ncols=1,frameon='small') -- ncols=1,frameon='small') +# Line 58: adata_mulan.obsm['X_umap']=adata_mulan.obsm['X_mde'] -- adata_mulan.obsm['X_umap']=adata_mulan.obsm['X_mde'] +# Line 61: ov.externel.scMulan.cell_type_smoothing(adata_mulan, threshold=0.1) -- ov.externel.scMulan.cell_type_smoothing(adata_mulan, threshold=0.1) +# Line 65: 
ov.pl.embedding(adata_mulan,basis='X_mde', -- ov.pl.embedding(adata_mulan,basis='X_mde', +# Line 66: color=["cell_type_from_mulan_smoothing","cell_type"], -- color=["cell_type_from_mulan_smoothing","cell_type"], +# Line 67: ncols=1,frameon='small') -- ncols=1,frameon='small') +# Line 69: adata_mulan -- adata_mulan +# Line 71: top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20] -- top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20] +# Line 74: selected_cell_types = top_celltypes -- selected_cell_types = top_celltypes +# Line 75: ov.externel.scMulan.visualize_selected_cell_types(adata_mulan,selected_cell_types,smoothing=True) -- ov.externel.scMulan.visualize_selected_cell_types(adata_mulan,selected_cell_types,smoothing=True) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py new file mode 100644 index 00000000..550644d1 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py @@ -0,0 +1,21 @@ +```python +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the mde function from the omicverse.utils module. -- from omicverse.utils import mde +# Line 3: Set the working directory to 'result_human_pancreas'. -- workdir = 'result_human_pancreas' +# Line 4: Set plotting parameters using the ov_plot_set function from omicverse.utils. -- ov.utils.ov_plot_set() +# Line 5: Read an AnnData object from 'simba_adata_raw.h5ad' using ov.utils.read. -- adata=ov.utils.read('simba_adata_raw.h5ad') +# Line 7: Initialize a pySIMBA object with the AnnData object and the work directory. -- simba_object=ov.single.pySIMBA(adata,workdir) +# Line 9: Preprocess the SIMBA object with specified batch key, minimum cell count, method, number of top genes and number of bins. 
-- simba_object.preprocess(batch_key='batch',min_n_cells=3, +# Line 10: method='lib_size',n_top_genes=3000,n_bins=5) +# Line 12: Generate the graph for the SIMBA object. -- simba_object.gen_graph() +# Line 14: Train the SIMBA object with 6 worker processes. -- simba_object.train(num_workers=6) +# Line 16: Load the saved graph from the specified path. -- simba_object.load('result_human_pancreas/pbg/graph0') +# Line 18: Apply batch correction to the AnnData object using the SIMBA object. -- adata=simba_object.batch_correction() +# Line 19: Display the corrected AnnData object. -- adata +# Line 21: Compute the MDE embedding and store it in adata.obsm. -- adata.obsm["X_mde"] = mde(adata.obsm["X_simba"]) +# Line 23: Generate an embedding plot using X_mde as basis and color by cell_type1 and batch. -- sc.pl.embedding(adata,basis='X_mde',color=['cell_type1','batch']) +# Line 25: Import the scanpy library as sc. -- import scanpy as sc +# Line 26: Compute the neighbor graph using the X_simba representation. -- sc.pp.neighbors(adata, use_rep="X_simba") +# Line 27: Compute the UMAP embedding. -- sc.tl.umap(adata) +# Line 28: Plot the UMAP embedding colored by cell_type1 and batch. -- sc.pl.umap(adata,color=['cell_type1','batch']) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py new file mode 100644 index 00000000..0b486d85 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py @@ -0,0 +1,56 @@ +```python +# Line 1: Import the scanpy library for single-cell analysis. -- import scanpy as sc +# Line 2: Import the pandas library for data manipulation. -- import pandas as pd +# Line 3: Import the numpy library for numerical operations. -- import numpy as np +# Line 4: Import the omicverse library for spatial omics analysis. -- import omicverse as ov +# Line 5: Import the matplotlib.pyplot library for plotting. 
-- import matplotlib.pyplot as plt +# Line 7: Set the plotting style using omicverse utilities. -- ov.utils.ov_plot_set() +# Line 9: Import the anndata library for handling annotated data objects. -- import anndata +# Line 10: Read single-cell gene expression data from a CSV file into a pandas DataFrame. -- raw_data=pd.read_csv('data/pdac/sc_data.csv', index_col=0) +# Line 11: Create an AnnData object from the transposed single-cell gene expression DataFrame. -- single_data=anndata.AnnData(raw_data.T) +# Line 12: Read single-cell metadata and assign the 'Cell_type' column to the AnnData object's obs attribute. -- single_data.obs = pd.read_csv('data/pdac/sc_meta.csv', index_col=0)[['Cell_type']] +# Line 13: Display the single_data AnnData object. -- single_data +# Line 15: Read spatial transcriptomics gene expression data from a CSV file into a pandas DataFrame. -- raw_data=pd.read_csv('data/pdac/st_data.csv', index_col=0) +# Line 16: Create an AnnData object from the transposed spatial transcriptomics gene expression DataFrame. -- spatial_data=anndata.AnnData(raw_data.T) +# Line 17: Read spatial transcriptomics metadata and assign it to the AnnData object's obs attribute. -- spatial_data.obs = pd.read_csv('data/pdac/st_meta.csv', index_col=0) +# Line 18: Display the spatial_data AnnData object. -- spatial_data +# Line 20: Initialize the Single2Spatial model from omicverse to integrate single-cell and spatial data. -- st_model=ov.bulk2single.Single2Spatial(single_data=single_data, +# Line 21: Specify the spatial data for the Single2Spatial model. -- spatial_data=spatial_data, +# Line 22: Specify the cell type annotation key. -- celltype_key='Cell_type', +# Line 23: Specify the spot coordinate keys. -- spot_key=['xcoord','ycoord'], +# Line 27: Train the Single2Spatial model and create an AnnData object containing spatial predictions. -- sp_adata=st_model.train(spot_num=500, +# Line 28: Specify the cell number parameter for training. 
-- cell_num=10, +# Line 29: Specify the directory for saving the model. -- df_save_dir='data/pdac/predata_net/save_model', +# Line 30: Specify the file name for saving the model. -- df_save_name='pdac_df', +# Line 31: Specify training parameters k, num_epochs, batch_size, and predicted_size. -- k=10,num_epochs=1000,batch_size=1000,predicted_size=32) +# Line 34: Load a pre-trained Single2Spatial model from a saved file. -- sp_adata=st_model.load(modelsize=14478,df_load_dir='data/pdac/predata_net/save_model/pdac_df.pth', +# Line 35: Specify loading parameters k and predicted_size for the model. -- k=10,predicted_size=32) +# Line 37: Perform spatial spot assessment using the trained model. -- sp_adata_spot=st_model.spot_assess() +# Line 39: Create a spatial embedding plot for gene expression using scanpy's embedding function. -- sc.pl.embedding( +# Line 40: Pass sp_adata as the AnnData object to plot. -- sp_adata, +# Line 41: Specify the spatial embedding basis. -- basis="X_spatial", +# Line 42: Specify the genes to color the embedding plot by. -- color=['REG1A', 'CLDN1', 'KRT16', 'MUC5B'], +# Line 43: Turn off the plot frame. -- frameon=False, +# Line 44: Set the number of columns for the plot to 4. -- ncols=4, +# Line 49: Create a spatial embedding plot for spatial spots using scanpy's embedding function. -- sc.pl.embedding( +# Line 50: Pass sp_adata_spot as the AnnData object to plot. -- sp_adata_spot, +# Line 51: Specify the spatial embedding basis. -- basis="X_spatial", +# Line 52: Specify the genes to color the embedding plot by. -- color=['REG1A', 'CLDN1', 'KRT16', 'MUC5B'], +# Line 53: Turn off the plot frame. -- frameon=False, +# Line 54: Set the number of columns for the plot to 4. -- ncols=4, +# Line 59: Create a spatial embedding plot for cell types in spatial spots using scanpy's embedding function. -- sc.pl.embedding( +# Line 60: Pass sp_adata_spot as the AnnData object to plot. -- sp_adata_spot, +# Line 61: Specify the spatial embedding basis.
-- basis="X_spatial", +# Line 62: Specify the cell types to color the embedding plot by. -- color=['Acinar cells','Cancer clone A','Cancer clone B','Ductal'], +# Line 63: Turn off the plot frame. -- frameon=False, +# Line 64: Set the number of columns for the plot to 4. -- ncols=4, +# Line 65: Turn off displaying plot. -- show=False, +# Line 70: Create a spatial embedding plot for cell types using scanpy's embedding function. -- sc.pl.embedding( +# Line 71: Pass sp_adata as the AnnData object to plot. -- sp_adata, +# Line 72: Specify the spatial embedding basis. -- basis="X_spatial", +# Line 73: Specify the cell type annotation to color the embedding plot by. -- color=['Cell_type'], +# Line 74: Turn off the plot frame. -- frameon=False, +# Line 75: Set the number of columns for the plot to 4. -- ncols=4, +# Line 76: Turn off displaying plot. -- show=False, +# Line 77: Use a specific color palette from omicverse for the plot. -- palette=ov.utils.ov_palette()[11:] +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_single_batch_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_single_batch_annotated.py new file mode 100644 index 00000000..8c5a0ca1 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_single_batch_annotated.py @@ -0,0 +1,60 @@ +``` +# Line 1: import omicverse as ov -- Imports the omicverse library as ov. +# Line 3: import scanpy as sc -- Imports the scanpy library as sc. +# Line 5: ov.utils.ov_plot_set() -- Sets the plotting style for omicverse. +# Line 7: adata1=ov.read('neurips2021_s1d3.h5ad') -- Reads the first AnnData object from the specified H5AD file. +# Line 8: adata1.obs['batch']='s1d3' -- Adds a 'batch' column to the observations of the first AnnData object with value 's1d3'. +# Line 9: adata2=ov.read('neurips2021_s2d1.h5ad') -- Reads the second AnnData object from the specified H5AD file.
+# Line 10: adata2.obs['batch']='s2d1' -- Adds a 'batch' column to the observations of the second AnnData object with value 's2d1'. +# Line 11: adata3=ov.read('neurips2021_s3d7.h5ad') -- Reads the third AnnData object from the specified H5AD file. +# Line 12: adata3.obs['batch']='s3d7' -- Adds a 'batch' column to the observations of the third AnnData object with value 's3d7'. +# Line 14: adata=sc.concat([adata1,adata2,adata3],merge='same') -- Concatenates the three AnnData objects into a single AnnData object. +# Line 15: adata -- Displays the concatenated AnnData object. +# Line 17: adata.obs['batch'].unique() -- Displays the unique values of the 'batch' column in the observations of the concatenated AnnData object. +# Line 19: import numpy as np -- Imports the NumPy library as np. +# Line 20: adata.X=adata.X.astype(np.int64) -- Casts the data matrix of the AnnData object to a 64-bit integer type. +# Line 22: adata=ov.pp.qc(adata, tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}, batch_key='batch') -- Performs quality control filtering on the AnnData object based on specified thresholds and batch key. +# Line 24: adata -- Displays the quality controlled AnnData object. +# Line 26: adata=ov.pp.preprocess(adata,mode='shiftlog|pearson', n_HVGs=3000,batch_key=None) -- Preprocesses the AnnData object, including shift-log transformation and highly variable gene selection. +# Line 28: adata -- Displays the preprocessed AnnData object. +# Line 30: adata.raw = adata -- Stores a copy of the current AnnData object in the .raw attribute. +# Line 31: adata = adata[:, adata.var.highly_variable_features] -- Subsets the AnnData object to only include highly variable genes. +# Line 32: adata -- Displays the AnnData object after subsetting. +# Line 34: adata.write_h5ad('neurips2021_batch_normlog.h5ad',compression='gzip') -- Writes the AnnData object to a compressed H5AD file. +# Line 36: ov.pp.scale(adata) -- Scales the data matrix of the AnnData object. 
+# Line 37: ov.pp.pca(adata,layer='scaled',n_pcs=50,mask_var='highly_variable_features') -- Performs PCA on the scaled data using the highly variable features mask. +# Line 39: adata.obsm["X_mde_pca"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"]) -- Computes an MDE embedding from the PCA results and stores it in the .obsm. +# Line 41: ov.utils.embedding(adata, basis='X_mde_pca',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed PCA coordinates, color coded by batch and cell type. +# Line 43: adata_harmony=ov.single.batch_correction(adata,batch_key='batch', methods='harmony',n_pcs=50) -- Performs batch correction using Harmony. +# Line 44: adata -- Displays the AnnData object after Harmony batch correction. +# Line 46: adata.obsm["X_mde_harmony"] = ov.utils.mde(adata.obsm["X_harmony"]) -- Computes an MDE embedding from the Harmony corrected data and stores it in .obsm. +# Line 48: ov.utils.embedding(adata, basis='X_mde_harmony',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed Harmony coordinates, color coded by batch and cell type. +# Line 50: adata_combat=ov.single.batch_correction(adata,batch_key='batch', methods='combat',n_pcs=50) -- Performs batch correction using ComBat. +# Line 51: adata -- Displays the AnnData object after ComBat batch correction. +# Line 53: adata.obsm["X_mde_combat"] = ov.utils.mde(adata.obsm["X_combat"]) -- Computes an MDE embedding from the ComBat corrected data and stores it in .obsm. +# Line 55: ov.utils.embedding(adata, basis='X_mde_combat',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed ComBat coordinates, color coded by batch and cell type. +# Line 57: adata_scanorama=ov.single.batch_correction(adata,batch_key='batch', methods='scanorama',n_pcs=50) -- Performs batch correction using Scanorama. 
+# Line 58: adata -- Displays the AnnData object after Scanorama batch correction. +# Line 60: adata.obsm["X_mde_scanorama"] = ov.utils.mde(adata.obsm["X_scanorama"]) -- Computes an MDE embedding from the Scanorama corrected data and stores it in .obsm. +# Line 62: ov.utils.embedding(adata, basis='X_mde_scanorama',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed Scanorama coordinates, color coded by batch and cell type. +# Line 64: adata_scvi=ov.single.batch_correction(adata,batch_key='batch', methods='scVI',n_layers=2, n_latent=30, gene_likelihood="nb") -- Performs batch correction using scVI. +# Line 65: adata -- Displays the AnnData object after scVI batch correction. +# Line 67: adata.obsm["X_mde_scVI"] = ov.utils.mde(adata.obsm["X_scVI"]) -- Computes an MDE embedding from the scVI corrected data and stores it in .obsm. +# Line 69: ov.utils.embedding(adata, basis='X_mde_scVI',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed scVI coordinates, color coded by batch and cell type. +# Line 71: LDA_obj=ov.utils.LDA_topic(adata,feature_type='expression', highly_variable_key='highly_variable_features', layers='counts',batch_key='batch',learning_rate=1e-3) -- Initializes an LDA topic model using expression data and considering batch effects. +# Line 73: LDA_obj.plot_topic_contributions(6) -- Plots the contribution of the top 6 topics. +# Line 75: LDA_obj.predicted(15) -- Predicts the topic for each cell using the LDA model with 15 topics. +# Line 77: adata.obsm["X_mde_mira_topic"] = ov.utils.mde(adata.obsm["X_topic_compositions"]) -- Computes an MDE embedding of topic compositions and stores it in the .obsm. +# Line 78: adata.obsm["X_mde_mira_feature"] = ov.utils.mde(adata.obsm["X_umap_features"]) -- Computes an MDE embedding of UMAP features and stores it in the .obsm. 
+# Line 80: ov.utils.embedding(adata, basis='X_mde_mira_topic',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed topic compositions, color coded by batch and cell type. +# Line 83: ov.utils.embedding(adata, basis='X_mde_mira_feature',frameon='small', color=['batch','cell_type'],show=False) -- Generates an embedding plot using the MDE transformed UMAP features, color coded by batch and cell type. +# Line 85: adata.write_h5ad('neurips2021_batch_all.h5ad',compression='gzip') -- Writes the AnnData object containing all batch correction results to a compressed H5AD file. +# Line 87: adata=sc.read('neurips2021_batch_all.h5ad') -- Reads the AnnData object back from the specified H5AD file. +# Line 89: adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'].copy() -- Copies the PCA embedding from scaled data into a new '.obsm' key. +# Line 90: adata.obsm['X_mira_topic']=adata.obsm['X_topic_compositions'].copy() -- Copies the topic composition embedding to a new '.obsm' key. +# Line 91: adata.obsm['X_mira_feature']=adata.obsm['X_umap_features'].copy() -- Copies the UMAP feature embedding to a new '.obsm' key. +# Line 93: from scib_metrics.benchmark import Benchmarker -- Imports the Benchmarker class from the scib_metrics library. +# Line 94: bm = Benchmarker( adata, batch_key="batch", label_key="cell_type", embedding_obsm_keys=["X_pca", "X_combat", "X_harmony", 'X_scanorama','X_mira_topic','X_mira_feature','X_scVI'], n_jobs=8, ) -- Initializes a Benchmarker object for evaluating batch correction methods. +# Line 99: bm.benchmark() -- Runs the benchmark to evaluate the batch correction results. +# Line 101: bm.plot_results_table(min_max_scale=False) -- Plots a table summarizing the benchmark results. 
+``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_slat_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_slat_annotated.py new file mode 100644 index 00000000..5e618efc --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_slat_annotated.py @@ -0,0 +1,147 @@ +``` +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the os module. -- import os +# Line 3: Import the scanpy library as sc. -- import scanpy as sc +# Line 4: Import the numpy library as np. -- import numpy as np +# Line 5: Import the pandas library as pd. -- import pandas as pd +# Line 6: Import the torch library. -- import torch +# Line 7: Set the plot style using omicverse. -- ov.plot_set() +# Line 9: Import specific modules from the omicverse external scSLAT library. -- from omicverse.externel.scSLAT.model import load_anndatas, Cal_Spatial_Net, run_SLAT, scanpy_workflow, spatial_match +# Line 10: Import specific visualization modules from the omicverse external scSLAT library. -- from omicverse.externel.scSLAT.viz import match_3D_multi, hist, Sankey, match_3D_celltype, Sankey,Sankey_multi,build_3D +# Line 11: Import the region_statistics module from the omicverse external scSLAT library. -- from omicverse.externel.scSLAT.metrics import region_statistics +# Line 12: Read the first AnnData object from an H5AD file. -- adata1 = sc.read_h5ad('data/E115_Stereo.h5ad') +# Line 13: Read the second AnnData object from an H5AD file. -- adata2 = sc.read_h5ad('data/E125_Stereo.h5ad') +# Line 14: Add a 'week' observation to adata1, setting it to 'E11.5'. -- adata1.obs['week']='E11.5' +# Line 15: Add a 'week' observation to adata2, setting it to 'E12.5'. -- adata2.obs['week']='E12.5' +# Line 16: Generate a spatial plot for adata1 colored by 'annotation' with a spot size of 3. -- sc.pl.spatial(adata1, color='annotation', spot_size=3) +# Line 17: Generate a spatial plot for adata2 colored by 'annotation' with a spot size of 3. 
-- sc.pl.spatial(adata2, color='annotation', spot_size=3) +# Line 18: Calculate the spatial network for adata1 using KNN with a k_cutoff of 20. -- Cal_Spatial_Net(adata1, k_cutoff=20, model='KNN') +# Line 19: Calculate the spatial network for adata2 using KNN with a k_cutoff of 20. -- Cal_Spatial_Net(adata2, k_cutoff=20, model='KNN') +# Line 20: Load edges and features from a list of AnnData objects, using DPCA features and not checking order. -- edges, features = load_anndatas([adata1, adata2], feature='DPCA', check_order=False) +# Line 21: Run the SLAT algorithm to get embeddings and time information. -- embd0, embd1, time = run_SLAT(features, edges, LGCN_layer=5) +# Line 22: Perform spatial matching between the embeddings of the two timepoints, not reordering, returning best match, index and distances. -- best, index, distance = spatial_match([embd0, embd1], reorder=False, adatas=[adata1,adata2]) +# Line 23: Create a numpy array representing the matching pairs between the two timepoints. -- matching = np.array([range(index.shape[0]), best]) +# Line 24: Extract the best match distances from the distance matrix. -- best_match = distance[:,0] +# Line 25: Calculate and print region statistics based on the matching distances. -- region_statistics(best_match, start=0.5, number_of_interval=10) +# Line 27: Import the matplotlib.pyplot module as plt. -- import matplotlib.pyplot as plt +# Line 28: Create a list containing the matching information. -- matching_list=[matching] +# Line 29: Build a 3D model using the two AnnData objects and matching list. -- model = build_3D([adata1,adata2], matching_list,subsample_size=300, ) +# Line 30: Draw a 3D visualization of the model, hiding axes. 
-- ax=model.draw_3D(hide_axis=True, line_color='#c2c2c2', height=1, size=[6,6], line_width=1) +# Line 32: Add low quality index as obs to adata2 -- adata2.obs['low_quality_index']= best_match +# Line 33: Convert low quality index to float -- adata2.obs['low_quality_index'] = adata2.obs['low_quality_index'].astype(float) +# Line 35: Access spatial coordinates stored in adata2.obsm. -- adata2.obsm['spatial'] +# Line 36: Generate a spatial plot of adata2, coloring by the 'low_quality_index', spot size 3, with the title "Quality". -- sc.pl.spatial(adata2, color='low_quality_index', spot_size=3, title='Quality') +# Line 38: Create a Sankey plot for the two AnnData objects based on annotation and given matching. -- fig=Sankey_multi(adata_li=[adata1,adata2], +# Line 39: Set the prefixes for each object to E11.5, E12.5. -- prefix_li=['E11.5','E12.5'], +# Line 40: Set the matching list to the previously defined matching array. -- matching_li=[matching], +# Line 41: Set the clusters to annotation and filter number to 10. -- clusters='annotation',filter_num=10, +# Line 42: Set node opacity to 0.8 -- node_opacity = 0.8, +# Line 43: Set link opacity to 0.2 -- link_opacity = 0.2, +# Line 44: Set layout to specified size. -- layout=[800,500], +# Line 45: Set font size to 12 -- font_size=12, +# Line 46: Set font color to black. -- font_color='Black', +# Line 47: Set save name to none. -- save_name=None, +# Line 48: Set format to png. -- format='png', +# Line 49: Set width to 1200. -- width=1200, +# Line 50: Set height to 1000. -- height=1000, +# Line 51: Return the figure object. -- return_fig=True) +# Line 52: Display the created Sankey plot. -- fig.show() +# Line 54: Save the Sankey plot as an HTML file. -- fig.write_html("slat_sankey.html") +# Line 56: Create a color dictionary for adata1's annotations. -- color_dict1=dict(zip(adata1.obs['annotation'].cat.categories, +# Line 57: Map the colors to adata1's annotations. 
-- adata1.uns['annotation_colors'].tolist())) +# Line 58: Create a pandas DataFrame for adata1 containing spatial information and celltype information. -- adata1_df = pd.DataFrame({'index':range(embd0.shape[0]), +# Line 59: Get x spatial coordinates from adata1 -- 'x': adata1.obsm['spatial'][:,0], +# Line 60: Get y spatial coordinates from adata1 -- 'y': adata1.obsm['spatial'][:,1], +# Line 61: Get celltype information from adata1 -- 'celltype':adata1.obs['annotation'], +# Line 62: Get color based on the celltype for each cell in adata1. -- 'color':adata1.obs['annotation'].map(color_dict1) +# Line 63: End of the dataframe declaration for adata1 -- } +# Line 64: Create a color dictionary for adata2's annotations. -- color_dict2=dict(zip(adata2.obs['annotation'].cat.categories, +# Line 65: Map the colors to adata2's annotations. -- adata2.uns['annotation_colors'].tolist())) +# Line 66: Create a pandas DataFrame for adata2 containing spatial information and celltype information. -- adata2_df = pd.DataFrame({'index':range(embd1.shape[0]), +# Line 67: Get x spatial coordinates from adata2 -- 'x': adata2.obsm['spatial'][:,0], +# Line 68: Get y spatial coordinates from adata2 -- 'y': adata2.obsm['spatial'][:,1], +# Line 69: Get celltype information from adata2 -- 'celltype':adata2.obs['annotation'], +# Line 70: Get color based on the celltype for each cell in adata2. -- 'color':adata2.obs['annotation'].map(color_dict2) +# Line 71: End of the dataframe declaration for adata2 -- } +# Line 73: Create a 3D celltype-specific alignment visualization. -- kidney_align = match_3D_celltype(adata1_df, adata2_df, matching, meta='celltype', +# Line 74: Highlight specific cell types during the alignment visualization. -- highlight_celltype = [['Urogenital ridge'],['Kidney','Ovary']], +# Line 75: Set the subsample size for the alignment to 10000, the highlight line color to blue and to scale the coordinate. 
-- subsample_size=10000, highlight_line = ['blue'], scale_coordinate = True ) +# Line 76: Draw the 3D alignment visualization, specifying size, line width, point sizes, and hiding axes. -- kidney_align.draw_3D(size= [6, 6], line_width =0.8, point_size=[0.6,0.6], hide_axis=True) +# Line 78: Define a function to calculate matching cells based on a specific query cell. -- def cal_matching_cell(target_adata,query_adata,matching,query_cell,clusters='annotation',): +# Line 79: Create a DataFrame for target_adata containing spatial information and celltype information. -- adata1_df = pd.DataFrame({'index':range(target_adata.shape[0]), +# Line 80: Get x spatial coordinates from target_adata. -- 'x': target_adata.obsm['spatial'][:,0], +# Line 81: Get y spatial coordinates from target_adata. -- 'y': target_adata.obsm['spatial'][:,1], +# Line 82: Get celltype information from target_adata based on given cluster. -- 'celltype':target_adata.obs[clusters]}) +# Line 83: Create a DataFrame for query_adata containing spatial information and celltype information. -- adata2_df = pd.DataFrame({'index':range(query_adata.shape[0]), +# Line 84: Get x spatial coordinates from query_adata. -- 'x': query_adata.obsm['spatial'][:,0], +# Line 85: Get y spatial coordinates from query_adata. -- 'y': query_adata.obsm['spatial'][:,1], +# Line 86: Get celltype information from query_adata based on given cluster. -- 'celltype':query_adata.obs[clusters]}) +# Line 87: Create a new anndata based on matching of the celltype in query_cell in the query_adata based on the matching from the target adata -- query_adata = target_adata[matching[1,adata2_df.loc[adata2_df.celltype==query_cell,'index'].values],:] +# Line 88: Commented out code which would add the target cell type and index to the query adata dataframe. -- #adata2_df['target_celltype'] = adata1_df.iloc[matching[1,:],:]['celltype'].to_list() +# Line 89: Commented out code which would add the target cell type and index to the query adata dataframe. 
-- #adata2_df['target_obs_names'] = adata1_df.iloc[matching[1,:],:].index.to_list() +# Line 91: Returns the query adata containing the matched cells. -- return query_adata +# Line 94: Call cal_matching_cell to extract the target cells corresponding to 'Kidney' cells in the second adata. -- query_adata=cal_matching_cell(target_adata=adata1, +# Line 95: Pass adata2 as the query adata to the cal_matching_cell. -- query_adata=adata2, +# Line 96: Pass the matching array to the cal_matching_cell. -- matching=matching, +# Line 97: Specify the query cell as Kidney. -- query_cell='Kidney',clusters='annotation') +# Line 98: Returns the query_adata. -- query_adata +# Line 100: Initialize the column 'kidney_anno' in adata1 to empty strings. -- adata1.obs['kidney_anno']='' +# Line 101: Set the 'kidney_anno' of the matching cell in adata1 according to its annotation in the corresponding cell from query_adata. -- adata1.obs.loc[query_adata.obs.index,'kidney_anno']=query_adata.obs['annotation'] +# Line 103: Generate a spatial plot of adata1, coloring by 'kidney_anno', spot size 3, using a specified palette. -- sc.pl.spatial(adata1, color='kidney_anno', spot_size=3, +# Line 104: Specify the palette for the spatial plot. -- palette=['#F5F5F5','#ff7f0e', 'green',]) +# Line 106: Concatenate the query_adata with the kidney cell from adata2 and combine the anndata objects. -- kidney_lineage_ad=sc.concat([query_adata,adata2[adata2.obs['annotation']=='Kidney']],merge='same') +# Line 107: Preprocess the concatenated AnnData object using shiftlog|pearson method, HVGs=3000 and target_sum=1e4. -- kidney_lineage_ad=ov.pp.preprocess(kidney_lineage_ad,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4) +# Line 108: Store the original count data in raw object. -- kidney_lineage_ad.raw = kidney_lineage_ad +# Line 109: Select the highly variable features from the AnnData object. 
-- kidney_lineage_ad = kidney_lineage_ad[:, kidney_lineage_ad.var.highly_variable_features] +# Line 110: Scale the data in the AnnData object. -- ov.pp.scale(kidney_lineage_ad) +# Line 111: Perform PCA on the scaled data. -- ov.pp.pca(kidney_lineage_ad) +# Line 112: Calculate neighbors using scaled,original,X_pca and cosine distance. -- ov.pp.neighbors(kidney_lineage_ad,use_rep='scaled|original|X_pca',metric="cosine") +# Line 113: Perform Leiden clustering on the AnnData object. -- ov.utils.cluster(kidney_lineage_ad,method='leiden',resolution=1) +# Line 114: Calculate UMAP embeddings. -- ov.pp.umap(kidney_lineage_ad) +# Line 116: Generate a UMAP plot colored by 'annotation','week','leiden' with small frame. -- ov.pl.embedding(kidney_lineage_ad,basis='X_umap', +# Line 117: Specify the colors for the embedding plot. -- color=['annotation','week','leiden'], +# Line 118: Specify the frameon for the embedding plot. -- frameon='small') +# Line 120: Generate a dotplot for specified genes grouped by Leiden clusters. -- sc.pl.dotplot(kidney_lineage_ad,{'nephron progenitors':['Wnt9b','Osr1','Nphs1','Lhx1','Pax2','Pax8'], +# Line 121: Define a second gene group for the dotplot. -- 'metanephric':['Eya1','Shisa3','Foxc1'], +# Line 122: Define a third gene group for the dotplot. -- 'kidney':['Wt1','Wnt4','Nr2f2','Dach1','Cd44']} , +# Line 123: Specify the group to show the dotplot on and hide the dendrogram and specify colorbar title. -- 'leiden',dendrogram=False,colorbar_title='Expression') +# Line 125: Add a column called re_anno to the kidney lineage adata. -- kidney_lineage_ad.obs['re_anno'] = 'Unknown' +# Line 126: Sets the re_anno category for leiden cluster 4. -- kidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden.isin(['4']),'re_anno'] = 'Nephron progenitors (E11.5)' +# Line 127: Sets the re_anno category for leiden clusters 2,3,1,5. 
-- kidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden.isin(['2','3','1','5']),'re_anno'] = 'Metanephron progenitors (E11.5)' +# Line 128: Sets the re_anno category for leiden cluster 0. -- kidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden=='0','re_anno'] = 'Kidney (E12.5)' +# Line 130: Commented out line that was supposed to filter cells by leiden cluster 3 -- # kidney_all = kidney_all[kidney_all.obs.leiden!='3',:] +# Line 131: Convert leiden cluster column to list -- kidney_lineage_ad.obs.leiden = list(kidney_lineage_ad.obs.leiden) +# Line 132: Generate a UMAP plot colored by 'annotation', 're_anno' with small frame. -- ov.pl.embedding(kidney_lineage_ad,basis='X_umap', +# Line 133: Specify the colors for the embedding plot -- color=['annotation','re_anno'], +# Line 134: Specify the frameon for the embedding plot. -- frameon='small') +# Line 136: Initialize the column 'kidney_anno' in adata1 to empty strings. -- adata1.obs['kidney_anno']='' +# Line 137: Set the 'kidney_anno' of the cells in adata1 where week is E11.5 according to re_anno of the kidney lineage cells of E11.5. -- adata1.obs.loc[kidney_lineage_ad[kidney_lineage_ad.obs['week']=='E11.5'].obs.index,'kidney_anno']=kidney_lineage_ad[kidney_lineage_ad.obs['week']=='E11.5'].obs['re_anno'] +# Line 139: Import matplotlib.pyplot as plt. -- import matplotlib.pyplot as plt +# Line 140: Create a subplots object with a size of 8x8. -- fig, ax = plt.subplots(1, 1, figsize=(8, 8)) +# Line 141: Generate a spatial plot of adata1 colored by 'kidney_anno', size 1.5, using the palette specified and specify show=False. -- sc.pl.spatial(adata1, color='kidney_anno', spot_size=1.5, +# Line 142: Specify the palette to use for the spatial plot and do not show the spatial plot immediately. -- palette=['#F5F5F5','#ff7f0e', 'green',],show=False,ax=ax) +# Line 144: Assign kidney_lineage_ad to test_adata. -- test_adata=kidney_lineage_ad +# Line 145: Create a pyDEG object from the transposed lognorm data of test_adata. 
-- dds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T) +# Line 146: Remove duplicates from the index. -- dds.drop_duplicates_index() +# Line 147: Print the success of dropping duplicates from the index. -- print('... drop_duplicates_index success') +# Line 148: Get the list of index for the cells in test_adata where the week is E12.5 as the treatment group. -- treatment_groups=test_adata.obs[test_adata.obs['week']=='E12.5'].index.tolist() +# Line 149: Get the list of index for the cells in test_adata where the week is E11.5 as the control group. -- control_groups=test_adata.obs[test_adata.obs['week']=='E11.5'].index.tolist() +# Line 150: Run deg analysis with ttest method with treatment groups and control groups. -- result=dds.deg_analysis(treatment_groups,control_groups,method='ttest') +# Line 151: Code comment noting that a value of -1 means the threshold is calculated automatically. -- # -1 means automatically calculates +# Line 152: Start the foldchange_set call, setting the fold change threshold to -1. -- dds.foldchange_set(fc_threshold=-1, +# Line 153: Set the p-value threshold to 0.05. -- pval_threshold=0.05, +# Line 154: Set logp_max to 10. -- logp_max=10) +# Line 156: Generate a volcano plot for the DEG analysis with title "DEG Analysis". -- dds.plot_volcano(title='DEG Analysis',figsize=(4,4), +# Line 157: Set number of genes to plot to 8 and the font size to 12. -- plot_genes_num=8,plot_genes_fontsize=12,) +# Line 159: Gets the index of the top 3 up regulated genes from the DEG result. -- up_gene=dds.result.loc[dds.result['sig']=='up'].sort_values('qvalue')[:3].index.tolist() +# Line 160: Gets the index of the top 3 down regulated genes from the DEG result. -- down_gene=dds.result.loc[dds.result['sig']=='down'].sort_values('qvalue')[:3].index.tolist() +# Line 161: Combines the up regulated and down regulated gene lists. -- deg_gene=up_gene+down_gene +# Line 163: Generate a dotplot of specified genes grouped by 're_anno'.
-- sc.pl.dotplot(kidney_lineage_ad,deg_gene, +# Line 164: Specify the group for the dotplot. -- groupby='re_anno') +# Line 166: Compute the dendrogram based on re_anno on the specified scale data. -- sc.tl.dendrogram(kidney_lineage_ad,'re_anno',use_rep='scaled|original|X_pca') +# Line 167: Perform ranked gene group analysis using t-test based on re_anno and scaled data. -- sc.tl.rank_genes_groups(kidney_lineage_ad, 're_anno', use_rep='scaled|original|X_pca', +# Line 168: Specify method to use for gene ranking. -- method='t-test',use_raw=False,key_added='re_anno_ttest') +# Line 169: Generate a dotplot of the ranked gene groups with the group name set to re_anno. -- sc.pl.rank_genes_groups_dotplot(kidney_lineage_ad,groupby='re_anno', +# Line 170: Specify the cmap, key and standard scale for the dotplot and the number of genes to show for each group. -- cmap='RdBu_r',key='re_anno_ttest', +# Line 171: Set the standard scale and number of genes. -- standard_scale='var',n_genes=3) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py new file mode 100644 index 00000000..a7f7f0d8 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py @@ -0,0 +1,31 @@ +```python +# Line 1: Import the omicverse library and alias it as ov. -- import omicverse as ov +# Line 3: Import the scanpy library and alias it as sc. -- import scanpy as sc +# Line 5: Set plotting parameters for omicverse. -- ov.utils.ov_plot_set() +# Line 7: Read Visium spatial transcriptomics data into an AnnData object. -- adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5') +# Line 8: Make variable names in the AnnData object unique. -- adata.var_names_make_unique() +# Line 10: Calculate quality control metrics for the AnnData object. 
-- sc.pp.calculate_qc_metrics(adata, inplace=True) +# Line 11: Filter the AnnData object to keep genes with total counts greater than 100. -- adata = adata[:,adata.var['total_counts']>100] +# Line 12: Compute spatial variable genes using the spatial variance of a gene and add the results to adata.var. -- adata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform="visium",) +# Line 13: Store the original count data in adata.raw. -- adata.raw = adata +# Line 14: Filter adata to keep only spatially variable genes, as calculated by ov.space.svg. -- adata = adata[:, adata.var.space_variable_features] +# Line 15: Display the AnnData object (no-op). -- adata +# Line 18: Import the pandas library and alias it as pd. -- import pandas as pd +# Line 19: Import the os library. -- import os +# Line 20: Read ground truth annotation data into a pandas DataFrame, using the first column as index. -- Ann_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0) +# Line 21: Set the column name of the annotation DataFrame to 'Ground Truth'. -- Ann_df.columns = ['Ground Truth'] +# Line 22: Add ground truth annotations to the AnnData object's observation metadata using the data in the Ann_df dataframe. -- adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth'] +# Line 23: Generate a spatial plot of the AnnData object colored by the ground truth annotation. -- sc.pl.spatial(adata, img_key="hires", color=["Ground Truth"]) +# Line 25: Create a PySpaceFlow object with AnnData object as an input. -- sf_obj=ov.space.pySpaceFlow(adata) +# Line 27: Train the PySpaceFlow model with specified parameters. 
-- sf_obj.train(spatial_regularization_strength=0.1, +# Line 28: Continuation of training parameters -- z_dim=50, lr=1e-3, epochs=1000, +# Line 29: Continuation of training parameters -- max_patience=50, min_stop=100, +# Line 30: Continuation of training parameters -- random_seed=42, gpu=0, +# Line 31: Continuation of training parameters -- regularization_acceleration=True, edge_subset_sz=1000000) +# Line 33: Calculate pseudo-spatial mapping (pSM) using the trained PySpaceFlow model. -- sf_obj.cal_pSM(n_neighbors=20,resolution=1, +# Line 34: Continuation of pSM parameters -- max_cell_for_subsampling=5000,psm_key='pSM_spaceflow') +# Line 36: Generate a spatial plot colored by both 'pSM_spaceflow' and 'Ground Truth', using RdBu_r colormap. -- sc.pl.spatial(adata, color=['pSM_spaceflow','Ground Truth'],cmap='RdBu_r') +# Line 38: Cluster the AnnData object using a Gaussian Mixture Model on the spaceflow representation. -- ov.utils.cluster(adata,use_rep='spaceflow',method='GMM',n_components=7,covariance_type='full', +# Line 39: Continuation of GMM parameters -- tol=1e-9, max_iter=1000, random_state=3607) +# Line 41: Generate a spatial plot colored by GMM cluster assignment and the ground truth annotations. -- sc.pl.spatial(adata, color=['gmm_cluster',"Ground Truth"]) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py new file mode 100644 index 00000000..6b174081 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py @@ -0,0 +1,66 @@ +```python +# Line 1: Import the omicverse library, aliased as ov. -- import omicverse as ov +# Line 3: Import the scanpy library, aliased as sc. -- import scanpy as sc +# Line 5: Set the plotting parameters for omicverse. -- ov.plot_set() +# Line 7: Reads Visium spatial data into an AnnData object named adata, specifying the data path and count file. 
-- adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5') +# Line 8: Makes gene names unique in the AnnData object adata. -- adata.var_names_make_unique() +# Line 10: Calculate quality control metrics for the AnnData object and store them in place. -- sc.pp.calculate_qc_metrics(adata, inplace=True) +# Line 11: Filters the AnnData object, keeping only genes with a total count greater than 100. -- adata = adata[:,adata.var['total_counts']>100] +# Line 12: Performs spatial variable gene selection using the 'prost' mode with a specified number of genes, target sum, and platform. -- adata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform="visium",) +# Line 13: Displays the adata object. -- adata +# Line 15: Writes the AnnData object to an h5ad file with gzip compression. -- adata.write('data/cluster_svg.h5ad',compression='gzip') +# Line 19: Imports the pandas library, aliased as pd. -- import pandas as pd +# Line 20: Imports the os library. -- import os +# Line 21: Reads a tab-separated file into a Pandas DataFrame, setting the first column as the index. -- Ann_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0) +# Line 22: Assigns the column name 'Ground Truth' to the Pandas DataFrame. -- Ann_df.columns = ['Ground Truth'] +# Line 23: Adds the 'Ground Truth' annotations to the AnnData object's observation metadata. -- adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth'] +# Line 24: Plots spatial data with annotations colored by 'Ground Truth'. -- sc.pl.spatial(adata, img_key="hires", color=["Ground Truth"]) +# Line 27: Initializes a GraphST model. -- model = ov.externel.GraphST.GraphST(adata, device='cuda:0') +# Line 30: Trains the GraphST model and updates the AnnData object. -- adata = model.train(n_pcs=30) +# Line 32: Performs clustering using mclust with specified parameters. 
-- ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust',n_components=10, modelNames='EEV', random_state=112, ) +# Line 34: Refines the mclust labels and saves them to a new column. -- adata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust') +# Line 36: Computes the neighborhood graph for the data using PCA representation. -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20, use_rep='graphst|original|X_pca') +# Line 37: Performs clustering using the louvain algorithm. -- ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='louvain',resolution=0.7) +# Line 38: Performs clustering using the leiden algorithm. -- ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='leiden',resolution=0.7) +# Line 39: Refines louvain labels and saves them to a new column. -- adata.obs['louvain_GraphST'] = ov.utils.refine_label(adata, radius=50, key='louvain') +# Line 40: Refines leiden labels and saves them to a new column. -- adata.obs['leiden_GraphST'] = ov.utils.refine_label(adata, radius=50, key='leiden') +# Line 42: Generates spatial plots using the calculated cluster labels and the "Ground Truth". -- sc.pl.spatial(adata, color=['mclust_GraphST','leiden_GraphST', 'louvain_GraphST',"Ground Truth"]) +# Line 46: Assigns the first spatial coordinate from spatial obsm to 'X' in adata.obs. -- adata.obs['X'] = adata.obsm['spatial'][:,0] +# Line 47: Assigns the second spatial coordinate from spatial obsm to 'Y' in adata.obs. -- adata.obs['Y'] = adata.obsm['spatial'][:,1] +# Line 48: Accesses the first element in the 'X' column of adata.obs. -- adata.obs['X'][0] +# Line 50: Initializes a pySTAGATE model. -- STA_obj=ov.space.pySTAGATE(adata,num_batch_x=3,num_batch_y=2, spatial_key=['X','Y'],rad_cutoff=200,num_epoch = 1000,lr=0.001, weight_decay=1e-4,hidden_dims = [512, 30], device='cuda:0') +# Line 55: Trains the STAGATE model. -- STA_obj.train() +# Line 57: Predicts results from the trained STAGATE model. 
-- STA_obj.predicted() +# Line 58: Displays the adata object. -- adata +# Line 60: Performs clustering on the STAGATE representation. -- ov.utils.cluster(adata,use_rep='STAGATE',method='mclust',n_components=8, modelNames='EEV', random_state=112, ) +# Line 62: Refines the mclust labels and saves them to a new column. -- adata.obs['mclust_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='mclust') +# Line 64: Computes neighborhood graph using the STAGATE representation. -- sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20, use_rep='STAGATE') +# Line 65: Performs clustering using the louvain algorithm. -- ov.utils.cluster(adata,use_rep='STAGATE',method='louvain',resolution=0.5) +# Line 66: Performs clustering using the leiden algorithm. -- ov.utils.cluster(adata,use_rep='STAGATE',method='leiden',resolution=0.5) +# Line 67: Refines the louvain labels and saves them to a new column. -- adata.obs['louvain_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='louvain') +# Line 68: Refines the leiden labels and saves them to a new column. -- adata.obs['leiden_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='leiden') +# Line 70: Generates spatial plots of STAGATE clusterings along with Ground Truth. -- sc.pl.spatial(adata, color=['mclust_STAGATE','leiden_STAGATE', 'louvain_STAGATE',"Ground Truth"]) +# Line 72: Sorts and displays the top 10 genes based on their PI value. -- adata.var.sort_values('PI',ascending=False).head(10) +# Line 74: Sets the gene to plot to be 'MBP'. -- plot_gene = 'MBP' +# Line 75: Imports the matplotlib library. -- import matplotlib.pyplot as plt +# Line 76: Creates a figure and subplots for visualization. -- fig, axs = plt.subplots(1, 2, figsize=(8, 4)) +# Line 77: Creates a spatial plot of raw gene expression. -- sc.pl.spatial(adata, img_key="hires", color=plot_gene, show=False, ax=axs[0], title='RAW_'+plot_gene, vmax='p99') +# Line 78: Creates a spatial plot of STAGATE gene expression. 
-- sc.pl.spatial(adata, img_key="hires", color=plot_gene, show=False, ax=axs[1], title='STAGATE_'+plot_gene, layer='STAGATE_ReX', vmax='p99') +# Line 81: Calculates the pseudo-spatial map (pSM). -- STA_obj.cal_pSM(n_neighbors=20,resolution=1, max_cell_for_subsampling=5000) +# Line 82: Displays the adata object. -- adata +# Line 84: Generates a spatial plot visualizing 'Ground Truth' and the 'pSM_STAGATE'. -- sc.pl.spatial(adata, color=['Ground Truth','pSM_STAGATE'], cmap='RdBu_r') +# Line 86: Imports the adjusted_rand_score function from scikit-learn. -- from sklearn.metrics.cluster import adjusted_rand_score +# Line 88: Creates an observation dataframe, dropping all rows with NA values. -- obs_df = adata.obs.dropna() +# Line 90: Calculates and prints Adjusted Rand Index between mclust_GraphST labels and ground truth. -- ARI = adjusted_rand_score(obs_df['mclust_GraphST'], obs_df['Ground Truth']) +# Line 91: Prints the adjusted rand index of mclust_GraphST vs ground truth. -- print('mclust_GraphST: Adjusted rand index = %.2f' %ARI) +# Line 93: Calculates and prints Adjusted Rand Index between leiden_GraphST labels and ground truth. -- ARI = adjusted_rand_score(obs_df['leiden_GraphST'], obs_df['Ground Truth']) +# Line 94: Prints the adjusted rand index of leiden_GraphST vs ground truth. -- print('leiden_GraphST: Adjusted rand index = %.2f' %ARI) +# Line 96: Calculates and prints Adjusted Rand Index between louvain_GraphST labels and ground truth. -- ARI = adjusted_rand_score(obs_df['louvain_GraphST'], obs_df['Ground Truth']) +# Line 97: Prints the adjusted rand index of louvain_GraphST vs ground truth. -- print('louvain_GraphST: Adjusted rand index = %.2f' %ARI) +# Line 99: Calculates and prints Adjusted Rand Index between mclust_STAGATE labels and ground truth. -- ARI = adjusted_rand_score(obs_df['mclust_STAGATE'], obs_df['Ground Truth']) +# Line 100: Prints the adjusted rand index of mclust_STAGATE vs ground truth.
-- print('mclust_STAGATE: Adjusted rand index = %.2f' %ARI) +# Line 102: Calculates and prints Adjusted Rand Index between leiden_STAGATE labels and ground truth. -- ARI = adjusted_rand_score(obs_df['leiden_STAGATE'], obs_df['Ground Truth']) +# Line 103: Prints the adjusted rand index of leiden_STAGATE vs ground truth. -- print('leiden_STAGATE: Adjusted rand index = %.2f' %ARI) +# Line 105: Calculates and prints Adjusted Rand Index between louvain_STAGATE labels and ground truth. -- ARI = adjusted_rand_score(obs_df['louvain_STAGATE'], obs_df['Ground Truth']) +# Line 106: Prints the adjusted rand index of louvain_STAGATE vs ground truth. -- print('louvain_STAGATE: Adjusted rand index = %.2f' %ARI) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py new file mode 100644 index 00000000..5023d6de --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py @@ -0,0 +1,56 @@ +```python +# Line 1: Imports the csr_matrix class from the scipy.sparse module for creating sparse matrices. -- from scipy.sparse import csr_matrix +# Line 2: Imports the omicverse library as ov. -- import omicverse as ov +# Line 3: Imports the scanpy library as sc. -- import scanpy as sc +# Line 4: Imports the anndata library as ad. -- import anndata as ad +# Line 5: Imports the pandas library as pd. -- import pandas as pd +# Line 6: Imports the os module for interacting with the operating system. -- import os +# Line 8: Sets the plotting style for omicverse. -- ov.utils.ov_plot_set() +# Line 10: Initializes an empty list called Batch_list. -- Batch_list = [] +# Line 11: Initializes an empty list called adj_list. -- adj_list = [] +# Line 12: Defines a list of section IDs, likely corresponding to different datasets. -- section_ids = ['Slide-seqV2_MoB', 'Stereo-seq_MoB'] +# Line 13: Prints the section IDs. 
-- print(section_ids) +# Line 14: Defines a variable 'pathway' which is a string representing the path to the STAligner directory. -- pathway = '/storage/zengjianyangLab/hulei/scRNA-seq/scripts/STAligner' +# Line 16: Starts a loop that iterates through each section ID. -- for section_id in section_ids: +# Line 17: Prints the current section ID. -- print(section_id) +# Line 18: Reads an AnnData object from an h5ad file based on the section ID. -- adata = sc.read_h5ad(os.path.join(pathway,section_id+".h5ad")) +# Line 20: Checks if the data matrix (adata.X) is a pandas DataFrame. -- if isinstance(adata.X, pd.DataFrame): +# Line 21: If adata.X is a DataFrame, converts it to a sparse matrix in CSR format. -- adata.X = csr_matrix(adata.X) +# Line 22: If adata.X is not a DataFrame, pass does nothing and continue -- else: +# Line 23: Does nothing if the if statement on Line 20 is false -- pass +# Line 25: Makes the variable names in adata unique by appending '++' to duplicates. -- adata.var_names_make_unique(join="++") +# Line 28: Creates unique observation names by appending the section ID. -- adata.obs_names = [x+'_'+section_id for x in adata.obs_names] +# Line 31: Calculates spatial network based on spot coordinates and saves to adata.uns['adj']. -- ov.space.Cal_Spatial_Net(adata, rad_cutoff=50) # the spatial network are saved in adata.uns[‘adj’] +# Line 34: Identifies highly variable genes using the seurat_v3 method, selecting the top 10000. -- sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=10000) +# Line 35: Normalizes counts such that each cell has a total count equal to target_sum. -- sc.pp.normalize_total(adata, target_sum=1e4) +# Line 36: Applies a log transformation to normalized counts. -- sc.pp.log1p(adata) +# Line 38: Subsets the AnnData object to only keep highly variable genes. -- adata = adata[:, adata.var['highly_variable']] +# Line 39: Appends the adjacency matrix to adj_list. 
-- adj_list.append(adata.uns['adj']) +# Line 40: Appends the processed AnnData object to the Batch_list. -- Batch_list.append(adata) +# Line 43: Displays the list of AnnData objects that have been loaded. -- Batch_list +# Line 45: Concatenates the AnnData objects in Batch_list into a single object, adding a 'slice_name' column and using section_ids as keys. -- adata_concat = ad.concat(Batch_list, label="slice_name", keys=section_ids) +# Line 46: Creates a batch_name column from the slice_name column and converts to category type -- adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category') +# Line 47: Prints the shape of the concatenated AnnData object. -- print('adata_concat.shape: ', adata_concat.shape) +# Line 49: Jupyter cell magic that measures the execution time of this cell. -- %%time +# Line 50: Creates a list of tuples indicating the order of slice integration (iter_comb). -- iter_comb = [(i, i + 1) for i in range(len(section_ids) - 1)] +# Line 53: Creates an STAligner object for integrating spatial transcriptomics data. -- STAligner_obj = ov.space.pySTAligner(adata_concat, verbose=True, knn_neigh = 100, n_epochs = 600, iter_comb = iter_comb, +# Line 54: Initializes the STAligner object with the concatenated adata, sets training parameters, and specifies the batch key. -- batch_key = 'batch_name', key_added='STAligner', Batch_list = Batch_list) +# Line 56: Trains the STAligner model. -- STAligner_obj.train() +# Line 58: Gets the predicted latent representation from the trained model. -- adata = STAligner_obj.predicted() +# Line 60: Computes the nearest neighbors graph using the STAligner embedding. -- sc.pp.neighbors(adata, use_rep='STAligner', random_state=666) +# Line 61: Clusters the cells using the Leiden algorithm based on the STAligner representation. -- ov.utils.cluster(adata,use_rep='STAligner',method='leiden',resolution=0.4) +# Line 62: Runs UMAP for dimensionality reduction using the STAligner embedding.
-- sc.tl.umap(adata, random_state=666) +# Line 63: Generates and displays a UMAP plot, coloring by batch and cluster, and sets plot space. -- sc.pl.umap(adata, color=['batch_name',"leiden"],wspace=0.5) +# Line 66: Imports the matplotlib.pyplot module as plt for plotting. -- import matplotlib.pyplot as plt +# Line 67: Sets spot size for the spatial plots. -- spot_size = 50 +# Line 68: Sets title size for the spatial plots. -- title_size = 15 +# Line 69: Creates a figure and two subplots for spatial visualization. -- fig, ax = plt.subplots(1, 2, figsize=(6, 3), gridspec_kw={'wspace': 0.05, 'hspace': 0.2}) +# Line 70: Generates spatial plots for 'Slide-seqV2_MoB' colored by cluster and removes the legend. -- _sc_0 = sc.pl.spatial(adata[adata.obs['batch_name'] == 'Slide-seqV2_MoB'], img_key=None, color=['leiden'], title=['Slide-seqV2'], +# Line 71: Sets parameters for the first spatial plot such as legend font size, show plot flag, axis object, and removes plot frame -- legend_fontsize=10, show=False, ax=ax[0], frameon=False, spot_size=spot_size, legend_loc=None) +# Line 72: Sets the title of the first spatial plot and sets the size. -- _sc_0[0].set_title('Slide-seqV2', size=title_size) +# Line 74: Generates spatial plots for 'Stereo-seq_MoB' colored by cluster and removes the legend. -- _sc_1 = sc.pl.spatial(adata[adata.obs['batch_name'] == 'Stereo-seq_MoB'], img_key=None, color=['leiden'], title=['Stereo-seq'], +# Line 75: Sets parameters for the second spatial plot such as legend font size, show plot flag, axis object, and removes plot frame -- legend_fontsize=10, show=False, ax=ax[1], frameon=False, spot_size=spot_size) +# Line 76: Sets the title of the second spatial plot and sets the size. -- _sc_1[0].set_title('Stereo-seq',size=title_size) +# Line 77: Inverts the y-axis of the second spatial plot. -- _sc_1[0].invert_yaxis() +# Line 78: Displays the generated plots. 
-- plt.show() +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_starfysh_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_starfysh_annotated.py new file mode 100644 index 00000000..118c0308 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_starfysh_annotated.py @@ -0,0 +1,132 @@ +``` +# Line 1: # Line 1: Import the scanpy library for single-cell analysis. -- import scanpy as sc +# Line 2: # Line 2: Import the omicverse library for multi-omics analysis. -- import omicverse as ov +# Line 3: # Line 3: Set plotting parameters using omicverse. -- ov.plot_set() +# Line 5: # Line 5: Import specific modules from the starfysh package within omicverse. -- from omicverse.externel.starfysh import (AA, utils, plot_utils, post_analysis) +# Line 6: # Line 6: Import the starfysh model implementation. -- from omicverse.externel.starfysh import _starfysh as sf_model +# Line 8: # Line 8: Specify the path to the data directory. -- data_path = 'data/star_data' +# Line 9: # Line 9: Specify the sample ID. -- sample_id = 'CID44971_TNBC' +# Line 10: # Line 10: Specify the name of the signature gene file. -- sig_name = 'bc_signatures_version_1013.csv' +# Line 12: # Line 12: Load the AnnData object and normalized data using the specified paths and sample id, keeping 2000 highly variable genes. -- adata, adata_normed = utils.load_adata(data_folder=data_path, +# Line 13: # Line 13: Specify the sample id (comment). -- sample_id=sample_id, # sample id +# Line 14: # Line 14: Specify the number of highly variable genes to keep. -- n_genes=2000 # number of highly variable genes to keep +# Line 16: # Line 16: Import the pandas library for data manipulation. -- import pandas as pd +# Line 17: # Line 17: Import the os library for file system interactions. -- import os +# Line 18: # Line 18: Read the gene signature file into a pandas DataFrame. 
-- gene_sig = pd.read_csv(os.path.join(data_path, sig_name)) +# Line 19: # Line 19: Filter the gene signature DataFrame based on genes present in the AnnData object. -- gene_sig = utils.filter_gene_sig(gene_sig, adata.to_df()) +# Line 20: # Line 20: Display the head of the gene signature DataFrame. -- gene_sig.head() +# Line 22: # Line 22: Preprocess the image data, extracting spatial information. -- img_metadata = utils.preprocess_img(data_path, +# Line 23: # Line 23: Pass the sample ID for preprocessing. -- sample_id, +# Line 24: # Line 24: Pass the adata index for preprocessing. -- adata_index=adata.obs.index, +# Line 26: # Line 26: Extract the image, map information, and scaling factor from the processed metadata. -- img, map_info, scalefactor = img_metadata['img'], img_metadata['map_info'], img_metadata['scalefactor'] +# Line 27: # Line 27: Calculate the UMAP embeddings for the AnnData object. -- umap_df = utils.get_umap(adata, display=True) +# Line 30: # Line 30: Import the matplotlib library for plotting. -- import matplotlib.pyplot as plt +# Line 31: # Line 31: Create a new figure with a specific size and resolution for the image. -- plt.figure(figsize=(6, 6), dpi=80) +# Line 32: # Line 32: Display the loaded image. -- plt.imshow(img) +# Line 34: # Line 34: Display the head of the spatial mapping information DataFrame. -- map_info.head() +# Line 36: # Line 36: Define the arguments for the Visium analysis, including adata, gene signatures, and spatial data. -- visium_args = utils.VisiumArguments(adata, +# Line 37: # Line 37: Include the normalized adata. -- adata_normed, +# Line 38: # Line 38: Include the gene signatures. -- gene_sig, +# Line 39: # Line 39: Include the img_metadata. -- img_metadata, +# Line 40: # Line 40: Specify the number of anchor spots. -- n_anchors=60, +# Line 41: # Line 41: Specify the window size for spatial analysis. -- window_size=3, +# Line 42: # Line 42: Specify the sample ID. 
-- sample_id=sample_id +# Line 44: # Line 44: Get the modified AnnData and normalized data using the VisiumArguments object. -- adata, adata_normed = visium_args.get_adata() +# Line 45: # Line 45: Get the anchor spot DataFrame using the VisiumArguments object. -- anchors_df = visium_args.get_anchors() +# Line 47: # Line 47: Add a log library size column to the AnnData's observation data using the VisiumArguments object. -- adata.obs['log library size']=visium_args.log_lib +# Line 48: # Line 48: Add a windowed log library size column to the AnnData's observation data using the VisiumArguments object. -- adata.obs['windowed log library size']=visium_args.win_loglib +# Line 50: # Line 50: Plot the spatial distribution of 'log library size' using scanpy. -- sc.pl.spatial(adata, cmap='magma', +# Line 52: # Line 52: Specify the feature to color by, and the number of columns. -- color='log library size', +# Line 53: # Line 53: Specify plot parameters. -- ncols=4, size=1.3, +# Line 54: # Line 54: Specify image key. -- img_key='hires', +# Line 59: # Line 59: Plot the spatial distribution of 'windowed log library size' using scanpy. -- sc.pl.spatial(adata, cmap='magma', +# Line 61: # Line 61: Specify the feature to color by, and the number of columns. -- color='windowed log library size', +# Line 62: # Line 62: Specify plot parameters. -- ncols=4, size=1.3, +# Line 63: # Line 63: Specify image key. -- img_key='hires', +# Line 68: # Line 68: Plot the spatial distribution of 'IL7R' gene expression using scanpy. -- sc.pl.spatial(adata, cmap='magma', +# Line 70: # Line 70: Specify the feature to color by, and the number of columns. -- color='IL7R', +# Line 71: # Line 71: Specify plot parameters. -- ncols=4, size=1.3, +# Line 72: # Line 72: Specify image key. -- img_key='hires', +# Line 77: # Line 77: Plot the anchor spots and their corresponding signatures using the plot_utils. -- plot_utils.plot_anchor_spots(umap_df, +# Line 78: # Line 78: Pass pure spots. 
-- visium_args.pure_spots, +# Line 79: # Line 79: Pass the signature means. -- visium_args.sig_mean, +# Line 80: # Line 80: Specify the bounding box x coordinate. -- bbox_x=2 +# Line 82: # Line 82: Initialize an ArchetypalAnalysis object using the normalized AnnData object. -- aa_model = AA.ArchetypalAnalysis(adata_orig=adata_normed) +# Line 83: # Line 83: Compute archetypes and return archetypal scores, dictionary of archetypes, major index, and explained variance. -- archetype, arche_dict, major_idx, evs = aa_model.compute_archetypes(cn=40) +# Line 85: # Line 85: Find the archetypal spots, using major archetypes. -- arche_df = aa_model.find_archetypal_spots(major=True) +# Line 87: # Line 87: Find marker genes associated with each archetypal cluster. -- markers_df = aa_model.find_markers(n_markers=30, display=False) +# Line 89: # Line 89: Map the archetypes to the closest anchors. -- map_df, map_dict = aa_model.assign_archetypes(anchors_df) +# Line 91: # Line 91: Find the most distant archetypes that are not assigned to any annotated cell types. -- distant_arches = aa_model.find_distant_archetypes(anchors_df, n=3) +# Line 93: # Line 93: Plot the explained variance ratios. -- plot_utils.plot_evs(evs, kmin=aa_model.kmin) +# Line 95: # Line 95: Plot the archetypes. -- aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False) +# Line 97: # Line 97: Plot the archetype mapping results. -- aa_model.plot_mapping(map_df) +# Line 99: # Line 99: Refine the anchor spots based on the results of archetypal analysis. -- visium_args = utils.refine_anchors( +# Line 100: # Line 100: Pass visium arguments. -- visium_args, +# Line 101: # Line 101: Pass archetypal analysis model. -- aa_model, +# Line 103: # Line 103: Specify number of genes. -- n_genes=5, +# Line 106: # Line 106: Get the updated AnnData object and normalized data. -- adata, adata_normed = visium_args.get_adata() +# Line 107: # Line 107: Get the updated gene signatures. 
-- gene_sig = visium_args.gene_sig +# Line 108: # Line 108: Get the cell type names from gene signature. -- cell_types = gene_sig.columns +# Line 110: # Line 110: Import the torch library for deep learning. -- import torch +# Line 111: # Line 111: Specify the number of repeats for model training. -- n_repeats = 3 +# Line 112: # Line 112: Specify the number of epochs for model training. -- epochs = 200 +# Line 113: # Line 113: Specify the patience for early stopping. -- patience = 50 +# Line 114: # Line 114: Set the device to GPU if available, otherwise use CPU. -- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +# Line 116: # Line 116: Run the starfysh model training using the provided parameters. -- model, loss = utils.run_starfysh(visium_args, +# Line 117: # Line 117: Specify the number of repeats for model training. -- n_repeats=n_repeats, +# Line 118: # Line 118: Specify the number of epochs. -- epochs=epochs, +# Line 120: # Line 120: Pass the device. -- device=device +# Line 122: # Line 122: Get updated adata objects after training. -- adata, adata_normed = visium_args.get_adata() +# Line 123: # Line 123: Evaluate the trained starfysh model and get inference and generative outputs. -- inference_outputs, generative_outputs,adata_ = sf_model.model_eval(model, +# Line 124: # Line 124: Pass the adata object. -- adata, +# Line 125: # Line 125: Pass the visium arguments. -- visium_args, +# Line 126: # Line 126: Specify poe. -- poe=False, +# Line 127: # Line 127: Pass the device to use. -- device=device) +# Line 129: # Line 129: Import the numpy library for numerical operations. -- import numpy as np +# Line 130: # Line 130: Get the number of cell types from the gene signature. -- n_cell_types = gene_sig.shape[1] +# Line 131: # Line 131: Select a random index for cell type plotting. -- idx = np.random.randint(0, n_cell_types) +# Line 132: # Line 132: Plot the mean expression versus inferred proportion for a random cell type. 
-- post_analysis.gene_mean_vs_inferred_prop(inference_outputs, +# Line 133: # Line 133: Pass the VisiumArguments. -- visium_args, +# Line 134: # Line 134: Pass the index. -- idx=idx, +# Line 135: # Line 135: Specify figure size. -- figsize=(4,4) +# Line 137: # Line 137: Plot the spatial distribution of inferred expression for the 'ql_m' feature. -- plot_utils.pl_spatial_inf_feature(adata_, feature='ql_m', cmap='Blues') +# Line 139: # Line 139: Define a function to convert cell data to proportion data. -- def cell2proportion(adata): +# Line 140: # Line 140: Create a new AnnData object for plotting using the expression matrix of the given adata. -- adata_plot=sc.AnnData(adata.X) +# Line 141: # Line 141: Copy the observation data to the new AnnData object. -- adata_plot.obs=utils.extract_feature(adata_, 'qc_m').obs.copy() +# Line 142: # Line 142: Copy the variable data. -- adata_plot.var=adata.var.copy() +# Line 143: # Line 143: Copy the observation matrix. -- adata_plot.obsm=adata.obsm.copy() +# Line 144: # Line 144: Copy the observation pair wise data. -- adata_plot.obsp=adata.obsp.copy() +# Line 145: # Line 145: Copy the unstructured data. -- adata_plot.uns=adata.uns.copy() +# Line 146: # Line 146: Return the new AnnData object. -- return adata_plot +# Line 147: # Line 147: Convert the adata_ object to a proportion object by calling cell2proportion function. -- adata_plot=cell2proportion(adata_) +# Line 149: # Line 149: Show the adata_plot object. -- adata_plot +# Line 151: # Line 151: Plot the spatial distribution of Basal, LumA, LumB. -- sc.pl.spatial(adata_plot, cmap='Spectral_r', +# Line 153: # Line 153: Specify the features to color by and number of columns. -- color=['Basal','LumA','LumB'], +# Line 154: # Line 154: Specify plot parameters. -- ncols=4, size=1.3, +# Line 155: # Line 155: Specify the image key. -- img_key='hires', +# Line 156: # Line 156: Specify the min and max value for coloring. 
-- vmin=0, vmax='p90' +# Line 159: # Line 159: Plot UMAP embeddings colored by the expression of Basal, LumA, MBC, and Normal epithelial. -- ov.pl.embedding(adata_plot, +# Line 160: # Line 160: Specify the basis. -- basis='z_umap', +# Line 161: # Line 161: Specify the features to color by. -- color=['Basal', 'LumA', 'MBC', 'Normal epithelial'], +# Line 162: # Line 162: Specify frameon parameter. -- frameon='small', +# Line 163: # Line 163: Specify the min and max values, as well as cmap. -- vmin=0, vmax='p90', +# Line 164: # Line 164: Specify the color map. -- cmap='Spectral_r', +# Line 167: # Line 167: Predict cell type-specific expression using the trained model. -- pred_exprs = sf_model.model_ct_exp(model, +# Line 168: # Line 168: Pass the adata object. -- adata, +# Line 169: # Line 169: Pass the visium arguments. -- visium_args, +# Line 170: # Line 170: Pass the device. -- device=device) +# Line 172: # Line 172: Specify the gene and celltype for visualization. -- gene='IL7R' +# Line 173: # Line 173: Specify the gene and celltype for visualization. -- gene_celltype='Tem' +# Line 174: # Line 174: Add inferred expression of specified gene/cell type to adata_.layers. -- adata_.layers[f'infer_{gene_celltype}']=pred_exprs[gene_celltype] +# Line 176: # Line 176: Plot the spatial distribution of the predicted 'IL7R' expression. -- sc.pl.spatial(adata_, cmap='Spectral_r', +# Line 178: # Line 178: Specify the color to use for plotting, plot title, and layer name. -- color=gene, +# Line 179: # Line 179: Specify the title for the plot. -- title=f'{gene} (Predicted expression)\n{gene_celltype}', +# Line 180: # Line 180: Specify the layer to color by. -- layer=f'infer_{gene_celltype}', +# Line 181: # Line 181: Specify plot parameters. -- ncols=4, size=1.3, +# Line 182: # Line 182: Specify image key. -- img_key='hires', +# Line 187: # Line 187: Specify the output directory. -- outdir = './results/' +# Line 188: # Line 188: Create the output directory if it doesn't exist. 
-- if not os.path.exists(outdir): +# Line 189: # Line 189: Create output directory. -- os.mkdir(outdir) +# Line 191: # Line 191: Save the trained model's state dictionary to disk. -- torch.save(model.state_dict(), os.path.join(outdir, 'starfysh_model.pt')) +# Line 193: # Line 193: Save the AnnData object to disk in h5ad format. -- adata.write(os.path.join(outdir, 'st.h5ad')) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py new file mode 100644 index 00000000..a2845c8d --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py @@ -0,0 +1,70 @@ +```python +# Line 1: Import the omicverse library -- import omicverse as ov +# Line 3: Import the scvelo library as scv -- import scvelo as scv +# Line 4: Import the scanpy library as sc -- import scanpy as sc +# Line 5: Set plot parameters using omicverse -- ov.plot_set() +# Line 7: Read an h5ad file into an AnnData object named adata -- adata = sc.read_h5ad('mouse_brain.h5ad') +# Line 8: Show the AnnData object -- adata +# Line 10: Create an STT object with spatial and region information -- STT_obj=ov.space.STT(adata,spatial_loc='xy_loc',region='Region') +# Line 12: Estimate stages for the STT object -- STT_obj.stage_estimate() +# Line 14: Train the STT model with specific parameters -- STT_obj.train(n_states = 9, n_iter = 15, weight_connectivities = 0.5, +# Line 15: Continue training the STT model with specific parameters -- n_neighbors = 50,thresh_ms_gene = 0.2, spa_weight =0.3) +# Line 17: Plot embedding colored by attractors -- ov.pl.embedding(adata, basis="xy_loc", +# Line 18: Continue plotting embedding colored by attractors -- color=["attractor"],frameon='small', +# Line 19: Continue plotting embedding colored by attractors -- palette=ov.pl.sc_color[11:]) +# Line 21: Plot embedding colored by region -- ov.pl.embedding(adata, basis="xy_loc", +# Line 22: Continue plotting embedding colored 
by region -- color=["Region"],frameon='small', +# Line 23: Continue plotting embedding colored by region -- ) +# Line 25: Prepare pathway gene sets from a file -- pathway_dict=ov.utils.geneset_prepare('genesets/KEGG_2019_Mouse.txt',organism='Mouse') +# Line 27: Compute pathway scores -- STT_obj.compute_pathway(pathway_dict) +# Line 29: Create a plot of pathway scores -- fig = STT_obj.plot_pathway(figsize = (10,8),size = 100,fontsize = 12) +# Line 30: Loop through the axes of the pathway plot -- for ax in fig.axes: +# Line 31: Set x-axis label for pathway plot with specified font size -- ax.set_xlabel('Embedding 1', fontsize=20) # Adjust font size as needed +# Line 32: Set y-axis label for pathway plot with specified font size -- ax.set_ylabel('Embedding 2', fontsize=20) # Adjust font size as needed +# Line 33: Show the pathway plot -- fig.show() +# Line 35: Import the matplotlib.pyplot module -- import matplotlib.pyplot as plt +# Line 36: Create a figure and axes for a single plot -- fig, ax = plt.subplots(1, 1, figsize=(4, 4)) +# Line 37: Plot tensor pathway for 'Wnt signaling pathway' -- STT_obj.plot_tensor_pathway(pathway_name = 'Wnt signaling pathway',basis = 'xy_loc', +# Line 38: Continue plotting tensor pathway -- ax=ax) +# Line 40: Create another figure and axes for a single plot -- fig, ax = plt.subplots(1, 1, figsize=(4, 4)) +# Line 41: Plot tensor pathway for 'TGF-beta signaling pathway' -- STT_obj.plot_tensor_pathway( 'TGF-beta signaling pathway',basis = 'xy_loc', +# Line 42: Continue plotting tensor pathway -- ax=ax) +# Line 44: Plot tensor for attractors with filter and density parameters -- STT_obj.plot_tensor(list_attractor = [1,3,5,6], +# Line 45: Continue plotting tensor for attractors with filter and density parameters -- filter_cells = True, member_thresh = 0.1, density = 1) +# Line 48: Construct landscape for STT object using coordinate key -- STT_obj.construct_landscape(coord_key = 'X_xy_loc') +# Line 50: Plot embedding colored by attractors 
and regions on the transformed coordinates -- sc.pl.embedding(adata, color = ['attractor', 'Region'],basis= 'trans_coord') +# Line 52: Infer lineage information with specific parameters -- STT_obj.infer_lineage(si=3,sf=4, method = 'MPPT',flux_fraction=0.8,color_palette_name = 'tab10',size_point = 8, +# Line 53: Continue inferring lineage with specific parameters -- size_text=12) +# Line 55: Create a sankey diagram -- fig = STT_obj.plot_sankey(adata.obs['attractor'].tolist(),adata.obs['Region'].tolist()) +# Line 60: Write AnnData object to a file -- STT_obj.adata.write('data/mouse_brain_adata.h5ad') +# Line 61: Write aggregated AnnData object to a file -- STT_obj.adata_aggr.write('data/mouse_brain_adata_aggr.h5ad') +# Line 63: Read AnnData object from a file -- adata=ov.read('data/mouse_brain_adata.h5ad') +# Line 64: Read aggregated AnnData object from a file -- adata_aggr=ov.read('data/mouse_brain_adata_aggr.h5ad') +# Line 66: Create an STT object with spatial and region information, loading from adata -- STT_obj=ov.space.STT(adata,spatial_loc='xy_loc',region='Region') +# Line 67: Load the STT object with stored data -- STT_obj.load(adata,adata_aggr) +# Line 69: Sort the variable r2_test values -- adata.var['r2_test'].sort_values(ascending=False) +# Line 71: Plot the top genes -- STT_obj.plot_top_genes(top_genes = 6, ncols = 2, figsize = (8,8),) +# Line 73: Import the matplotlib.pyplot module -- import matplotlib.pyplot as plt +# Line 74: Create a figure and axes for a subplot -- fig, axes = plt.subplots(1, 4, figsize=(12, 3)) +# Line 75: Plot embedding for 'Sim1' expression using Ms layer -- ov.pl.embedding(adata, basis="xy_loc", +# Line 76: Continue plotting embedding for 'Sim1' expression using Ms layer -- color=["Sim1"],frameon='small', +# Line 77: Continue plotting embedding for 'Sim1' expression using Ms layer -- title='Sim1:Ms',show=False, +# Line 78: Continue plotting embedding for 'Sim1' expression using Ms layer -- layer='Ms',cmap='RdBu_r',ax=axes[0] +# 
Line 79: Continue plotting embedding for 'Sim1' expression using Ms layer -- ) +# Line 80: Plot embedding for 'Sim1' expression using Mu layer -- ov.pl.embedding(adata, basis="xy_loc", +# Line 81: Continue plotting embedding for 'Sim1' expression using Mu layer -- color=["Sim1"],frameon='small', +# Line 82: Continue plotting embedding for 'Sim1' expression using Mu layer -- title='Sim1:Mu',show=False, +# Line 83: Continue plotting embedding for 'Sim1' expression using Mu layer -- layer='Mu',cmap='RdBu_r',ax=axes[1] +# Line 84: Continue plotting embedding for 'Sim1' expression using Mu layer -- ) +# Line 85: Plot embedding for 'Sim1' expression using velocity layer -- ov.pl.embedding(adata, basis="xy_loc", +# Line 86: Continue plotting embedding for 'Sim1' expression using velocity layer -- color=["Sim1"],frameon='small', +# Line 87: Continue plotting embedding for 'Sim1' expression using velocity layer -- title='Sim1:Velo',show=False, +# Line 88: Continue plotting embedding for 'Sim1' expression using velocity layer -- layer='velo',cmap='RdBu_r',ax=axes[2] +# Line 89: Continue plotting embedding for 'Sim1' expression using velocity layer -- ) +# Line 90: Plot embedding for 'Sim1' expression using expression layer -- ov.pl.embedding(adata, basis="xy_loc", +# Line 91: Continue plotting embedding for 'Sim1' expression using expression layer -- color=["Sim1"],frameon='small', +# Line 92: Continue plotting embedding for 'Sim1' expression using expression layer -- title='Sim1:exp',show=False, +# Line 94: Continue plotting embedding for 'Sim1' expression using expression layer -- cmap='RdBu_r',ax=axes[3] +# Line 95: Continue plotting embedding for 'Sim1' expression using expression layer -- ) +# Line 96: Adjust the layout to fit the subplots -- plt.tight_layout() +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_tcga_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_tcga_annotated.py new file mode 100644 index 00000000..122006d6 
--- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_tcga_annotated.py @@ -0,0 +1,23 @@ +``` +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the scanpy library as sc. -- import scanpy as sc +# Line 3: Set plotting parameters using the omicverse library. -- ov.plot_set() +# Line 5: Assign the path to the GDC sample sheet to the variable gdc_sample_sheep. -- gdc_sample_sheep='data/TCGA_OV/gdc_sample_sheet.2024-07-05.tsv' +# Line 6: Assign the path to the GDC downloaded files to the variable gdc_download_files. -- gdc_download_files='data/TCGA_OV/gdc_download_20240705_180129.081531' +# Line 7: Assign the path to the clinical cart file to the variable clinical_cart. -- clinical_cart='data/TCGA_OV/clinical.cart.2024-07-05' +# Line 8: Create a pyTCGA object using the defined file paths and assign it to aml_tcga. -- aml_tcga=ov.bulk.pyTCGA(gdc_sample_sheep,gdc_download_files,clinical_cart) +# Line 9: Initialize the AnnData object within the pyTCGA object aml_tcga. -- aml_tcga.adata_init() +# Line 11: Write the AnnData object within aml_tcga to an h5ad file with gzip compression. -- aml_tcga.adata.write_h5ad('data/TCGA_OV/ov_tcga_raw.h5ad',compression='gzip') +# Line 13: Assign the path to the GDC sample sheet to the variable gdc_sample_sheep. -- gdc_sample_sheep='data/TCGA_OV/gdc_sample_sheet.2024-07-05.tsv' +# Line 14: Assign the path to the GDC downloaded files to the variable gdc_download_files. -- gdc_download_files='data/TCGA_OV/gdc_download_20240705_180129.081531' +# Line 15: Assign the path to the clinical cart file to the variable clinical_cart. -- clinical_cart='data/TCGA_OV/clinical.cart.2024-07-05' +# Line 16: Create a pyTCGA object using the defined file paths and assign it to aml_tcga. -- aml_tcga=ov.bulk.pyTCGA(gdc_sample_sheep,gdc_download_files,clinical_cart) +# Line 17: Read an AnnData object from an h5ad file into aml_tcga. 
-- aml_tcga.adata_read('data/TCGA_OV/ov_tcga_raw.h5ad') +# Line 19: Initialize the metadata for the AnnData object within aml_tcga. -- aml_tcga.adata_meta_init() +# Line 21: Initialize survival information in the pyTCGA object. -- aml_tcga.survial_init() +# Line 22: Access the AnnData object within aml_tcga. -- aml_tcga.adata +# Line 24: Perform a survival analysis for the gene 'MYC' using deseq normalized data and plot the result. -- aml_tcga.survival_analysis('MYC',layer='deseq_normalize',plot=True) +# Line 26: Perform survival analysis for all genes in the dataset. -- aml_tcga.survial_analysis_all() +# Line 27: Access the AnnData object within aml_tcga. -- aml_tcga.adata +# Line 29: Write the modified AnnData object in aml_tcga to an h5ad file with gzip compression. -- aml_tcga.adata.write_h5ad('data/TCGA_OV/ov_tcga_survial_all.h5ad',compression='gzip') +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_tosica_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_tosica_annotated.py new file mode 100644 index 00000000..b8f86a2b --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_tosica_annotated.py @@ -0,0 +1,77 @@ +``` +# Line 1: Import the omicverse library as ov. -- import omicverse as ov +# Line 2: Import the scanpy library as sc. -- import scanpy as sc +# Line 3: Set the plotting style for omicverse. -- ov.utils.ov_plot_set() +# Line 4: Read the reference data from a h5ad file into an AnnData object. -- ref_adata = sc.read('demo_train.h5ad') +# Line 5: Select all rows and columns with original var_names for ref_adata. -- ref_adata = ref_adata[:,ref_adata.var_names] +# Line 6: Print the ref_adata AnnData object. -- print(ref_adata) +# Line 7: Print the value counts of the 'Celltype' column in ref_adata's obs. -- print(ref_adata.obs.Celltype.value_counts()) +# Line 8: Read the query data from a h5ad file into an AnnData object. 
-- query_adata = sc.read('demo_test.h5ad') +# Line 9: Select all rows and columns of query_adata based on ref_adata's var_names. -- query_adata = query_adata[:,ref_adata.var_names] +# Line 10: Print the query_adata AnnData object. -- print(query_adata) +# Line 11: Print the value counts of the 'Celltype' column in query_adata's obs. -- print(query_adata.obs.Celltype.value_counts()) +# Line 12: Make the variable names of ref_adata unique. -- ref_adata.var_names_make_unique() +# Line 13: Make the variable names of query_adata unique. -- query_adata.var_names_make_unique() +# Line 14: Find the intersection of variable names between query_adata and ref_adata and store as a list. -- ret_gene=list(set(query_adata.var_names) & set(ref_adata.var_names)) +# Line 15: Calculate the length of the ret_gene list. -- len(ret_gene) +# Line 16: Subset query_adata to keep only the genes in ret_gene. -- query_adata=query_adata[:,ret_gene] +# Line 17: Subset ref_adata to keep only the genes in ret_gene. -- ref_adata=ref_adata[:,ret_gene] +# Line 18: Print the maximum value of the X matrix for both ref_adata and query_adata. -- print(f"The max of ref_adata is {ref_adata.X.max()}, query_data is {query_adata.X.max()}",) +# Line 19: Download the TOSICA gene set file from the omicverse utils. -- ov.utils.download_tosica_gmt() +# Line 20: Initialize a pyTOSICA object using ref_adata, specifying gene set path, depth, label, project path and batch size. -- tosica_obj=ov.single.pyTOSICA(adata=ref_adata, +# Line 21: gmt_path='genesets/GO_bp.gmt', depth=1, +# Line 22: label_name='Celltype', +# Line 23: project_path='hGOBP_demo', +# Line 24: batch_size=8) +# Line 25: Train the TOSICA model for 5 epochs. -- tosica_obj.train(epochs=5) +# Line 26: Save the trained TOSICA model. -- tosica_obj.save() +# Line 27: Load the trained TOSICA model. -- tosica_obj.load() +# Line 28: Predict cell states for query_adata using the trained TOSICA model, saving results into new_adata. 
-- new_adata=tosica_obj.predicted(pre_adata=query_adata) +# Line 29: Scale the query_adata object. -- ov.pp.scale(query_adata) +# Line 30: Perform PCA on the scaled data of query_adata, keeping 50 components. -- ov.pp.pca(query_adata,layer='scaled',n_pcs=50) +# Line 31: Compute the neighborhood graph of query_adata using the scaled PCA data with 15 neighbors. -- sc.pp.neighbors(query_adata, n_neighbors=15, n_pcs=50, +# Line 32: use_rep='scaled|original|X_pca') +# Line 33: Compute the minimum-distortion embedding (MDE) of query_adata's scaled PCA data and save as "X_mde". -- query_adata.obsm["X_mde"] = ov.utils.mde(query_adata.obsm["scaled|original|X_pca"]) +# Line 34: Display the modified query_adata object. -- query_adata +# Line 35: Copy the obsm from query_adata to new_adata based on the overlapping obs indices. -- new_adata.obsm=query_adata[new_adata.obs.index].obsm.copy() +# Line 36: Copy the obsp from query_adata to new_adata based on the overlapping obs indices. -- new_adata.obsp=query_adata[new_adata.obs.index].obsp.copy() +# Line 37: Display the modified new_adata object. -- new_adata +# Line 38: Import the numpy library as np. -- import numpy as np +# Line 39: Create a numpy array of hex color codes as a string of unicode characters, and set the dtype. -- col = np.array([ +# Line 40: "#98DF8A","#E41A1C" ,"#377EB8", "#4DAF4A" ,"#984EA3" ,"#FF7F00" ,"#FFFF33" ,"#A65628" ,"#F781BF" ,"#999999","#1F77B4","#FF7F0E","#279E68","#FF9896" +# Line 41: ]).astype('', color='gray'),size=12) +# Line 23: Sets the title of the plot with a specified fontsize. -- plt.title('Venn4',fontsize=13) +# Line 25: Saves the current figure to a PNG file with specified DPI and bounding box settings. -- fig.savefig("figures/bulk_venn4.png",dpi=300,bbox_inches = 'tight') +# Line 27: Creates a new figure and axes object with a specified figure size. -- fig,ax=plt.subplots(figsize = (4,4)) +# Line 29: Defines a dictionary named 'sets' containing sets of numerical values, using string keys.
-- sets = { +# Line 30: Assigns the set {1, 2, 3} to the key 'Set1:name'. -- 'Set1:name': {1,2,3}, +# Line 31: Assigns the set {1, 2, 3, 4} to the key 'Set2'. -- 'Set2': {1,2,3,4}, +# Line 32: Assigns the set {3, 4} to the key 'Set3'. -- 'Set3': {3,4}, +# Line 35: Creates a Venn diagram using omicverse's venn function, with specified sets, axes, fontsize, and color palette. -- ov.pl.venn(sets=sets,ax=ax,fontsize=5.5, +# Line 36: Sets color palette for venn diagram. -- palette=ov.pl.red_color) +# Line 38: Sets the title of the plot with a specified fontsize. -- plt.title('Venn3',fontsize=13) +# Line 40: Reads a CSV file into a pandas DataFrame using omicverse's read function, using the first column as index. -- result=ov.read('data/dds_result.csv',index_col=0) +# Line 41: Displays the first few rows of the DataFrame using the 'head' method. -- result.head() +# Line 43: Generates a volcano plot using omicverse's volcano function, with various customization options. -- ov.pl.volcano(result,pval_name='qvalue',fc_name='log2FoldChange', +# Line 44: Specifies thresholds for p-value and fold change, as well as limits for the axes. -- pval_threshold=0.05,fc_max=1.5,fc_min=-1.5, +# Line 45: Specifies max values for p-value and foldchange. -- pval_max=10,FC_max=10, +# Line 46: Sets the figure size, title, and title font properties of the volcano plot. -- figsize=(4,4),title='DEGs in Bulk',titlefont={'weight':'normal','size':14,}, +# Line 47: Defines the colors for up-regulated, down-regulated, and non-significant points in volcano plot. -- up_color='#e25d5d',down_color='#7388c1',normal_color='#d7d7d7', +# Line 48: Sets the font colors for up-regulated, down-regulated, and non-significant labels in volcano plot. -- up_fontcolor='#e25d5d',down_fontcolor='#7388c1',normal_fontcolor='#d7d7d7', +# Line 49: Sets the legend position, number of columns, and fontsize for the legend in volcano plot. 
-- legend_bbox=(0.8, -0.2),legend_ncol=2,legend_fontsize=12, +# Line 50: Sets parameter for gene plotting and label size for volcano plot. -- plot_genes=None,plot_genes_num=10,plot_genes_fontsize=11, +# Line 51: Sets the fontsize of the ticks in volcano plot. -- ticks_fontsize=12,) +# Line 53: Imports the seaborn library as sns. -- import seaborn as sns +# Line 54: Loads the "tips" dataset from seaborn. -- data = sns.load_dataset("tips") +# Line 55: Displays the first few rows of the "tips" dataset. -- data.head() +# Line 57: Generates a boxplot using omicverse's boxplot function, with specified data, hue, x/y values, palette, figure size, fontsize, and title. -- fig,ax=ov.pl.boxplot(data,hue='sex',x_value='day',y_value='total_bill', +# Line 58: Sets the color palette for the boxplot. -- palette=ov.pl.red_color, +# Line 59: Sets the figure size, font size, and title for the boxplot. -- figsize=(4,2),fontsize=12,title='Tips',) +# Line 61: Adds a p-value annotation to the boxplot using omicverse's add_palue function. -- ov.pl.add_palue(ax,line_x1=-0.5,line_x2=0.5,line_y=40, +# Line 62: Sets the text_y parameter, which controls the vertical placement of the p-value text. -- text_y=0.2, +# Line 63: Adds text for p value annotation. -- text='$p={}$'.format(round(0.001,3)), +# Line 64: Sets the font size and color for the p-value annotation text. -- fontsize=11,fontcolor='#000000', +# Line 65: Sets the horizontal alignment for the p-value annotation.
-- horizontalalignment='center',) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_visualize_colorsystem_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_visualize_colorsystem_annotated.py new file mode 100644 index 00000000..82b0578a --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_visualize_colorsystem_annotated.py @@ -0,0 +1,84 @@ +``` +# Line 1: import omicverse as ov -- import omicverse as ov +# Line 2: import scanpy as sc -- import scanpy as sc +# Line 4: ov.plot_set() -- ov.plot_set() +# Line 6: adata = ov.read('data/DentateGyrus/10X43_1.h5ad') -- adata = ov.read('data/DentateGyrus/10X43_1.h5ad') +# Line 7: adata -- adata +# Line 9: fb=ov.pl.ForbiddenCity() -- fb=ov.pl.ForbiddenCity() +# Line 11: from IPython.display import HTML -- from IPython.display import HTML +# Line 12: HTML(fb.visual_color(loc_range=(0,384), -- HTML(fb.visual_color(loc_range=(0,384), +# Line 13: num_per_row=24)) -- num_per_row=24)) +# Line 15: fb.get_color(name='凝夜紫') -- fb.get_color(name='凝夜紫') +# Line 17: import matplotlib.pyplot as plt -- import matplotlib.pyplot as plt +# Line 18: fig, axes = plt.subplots(1,3,figsize=(9,3)) -- fig, axes = plt.subplots(1,3,figsize=(9,3)) +# Line 19: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 20: basis='X_umap', -- basis='X_umap', +# Line 21: frameon='small', -- frameon='small', +# Line 22: color=["clusters"], -- color=["clusters"], +# Line 23: palette=fb.red[:], -- palette=fb.red[:], +# Line 24: ncols=3, -- ncols=3, +# Line 25: show=False, -- show=False, +# Line 26: legend_loc=None, -- legend_loc=None, +# Line 27: ax=axes[0]) -- ax=axes[0]) +# Line 29: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 30: basis='X_umap', -- basis='X_umap', +# Line 31: frameon='small', -- frameon='small', +# Line 32: color=["clusters"], -- color=["clusters"], +# Line 33: palette=fb.pink1[:], -- palette=fb.pink1[:], +# Line 34: ncols=3,show=False, -- ncols=3,show=False, +# Line 35: 
legend_loc=None, -- legend_loc=None, +# Line 36: ax=axes[1]) -- ax=axes[1]) +# Line 38: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 39: basis='X_umap', -- basis='X_umap', +# Line 40: frameon='small', -- frameon='small', +# Line 41: color=["clusters"], -- color=["clusters"], +# Line 42: palette=fb.red1[:4]+fb.blue1, -- palette=fb.red1[:4]+fb.blue1, +# Line 43: ncols=3,show=False, -- ncols=3,show=False, +# Line 44: ax=axes[2]) -- ax=axes[2]) +# Line 48: color_dict={'Astrocytes': '#e40414', -- color_dict={'Astrocytes': '#e40414', +# Line 49: 'Cajal Retzius': '#ec5414', -- 'Cajal Retzius': '#ec5414', +# Line 50: 'Cck-Tox': '#ec4c2c', -- 'Cck-Tox': '#ec4c2c', +# Line 51: 'Endothelial': '#d42c24', -- 'Endothelial': '#d42c24', +# Line 52: 'GABA': '#2c5ca4', -- 'GABA': '#2c5ca4', +# Line 53: 'Granule immature': '#acd4ec', -- 'Granule immature': '#acd4ec', +# Line 54: 'Granule mature': '#a4bcdc', -- 'Granule mature': '#a4bcdc', +# Line 55: 'Microglia': '#8caccc', -- 'Microglia': '#8caccc', +# Line 56: 'Mossy': '#8cacdc', -- 'Mossy': '#8cacdc', +# Line 57: 'Neuroblast': '#6c9cc4', -- 'Neuroblast': '#6c9cc4', +# Line 58: 'OL': '#6c94cc', -- 'OL': '#6c94cc', +# Line 59: 'OPC': '#5c74bc', -- 'OPC': '#5c74bc', +# Line 60: 'Radial Glia-like': '#4c94c4', -- 'Radial Glia-like': '#4c94c4', +# Line 61: 'nIPC': '#3474ac'} -- 'nIPC': '#3474ac'} +# Line 63: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 64: basis='X_umap', -- basis='X_umap', +# Line 65: frameon='small', -- frameon='small', +# Line 66: color=["clusters"], -- color=["clusters"], +# Line 67: palette=color_dict, -- palette=color_dict, +# Line 68: ncols=3,show=False, -- ncols=3,show=False, +# Line 69: ) -- ) +# Line 72: colors=[ -- colors=[ +# Line 73: fb.get_color_rgb('群青'), -- fb.get_color_rgb('群青'), +# Line 74: fb.get_color_rgb('半见'), -- fb.get_color_rgb('半见'), +# Line 75: fb.get_color_rgb('丹罽'), -- fb.get_color_rgb('丹罽'), +# Line 76: ] -- ] +# Line 77: fb.get_cmap_seg(colors) -- 
fb.get_cmap_seg(colors) +# Line 79: colors=[ -- colors=[ +# Line 80: fb.get_color_rgb('群青'), -- fb.get_color_rgb('群青'), +# Line 81: fb.get_color_rgb('山矾'), -- fb.get_color_rgb('山矾'), +# Line 82: fb.get_color_rgb('丹罽'), -- fb.get_color_rgb('丹罽'), +# Line 83: ] -- ] +# Line 84: fb.get_cmap_seg(colors) -- fb.get_cmap_seg(colors) +# Line 86: colors=[ -- colors=[ +# Line 87: fb.get_color_rgb('山矾'), -- fb.get_color_rgb('山矾'), +# Line 88: fb.get_color_rgb('丹罽'), -- fb.get_color_rgb('丹罽'), +# Line 89: ] -- ] +# Line 90: fb.get_cmap_seg(colors) -- fb.get_cmap_seg(colors) +# Line 92: ov.pl.embedding(adata, -- ov.pl.embedding(adata, +# Line 93: basis='X_umap', -- basis='X_umap', +# Line 94: frameon='small', -- frameon='small', +# Line 95: color=["Sox7"], -- color=["Sox7"], +# Line 96: cmap=fb.get_cmap_seg(colors), -- cmap=fb.get_cmap_seg(colors), +# Line 97: ncols=3,show=False, -- ncols=3,show=False, +# Line 98: #vmin=-1,vmax=1 -- #vmin=-1,vmax=1 +# Line 99: ) -- ) +``` \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_visualize_single_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_visualize_single_annotated.py new file mode 100644 index 00000000..a48dc118 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_visualize_single_annotated.py @@ -0,0 +1,406 @@ +``` +# Line 1: -- import omicverse as ov +# Line 1: Imports the omicverse library and aliases it as ov. +# Line 2: -- import scanpy as sc +# Line 2: Imports the scanpy library and aliases it as sc. +# Line 4: -- ov.plot_set() +# Line 4: Sets the plotting style for omicverse. +# Line 6: -- adata = ov.read('data/DentateGyrus/10X43_1.h5ad') +# Line 6: Reads an AnnData object from an h5ad file into the variable 'adata' using omicverse's read function. +# Line 7: -- adata +# Line 7: Displays the AnnData object 'adata'. 
+# Line 9: -- optim_palette=ov.pl.optim_palette(adata,basis='X_umap',colors='clusters') +# Line 9: Generates an optimized color palette for plotting based on UMAP embeddings and 'clusters' using omicverse. +# Line 11: -- import matplotlib.pyplot as plt +# Line 11: Imports the matplotlib.pyplot module as plt. +# Line 12: -- fig,ax=plt.subplots(figsize = (4,4)) +# Line 12: Creates a matplotlib figure and an axes object with a specified size of 4x4. +# Line 13: -- ov.pl.embedding(adata, +# Line 13: Calls omicverse's embedding plot function on the 'adata' object. +# Line 14: -- basis='X_umap', +# Line 14: Specifies 'X_umap' as the embedding basis for the plot. +# Line 15: -- color='clusters', +# Line 15: Colors the embedding plot based on the 'clusters' annotation in the AnnData object. +# Line 16: -- frameon='small', +# Line 16: Sets a small frame for the plot. +# Line 17: -- show=False, +# Line 17: Prevents the plot from being shown immediately, allowing for further customization. +# Line 18: -- palette=optim_palette, +# Line 18: Sets the color palette for the plot using the previously generated optimized palette. +# Line 19: -- ax=ax,) +# Line 19: Specifies the previously created axes object 'ax' to draw the plot on. +# Line 20: -- plt.title('Cell Type of DentateGyrus',fontsize=15) +# Line 20: Sets the title of the plot to 'Cell Type of DentateGyrus' with a font size of 15. +# Line 22: -- ov.pl.embedding(adata, +# Line 22: Calls omicverse's embedding plot function on the 'adata' object again. +# Line 23: -- basis='X_umap', +# Line 23: Specifies 'X_umap' as the embedding basis for the plot. +# Line 24: -- color='age(days)', +# Line 24: Colors the embedding plot based on the 'age(days)' annotation. +# Line 25: -- frameon='small', +# Line 25: Sets a small frame for the plot. +# Line 26: -- show=False,) +# Line 26: Prevents the plot from being shown immediately. +# Line 28: -- import matplotlib.pyplot as plt +# Line 28: Imports the matplotlib.pyplot module as plt. 
+# Line 29: -- fig,ax=plt.subplots(figsize = (1,4)) +# Line 29: Creates a matplotlib figure and an axes object with a specified size of 1x4. +# Line 30: -- ov.pl.cellproportion(adata=adata,celltype_clusters='clusters', +# Line 30: Creates a cell proportion plot using omicverse, specifying the 'clusters' annotation as cell types. +# Line 31: -- groupby='age(days)',legend=True,ax=ax) +# Line 31: Groups the cell proportion plot by 'age(days)', displays the legend, and draws on the specified axes. +# Line 33: -- fig,ax=plt.subplots(figsize = (2,2)) +# Line 33: Creates a matplotlib figure and axes with a size of 2x2. +# Line 34: -- ov.pl.cellproportion(adata=adata,celltype_clusters='age(days)', +# Line 34: Creates a cell proportion plot, using 'age(days)' as the cell types. +# Line 35: -- groupby='clusters',groupby_li=['nIPC','Granule immature','Granule mature'], +# Line 35: Groups the plot by 'clusters', and specifies a list of clusters to show. +# Line 36: -- legend=True,ax=ax) +# Line 36: Displays the legend and draws on the specified axes. +# Line 38: -- fig,ax=plt.subplots(figsize = (2,2)) +# Line 38: Creates a matplotlib figure and axes with a size of 2x2. +# Line 39: -- ov.pl.cellstackarea(adata=adata,celltype_clusters='age(days)', +# Line 39: Creates a stacked area plot of cell proportions, using 'age(days)' as the cell types. +# Line 40: -- groupby='clusters',groupby_li=['nIPC','Granule immature','Granule mature'], +# Line 40: Groups the plot by 'clusters' and specifies a list of clusters to show. +# Line 41: -- legend=True,ax=ax) +# Line 41: Displays the legend and draws on the specified axes. +# Line 43: -- ov.pl.embedding_celltype(adata,figsize=(7,4),basis='X_umap', +# Line 43: Calls omicverse's embedding_celltype function to create a cell type specific plot. +# Line 44: -- celltype_key='clusters', +# Line 44: Specifies that the 'clusters' annotation should be used to delineate cell types. 
+# Line 45: -- title=' Cell type', +# Line 45: Sets the title of the plot to 'Cell type'. +# Line 46: -- celltype_range=(1,10), +# Line 46: Sets the range of cell type categories to display. +# Line 47: -- embedding_range=(4,10),) +# Line 47: Sets the range of the embedding coordinates to display. +# Line 49: -- import matplotlib.pyplot as plt +# Line 49: Imports the matplotlib.pyplot module as plt. +# Line 50: -- fig,ax=plt.subplots(figsize = (4,4)) +# Line 50: Creates a matplotlib figure and axes with a size of 4x4. +# Line 52: -- ov.pl.embedding(adata, +# Line 52: Calls omicverse's embedding plot function on the 'adata' object. +# Line 53: -- basis='X_umap', +# Line 53: Specifies 'X_umap' as the embedding basis for the plot. +# Line 54: -- color=['clusters'], +# Line 54: Colors the embedding plot based on the 'clusters' annotation. +# Line 55: -- frameon='small', +# Line 55: Sets a small frame for the plot. +# Line 56: -- show=False, +# Line 56: Prevents the plot from being shown immediately. +# Line 57: -- ax=ax) +# Line 57: Specifies the axes to draw the plot on. +# Line 59: -- ov.pl.ConvexHull(adata, +# Line 59: Calls omicverse's ConvexHull function to add convex hull outlines to the embedding plot. +# Line 60: -- basis='X_umap', +# Line 60: Specifies 'X_umap' as the embedding basis. +# Line 61: -- cluster_key='clusters', +# Line 61: Specifies 'clusters' as the annotation key for identifying clusters. +# Line 62: -- hull_cluster='Granule mature', +# Line 62: Specifies that a convex hull should be drawn for the 'Granule mature' cluster. +# Line 63: -- ax=ax) +# Line 63: Specifies the axes to draw the convex hull on. +# Line 66: -- import matplotlib.pyplot as plt +# Line 66: Imports the matplotlib.pyplot module as plt. +# Line 67: -- fig,ax=plt.subplots(figsize = (4,4)) +# Line 67: Creates a matplotlib figure and axes with a size of 4x4. +# Line 69: -- ov.pl.embedding(adata, +# Line 69: Calls omicverse's embedding plot function on the 'adata' object. 
+# Line 70: -- basis='X_umap', +# Line 70: Specifies 'X_umap' as the embedding basis for the plot. +# Line 71: -- color=['clusters'], +# Line 71: Colors the embedding plot based on the 'clusters' annotation. +# Line 72: -- frameon='small', +# Line 72: Sets a small frame for the plot. +# Line 73: -- show=False, +# Line 73: Prevents the plot from being shown immediately. +# Line 74: -- ax=ax) +# Line 74: Specifies the axes to draw the plot on. +# Line 76: -- ov.pl.contour(ax=ax,adata=adata,groupby='clusters',clusters=['Granule immature','Granule mature'], +# Line 76: Adds contour lines to the embedding plot using omicverse. +# Line 77: -- basis='X_umap',contour_threshold=0.1,colors='#000000', +# Line 77: Specifies the basis, threshold, and color for the contour lines. +# Line 78: -- linestyles='dashed',) +# Line 78: Specifies dashed line style for the contour lines. +# Line 81: -- from matplotlib import patheffects +# Line 81: Imports the patheffects module from matplotlib. +# Line 82: -- import matplotlib.pyplot as plt +# Line 82: Imports the matplotlib.pyplot module as plt. +# Line 83: -- fig, ax = plt.subplots(figsize=(4,4)) +# Line 83: Creates a matplotlib figure and an axes object with a specified size of 4x4. +# Line 85: -- ov.pl.embedding(adata, +# Line 85: Calls omicverse's embedding plot function on the 'adata' object. +# Line 86: -- basis='X_umap', +# Line 86: Specifies 'X_umap' as the embedding basis for the plot. +# Line 87: -- color=['clusters'], +# Line 87: Colors the embedding plot based on the 'clusters' annotation. +# Line 88: -- show=False, legend_loc=None, add_outline=False, +# Line 88: Prevents showing the plot, turns off the legend and outline. +# Line 89: -- frameon='small',legend_fontoutline=2,ax=ax +# Line 89: Sets a small frame, sets the legend font outline width, and draws to specified axes.
+# Line 90: -- ) +# Line 92: -- ov.pl.embedding_adjust( +# Line 92: Calls omicverse's embedding_adjust function to adjust the embedding plot. +# Line 93: -- adata, +# Line 94: -- groupby='clusters', +# Line 94: Specifies that the adjustments are based on the 'clusters' annotation. +# Line 95: -- exclude=("OL",), +# Line 95: Specifies that the 'OL' cluster should be excluded from adjustments. +# Line 96: -- basis='X_umap', +# Line 96: Specifies 'X_umap' as the embedding basis. +# Line 97: -- ax=ax, +# Line 97: Specifies the axes to draw the adjustments on. +# Line 98: -- adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')), +# Line 98: Sets the style for the arrows in the adjustment. +# Line 99: -- text_kwargs=dict(fontsize=12 ,weight='bold', +# Line 99: Sets the style for the text in the adjustment. +# Line 100: -- path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ), +# Line 100: Adds a white outline to the text. +# Line 102: -- ov.pl.embedding_density(adata, +# Line 102: Calls omicverse's embedding_density function to create a density map on the embedding. +# Line 103: -- basis='X_umap', +# Line 103: Specifies 'X_umap' as the embedding basis. +# Line 104: -- groupby='clusters', +# Line 104: Specifies that the density map is based on the 'clusters' annotation. +# Line 105: -- target_clusters='Granule mature', +# Line 105: Specifies that the density map is only for the 'Granule mature' cluster. +# Line 106: -- frameon='small', +# Line 106: Sets a small frame for the plot. +# Line 107: -- show=False,cmap='RdBu_r',alpha=0.8) +# Line 107: Prevents the plot from being shown immediately and sets colormap and transparency. +# Line 109: -- ov.single.geneset_aucell(adata, +# Line 109: Calls omicverse's geneset_aucell function to compute AUC scores for a gene set. +# Line 110: -- geneset_name='Sox', +# Line 110: Specifies the name of the gene set as 'Sox'. 
+# Line 111: -- geneset=['Sox17', 'Sox4', 'Sox7', 'Sox18', 'Sox5']) +# Line 111: Specifies the genes included in the 'Sox' gene set. +# Line 113: -- ov.pl.embedding(adata, +# Line 113: Calls omicverse's embedding plot function. +# Line 114: -- basis='X_umap', +# Line 114: Specifies 'X_umap' as the embedding basis. +# Line 115: -- color=['Sox4'], +# Line 115: Colors the embedding plot based on expression of the gene 'Sox4'. +# Line 116: -- frameon='small', +# Line 116: Sets a small frame for the plot. +# Line 117: -- show=False,) +# Line 117: Prevents the plot from being shown immediately. +# Line 119: -- ov.pl.violin(adata,keys='Sox4',groupby='clusters',figsize=(6,3)) +# Line 119: Creates a violin plot using omicverse, visualizing 'Sox4' expression grouped by 'clusters' with a 6x3 size. +# Line 121: -- fig, ax = plt.subplots(figsize=(6,2)) +# Line 121: Creates a matplotlib figure and axes with a size of 6x2. +# Line 122: -- ov.pl.bardotplot(adata,groupby='clusters',color='Sox_aucell',figsize=(6,2), +# Line 122: Creates a bar dot plot using omicverse, grouped by clusters and coloured by Sox_aucell scores with a 6x2 size. +# Line 123: -- ax=ax, +# Line 123: Sets the axes to draw the plot on. +# Line 124: -- ylabel='Expression', +# Line 124: Sets the y-axis label to "Expression". +# Line 125: -- bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4}, +# Line 125: Sets the styling parameters for the bars. +# Line 126: -- scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'}) +# Line 126: Sets the styling parameters for the dots. +# Line 128: -- ov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=0.1, +# Line 128: Adds a p-value annotation to the plot using omicverse. +# Line 129: -- text_y=0.02, +# Line 129: Sets the y-position for the p-value text. +# Line 130: -- text='$p={}$'.format(round(0.001,3)), +# Line 130: Sets the text for the p-value annotation. +# Line 131: -- fontsize=11,fontcolor='#000000', +# Line 131: Sets the font size and color for the annotation text. 
+# Line 132: -- horizontalalignment='center',) +# Line 132: Sets the horizontal alignment of the annotation text. +# Line 134: -- fig, ax = plt.subplots(figsize=(6,2)) +# Line 134: Creates a matplotlib figure and axes with a size of 6x2. +# Line 135: -- ov.pl.bardotplot(adata,groupby='clusters',color='Sox17',figsize=(6,2), +# Line 135: Creates a bar dot plot using omicverse, grouped by clusters and coloured by Sox17 expression. +# Line 136: -- ax=ax, +# Line 136: Sets the axes to draw the plot on. +# Line 137: -- ylabel='Expression',xlabel='Cell Type', +# Line 137: Sets the y-axis label to "Expression" and the x-axis label to "Cell Type". +# Line 138: -- bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4}, +# Line 138: Sets the styling parameters for the bars. +# Line 139: -- scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'}) +# Line 139: Sets the styling parameters for the dots. +# Line 141: -- ov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=2, +# Line 141: Adds a p-value annotation to the plot using omicverse. +# Line 142: -- text_y=0.2, +# Line 142: Sets the y-position for the p-value text. +# Line 143: -- text='$p={}$'.format(round(0.001,3)), +# Line 143: Sets the text for the p-value annotation. +# Line 144: -- fontsize=11,fontcolor='#000000', +# Line 144: Sets the font size and color for the annotation text. +# Line 145: -- horizontalalignment='center',) +# Line 145: Sets the horizontal alignment of the annotation text. +# Line 147: -- import pandas as pd +# Line 147: Imports the pandas library and aliases it as pd. +# Line 148: -- import seaborn as sns +# Line 148: Imports the seaborn library and aliases it as sns. +# Line 150: -- ov.pl.single_group_boxplot(adata,groupby='clusters', +# Line 150: Creates a boxplot using omicverse, grouped by clusters. +# Line 151: -- color='Sox_aucell', +# Line 151: Sets the color of the boxes based on 'Sox_aucell' scores. 
+# Line 152: -- type_color_dict=dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])), +# Line 152: Creates a color dictionary using existing cluster colors. +# Line 153: -- x_ticks_plot=True, +# Line 153: Enables plotting of x-ticks. +# Line 154: -- figsize=(5,2), +# Line 154: Sets the size of the plot to 5x2. +# Line 155: -- kruskal_test=True, +# Line 155: Performs a Kruskal-Wallis test for statistical significance. +# Line 156: -- ylabel='Sox_aucell', +# Line 156: Sets the y-axis label to "Sox_aucell". +# Line 157: -- legend_plot=False, +# Line 157: Disables legend plotting. +# Line 158: -- bbox_to_anchor=(1,1), +# Line 158: Sets the bounding box for the legend if it were to be plotted. +# Line 159: -- title='Expression', +# Line 159: Sets the title of the plot to "Expression". +# Line 160: -- scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'}, +# Line 160: Sets the styling parameters for the scatter dots. +# Line 161: -- point_number=15, +# Line 161: Sets the number of points to plot. +# Line 162: -- sort=False, +# Line 162: Disables sorting of the plot. +# Line 163: -- save=False, +# Line 163: Disables saving the plot automatically. +# Line 164: -- ) +# Line 165: -- plt.grid(False) +# Line 165: Disables the grid on the plot. +# Line 166: -- plt.xticks(rotation=90,fontsize=12) +# Line 166: Rotates the x-axis ticks by 90 degrees and sets font size to 12. +# Line 168: -- import pandas as pd +# Line 168: Imports the pandas library as pd. +# Line 169: -- marker_genes_dict = { +# Line 169: Defines a dictionary holding marker genes for various cell types. +# Line 170: -- 'Sox':['Sox4', 'Sox7', 'Sox18', 'Sox5'], +# Line 170: Specifies the 'Sox' gene set. +# Line 172: -- color_dict = {'Sox':'#EFF3D8',} +# Line 172: Defines a dictionary holding colors for marker gene sets. +# Line 174: -- gene_color_dict = {} +# Line 174: Initializes an empty dictionary for gene colors. 
+# Line 175: -- gene_color_dict_black = {} +# Line 175: Initializes an empty dictionary for gene colors (black). +# Line 176: -- for cell_type, genes in marker_genes_dict.items(): +# Line 176: Iterates over the marker gene sets in the dictionary. +# Line 177: -- cell_type_color = color_dict.get(cell_type) +# Line 177: Gets the color for the current cell type. +# Line 178: -- for gene in genes: +# Line 178: Iterates over the genes in the current cell type. +# Line 179: -- gene_color_dict[gene] = cell_type_color +# Line 179: Assigns the cell type color to the current gene. +# Line 180: -- gene_color_dict_black[gene] = '#000000' +# Line 180: Assigns black as color to current gene in black color dict. +# Line 182: -- cm = ov.pl.complexheatmap(adata, +# Line 182: Creates a complex heatmap using omicverse. +# Line 183: -- groupby ='clusters', +# Line 183: Groups the heatmap by 'clusters'. +# Line 184: -- figsize =(5,2), +# Line 184: Sets the size of the heatmap to 5x2. +# Line 185: -- layer = None, +# Line 185: Specifies no layer. +# Line 186: -- use_raw = False, +# Line 186: Specifies that the raw data is not to be used. +# Line 187: -- standard_scale = 'var', +# Line 187: Specifies standard scaling by variance. +# Line 188: -- col_color_bars = dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])), +# Line 188: Sets the column color bars based on cluster colors. +# Line 189: -- col_color_labels = dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])), +# Line 189: Sets the column color labels based on cluster colors. +# Line 190: -- left_color_bars = color_dict, +# Line 190: Sets the left color bars. +# Line 191: -- left_color_labels = None, +# Line 191: Specifies no left color labels. +# Line 192: -- right_color_bars = color_dict, +# Line 192: Sets the right color bars. +# Line 193: -- right_color_labels = gene_color_dict_black, +# Line 193: Sets the right color labels. 
+# Line 194: -- marker_genes_dict = marker_genes_dict, +# Line 194: Specifies the marker genes dictionary. +# Line 195: -- cmap = 'coolwarm', #parula,jet +# Line 195: Sets the colormap for the heatmap. +# Line 196: -- legend_gap = 15, +# Line 196: Sets the gap between the heatmap and the legend. +# Line 197: -- legend_hpad = 0, +# Line 197: Sets the horizontal padding of the legend. +# Line 198: -- left_add_text = True, +# Line 198: Enables the addition of left-side text. +# Line 199: -- col_split_gap = 2, +# Line 199: Sets the column split gap. +# Line 200: -- row_split_gap = 1, +# Line 200: Sets the row split gap. +# Line 201: -- col_height = 6, +# Line 201: Sets the column height of the heatmap. +# Line 202: -- left_height = 4, +# Line 202: Sets the left side height. +# Line 203: -- right_height = 6, +# Line 203: Sets the right side height. +# Line 204: -- col_split = None, +# Line 204: Specifies no column split. +# Line 205: -- row_cluster = False, +# Line 205: Disables row clustering. +# Line 206: -- col_cluster = False, +# Line 206: Disables column clustering. +# Line 207: -- value_name='Gene', +# Line 207: Sets the value name for the heatmap. +# Line 208: -- xlabel = "Expression of selected genes", +# Line 208: Sets the x-axis label. +# Line 209: -- label = 'Gene Expression', +# Line 209: Sets the label for the heatmap. +# Line 210: -- save = True, +# Line 210: Enables saving of the heatmap. +# Line 211: -- show = False, +# Line 211: Prevents the heatmap from showing immediately. +# Line 212: -- legend = False, +# Line 212: Prevents a legend from being plotted. +# Line 213: -- plot_legend = False, +# Line 213: Disables plotting of the legend. +# Line 215: -- ) +# Line 217: -- adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,) +# Line 217: Preprocesses the AnnData object using omicverse with specified mode and number of highly variable genes. 
+# Line 219: -- marker_genes_dict = {'Granule immature': ['Sepw1', 'Camk2b', 'Cnih2'], +# Line 219: Defines a dictionary of marker genes for specific cell types. +# Line 220: -- 'Radial Glia-like': ['Dbi', 'Fabp7', 'Aldoc'], +# Line 220: Specifies marker genes for the 'Radial Glia-like' cell type. +# Line 221: -- 'Granule mature': ['Malat1', 'Rasl10a', 'Ppp3ca'], +# Line 221: Specifies marker genes for the 'Granule mature' cell type. +# Line 222: -- 'Neuroblast': ['Igfbpl1', 'Tubb2b', 'Tubb5'], +# Line 222: Specifies marker genes for the 'Neuroblast' cell type. +# Line 223: -- 'Microglia': ['Lgmn', 'C1qa', 'C1qb'], +# Line 223: Specifies marker genes for the 'Microglia' cell type. +# Line 224: -- 'Cajal Retzius': ['Diablo', 'Ramp1', 'Stmn1'], +# Line 224: Specifies marker genes for the 'Cajal Retzius' cell type. +# Line 225: -- 'OPC': ['Olig1', 'C1ql1', 'Pllp'], +# Line 225: Specifies marker genes for the 'OPC' cell type. +# Line 226: -- 'Cck-Tox': ['Tshz2', 'Cck', 'Nap1l5'], +# Line 226: Specifies marker genes for the 'Cck-Tox' cell type. +# Line 227: -- 'GABA': ['Gad2', 'Gad1', 'Snhg11'], +# Line 227: Specifies marker genes for the 'GABA' cell type. +# Line 228: -- 'Endothelial': ['Sparc', 'Myl12a', 'Itm2a'], +# Line 228: Specifies marker genes for the 'Endothelial' cell type. +# Line 229: -- 'Astrocytes': ['Apoe', 'Atp1a2'], +# Line 229: Specifies marker genes for the 'Astrocytes' cell type. +# Line 230: -- 'OL': ['Plp1', 'Mog', 'Mag'], +# Line 230: Specifies marker genes for the 'OL' cell type. +# Line 231: -- 'Mossy': ['Arhgdig', 'Camk4'], +# Line 231: Specifies marker genes for the 'Mossy' cell type. +# Line 232: -- 'nIPC': ['Hmgn2', 'Ptma', 'H2afz']} +# Line 232: Specifies marker genes for the 'nIPC' cell type. +# Line 234: -- ov.pl.marker_heatmap( +# Line 234: Creates a marker heatmap using omicverse. +# Line 235: -- adata, +# Line 236: -- marker_genes_dict, +# Line 236: Specifies the marker genes dictionary to use for the heatmap. 
+# Line 237: -- groupby='clusters', +# Line 237: Groups the heatmap by the 'clusters' annotation. +# Line 238: -- color_map="RdBu_r", +# Line 238: Sets the color map to 'RdBu_r'. +# Line 239: -- use_raw=False, +# Line 239: Specifies that raw data should not be used. +# Line 240: -- standard_scale="var", +# Line 240: Specifies standard scaling by variance. +# Line 241: -- expression_cutoff=0.0, +# Line 241: Sets the minimum expression cutoff. +# Line 242: -- fontsize=12, +# Line 242: Sets the font size for the plot. +# Line 243: -- bbox_to_anchor=(7, -2), +# Line 243: Sets the bounding box anchor \ No newline at end of file diff --git a/OvStudent/Converted_Scripts_Annotated/t_wgcna_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_wgcna_annotated.py new file mode 100644 index 00000000..91b1446b --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/t_wgcna_annotated.py @@ -0,0 +1,59 @@ +``` +# Line 1: Import the scanpy package for single-cell data analysis. -- import scanpy as sc +# Line 2: Import the omicverse package for omics data analysis. -- import omicverse as ov +# Line 3: Import the matplotlib.pyplot module for plotting. -- import matplotlib.pyplot as plt +# Line 4: Set the plotting style using omicverse's plot_set function. -- ov.plot_set() +# Line 6: Import the pandas package for data manipulation. -- import pandas as pd +# Line 7: Read a CSV file into a pandas DataFrame using omicverse's read function, setting the first column as the index. -- data=ov.utils.read('data/5xFAD_paper/expressionList.csv', +# Line 8: -- index_col=0) +# Line 9: Display the first few rows of the DataFrame. -- data.head() +# Line 11: Import the robust module from statsmodels for robust statistical methods. -- from statsmodels import robust #import package +# Line 12: Calculate the median absolute deviation (MAD) for each gene using robust.mad function. 
-- gene_mad=data.apply(robust.mad) #use function to calculate MAD +# Line 13: Transpose the DataFrame to have genes as columns. -- data=data.T +# Line 14: Select the top 2000 genes with the highest MAD values. -- data=data.loc[gene_mad.sort_values(ascending=False).index[:2000]] +# Line 15: Display the first few rows of the processed DataFrame. -- data.head() +# Line 17: Initialize a pyWGCNA object using omicverse, specifying parameters like name, species, gene expression data, and output path. -- pyWGCNA_5xFAD = ov.bulk.pyWGCNA(name='5xFAD_2k', +# Line 18: -- species='mus musculus', +# Line 19: -- geneExp=data.T, +# Line 20: -- outputPath='', +# Line 21: -- save=True) +# Line 22: Display the first few rows of the gene expression data in the pyWGCNA object. -- pyWGCNA_5xFAD.geneExpr.to_df().head(5) +# Line 24: Preprocess the gene expression data within the pyWGCNA object. -- pyWGCNA_5xFAD.preprocess() +# Line 26: Calculate the soft threshold for the network construction. -- pyWGCNA_5xFAD.calculate_soft_threshold() +# Line 28: Calculate the adjacency matrix for network analysis. -- pyWGCNA_5xFAD.calculating_adjacency_matrix() +# Line 30: Calculate the Topological Overlap Matrix (TOM) similarity matrix. -- pyWGCNA_5xFAD.calculating_TOM_similarity_matrix() +# Line 32: Calculate the gene tree using hierarchical clustering. -- pyWGCNA_5xFAD.calculate_geneTree() +# Line 33: Calculate dynamic modules using the cutreeHybrid method with specified parameters. -- pyWGCNA_5xFAD.calculate_dynamicMods(kwargs_function={'cutreeHybrid': {'deepSplit': 2, 'pamRespectsDendro': False}}) +# Line 34: Calculate the module eigengenes with specified soft power parameter. -- pyWGCNA_5xFAD.calculate_gene_module(kwargs_function={'moduleEigengenes': {'softPower': 8}}) +# Line 36: Plot the matrix representation of the network, but do not save the plot. -- pyWGCNA_5xFAD.plot_matrix(save=False) +# Line 38: Save the WGCNA results. 
-- pyWGCNA_5xFAD.saveWGCNA() +# Line 40: Load a previously saved WGCNA object from a file. -- pyWGCNA_5xFAD=ov.bulk.readWGCNA('5xFAD_2k.p') +# Line 42: Display the first few rows of the module information. -- pyWGCNA_5xFAD.mol.head() +# Line 44: Display the first few rows of the variable information from the expression data. -- pyWGCNA_5xFAD.datExpr.var.head() +# Line 46: Get a sub-module with specific module colors, displaying the first few rows and the shape. -- sub_mol=pyWGCNA_5xFAD.get_sub_module(['gold','lightgreen'], +# Line 47: -- mod_type='module_color') +# Line 48: -- sub_mol.head(),sub_mol.shape +# Line 50: Get a sub-network for a specific module color with a correlation threshold. -- G_sub=pyWGCNA_5xFAD.get_sub_network(mod_list=['lightgreen'], +# Line 51: -- mod_type='module_color',correlation_threshold=0.2) +# Line 52: -- G_sub +# Line 54: Get the number of edges in the sub-network. -- len(G_sub.edges()) +# Line 56: Plot the sub-network with specified module colors, layout, size, and labeling parameters. -- pyWGCNA_5xFAD.plot_sub_network(['gold','lightgreen'],pos_type='kamada_kawai',pos_scale=10,pos_dim=2, +# Line 57: -- figsize=(8,8),node_size=10,label_fontsize=8,correlation_threshold=0.2, +# Line 58: -- label_bbox={"ec": "white", "fc": "white", "alpha": 0.6}) +# Line 60: Update the sample information in the pyWGCNA object using a CSV file. -- pyWGCNA_5xFAD.updateSampleInfo(path='data/5xFAD_paper/sampleInfo.csv', sep=',') +# Line 62: Set colors for the 'Sex' metadata column. -- pyWGCNA_5xFAD.setMetadataColor('Sex', {'Female': 'green', +# Line 63: -- 'Male': 'yellow'}) +# Line 64: Set colors for the 'Genotype' metadata column. -- pyWGCNA_5xFAD.setMetadataColor('Genotype', {'5xFADWT': 'darkviolet', +# Line 65: -- '5xFADHEMI': 'deeppink'}) +# Line 66: Set colors for the 'Age' metadata column. 
-- pyWGCNA_5xFAD.setMetadataColor('Age', {'4mon': 'thistle', +# Line 67: -- '8mon': 'plum', +# Line 68: -- '12mon': 'violet', +# Line 69: -- '18mon': 'purple'}) +# Line 70: Set colors for the 'Tissue' metadata column. -- pyWGCNA_5xFAD.setMetadataColor('Tissue', {'Hippocampus': 'red', +# Line 71: -- 'Cortex': 'blue'}) +# Line 73: Perform various WGCNA analysis tasks. -- pyWGCNA_5xFAD.analyseWGCNA() +# Line 75: Get the column names of the metadata. -- metadata = pyWGCNA_5xFAD.datExpr.obs.columns.tolist() +# Line 77: Plot the module eigengene for the 'lightgreen' module with specified metadata. -- pyWGCNA_5xFAD.plotModuleEigenGene('lightgreen', metadata, show=True) +# Line 79: Create a barplot of the module eigengene for the 'lightgreen' module with specified metadata. -- pyWGCNA_5xFAD.barplotModuleEigenGene('lightgreen', metadata, show=True) +# Line 81: Get the top 10 hub genes for the 'lightgreen' module. -- pyWGCNA_5xFAD.top_n_hub_genes(moduleName="lightgreen", n=10) +``` \ No newline at end of file From 29d8f303b0f8d8cf78e3ff797d85cb99315d5512 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:41:24 +0800 Subject: [PATCH 03/40] RAG_sys_BackBone_0.0.6 --- OvStudent/app.py | 363 ++++++++++++++++++++++++++++++++++++++++ OvStudent/rag_system.py | 350 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 713 insertions(+) create mode 100644 OvStudent/app.py create mode 100644 OvStudent/rag_system.py diff --git a/OvStudent/app.py b/OvStudent/app.py new file mode 100644 index 00000000..a5478c1d --- /dev/null +++ b/OvStudent/app.py @@ -0,0 +1,363 @@ +# --- START OF FILE app.py --- +import streamlit as st +import json +from datetime import datetime, timezone +import os +import subprocess +import time +import requests +import getpass +import psutil +from pathlib import Path +import logging +from logging.handlers import RotatingFileHandler +from collections import OrderedDict + +from rag_system import 
RAGSystem +from config_manager import ConfigManager +from system_monitor import SystemMonitor +from rate_limiter import RateLimiter +from query_cache import QueryCache +from query_manager import QueryManager + + +# Set up logging with rotating file handler +def setup_logging(): + log_dir = Path("logs") + log_dir.mkdir(exist_ok=True) + + handler = RotatingFileHandler( + log_dir / 'streamlit_app.log', + maxBytes=10 * 1024 * 1024, # 10 MB + backupCount=5 + ) + + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), + handler + ] + ) + + +setup_logging() + + +# Initialize session state +def initialize_session_state(): + default_state = { + 'ollama_ready': False, + 'models_installed': False, + 'query_history': [], + 'rate_limiter': None, + 'query_cache': None, + 'config': { + 'file_selection_model': 'qwen2.5-coder:3b', + 'query_processing_model': 'qwen2.5-coder:7b', + 'rate_limit': 5, # seconds between queries + # Add your default directories here + 'converted_jsons_directory': "/Users/kq_m3m/PycharmProjects/OVMaster/Converted_Jsons", + 'annotated_scripts_directory': "/Users/kq_m3m/PycharmProjects/OVMaster/Converted_Scripts_Annotated" + }, + 'current_user': getpass.getuser() # Get the current username + } + + for key, value in default_state.items(): + if key not in st.session_state: + st.session_state[key] = value + + # Initialize RateLimiter if not already set + if st.session_state['rate_limiter'] is None: + st.session_state['rate_limiter'] = RateLimiter(st.session_state['config']['rate_limit']) + + # Initialize QueryCache if not already set + if st.session_state['query_cache'] is None: + st.session_state['query_cache'] = QueryCache() + + +initialize_session_state() + + +# Cache for RAGSystem using @st.cache_resource to ensure singleton +@st.cache_resource +def get_rag_system(converted_jsons_directory, annotated_scripts_directory): + try: + return RAGSystem(converted_jsons_directory, 
annotated_scripts_directory) + except Exception as e: + logging.error(f"Failed to initialize RAG system: {str(e)}") + return None + + +# Function to display the header +def show_header(): + col1, col2, col3 = st.columns([2, 1, 1]) + with col1: + st.title("Agentic OmicVerse 🧬") + with col2: + # Display current time using a placeholder that will update automatically + current_time = datetime.now(timezone.utc) # Get the current time in UTC + st.info(f"📅 UTC: {current_time.strftime('%Y-%m-%d %H:%M:%S')}") + with col3: + # Display the current username + st.info(f"👤 User: {st.session_state['current_user']}") + + +# Function to display system status +def show_system_status(): + stats = SystemMonitor.get_system_stats() + with st.sidebar: + st.header("System Status 📊") + col1, col2 = st.columns(2) + with col1: + st.metric("Memory (MB)", f"{stats['memory_usage']:.1f}") + st.metric("CPU %", f"{stats['cpu_percent']:.1f}") + with col2: + st.metric("Uptime", SystemMonitor.format_uptime(stats['uptime'])) + st.metric("Memory Usage %", f"{stats['system_memory']['percent']:.1f}") + st.progress(stats['system_memory']['percent'] / 100) + + +# Function to check if Ollama server is running +def check_ollama_server() -> bool: + try: + response = requests.get("http://localhost:11434/api/version", timeout=5) + return response.status_code == 200 + except requests.RequestException: + return False + + +# Function to display health status +def display_health_status(): + healthy, checks = check_system_health() + with st.sidebar: + st.header("System Health ✅" if healthy else "System Health ⚠️") + for component, status in checks.items(): + if status: + st.success(f"{component} is running") + else: + st.error(f"{component} is not running") + + +# Function to perform health checks +def check_system_health(): + health_checks = { + 'Ollama Server': check_ollama_server(), + } + all_healthy = all(health_checks.values()) + return all_healthy, health_checks + + +# Function to display configuration 
settings +def show_configuration(): + with st.sidebar: + st.header("Configuration ⚙️") + with st.expander("Model Settings"): + file_selection_model = st.selectbox( + "File Selection Model", + ["qwen2.5-coder:3b", "qwen2.5-coder:7b", "gemini-pro", "gemini-1.5-flash-8b", "gemini-2.0-flash-exp"], + index=["qwen2.5-coder:3b", "qwen2.5-coder:7b", "gemini-pro", "gemini-1.5-flash-8b", + "gemini-2.0-flash-exp"].index( + st.session_state['config'].get('file_selection_model', "qwen2.5-coder:3b") + ) + ) + query_processing_model = st.selectbox( + "Query Processing Model", + ["qwen2.5-coder:7b", "qwen2.5-coder:3b", "gemini-pro", "gemini-1.5-flash-8b", "gemini-2.0-flash-exp"], + index=["qwen2.5-coder:7b", "qwen2.5-coder:3b", "gemini-pro", "gemini-1.5-flash-8b", + "gemini-2.0-flash-exp"].index( + st.session_state['config'].get('query_processing_model', "qwen2.5-coder:7b") + ) + ) + + # If using Gemini, request the API key (optional if not needed anymore) + # gemini_api_key = None # The redesigned rag_system.py doesn't require this + + rate_limit = st.slider( + "Rate Limit (seconds)", + min_value=1, + max_value=30, + value=st.session_state['config']['rate_limit'] + ) + current_user = st.text_input("Username", value=st.session_state['current_user']) + + # Directories + converted_jsons_directory = st.text_input("Converted JSONs Directory", + value=st.session_state['config']['converted_jsons_directory']) + annotated_scripts_directory = st.text_input("Annotated Scripts Directory", + value=st.session_state['config']['annotated_scripts_directory']) + + if st.button("Save Configuration"): + st.session_state['config'].update({ + 'file_selection_model': file_selection_model, + 'query_processing_model': query_processing_model, + 'rate_limit': rate_limit, + 'converted_jsons_directory': converted_jsons_directory, + 'annotated_scripts_directory': annotated_scripts_directory + }) + st.session_state['current_user'] = current_user + ConfigManager.save_config(st.session_state['config']) + 
st.session_state['rate_limiter'] = RateLimiter(rate_limit) + st.session_state['query_cache'] = QueryCache() + st.success("Configuration saved successfully.") + + +# Function to process query with progress tracking using the new RAG logic +def process_query_with_progress(query, rag_system): + if not query or not isinstance(query, str): + raise ValueError("Invalid query: Query must be a non-empty string") + + progress_bar = st.progress(0) + status_text = st.empty() + try: + logging.info(f"Processing query: {query}") + logging.info(f"RAG System State: {rag_system is not None}") + + status_text.text("Finding relevant documents...") + progress_bar.progress(25) + + # Pass query directly as a string + relevant_files = rag_system.find_relevant_files(query) + + status_text.text("Generating answer from annotated scripts...") + progress_bar.progress(50) + answer = rag_system.answer_query_with_annotated_scripts(query, relevant_files) + logging.info(f"Answer: {answer}") + + status_text.text("Updating history...") + progress_bar.progress(75) + + query_time = datetime.now(timezone.utc) + st.session_state.query_history.append({ + 'query': query, + 'file': relevant_files, + 'answer': answer, + 'timestamp': query_time, + 'user': st.session_state['current_user'] + }) + + st.session_state['rate_limiter'].record_request() + progress_bar.progress(100) + status_text.text("Complete!") + time.sleep(1) + progress_bar.empty() + status_text.empty() + return relevant_files, answer + except Exception as e: + logging.error(f"Query processing error: {str(e)}", exc_info=True) + progress_bar.empty() + status_text.text(f"Error: {e}") + raise e + + +# Function to display query history +def show_query_history(): + with st.sidebar: + st.header("Query History 📜") + for idx, item in enumerate(reversed(st.session_state.query_history[-10:])): + with st.expander(f"Query {len(st.session_state.query_history) - idx}: {item['query'][:30]}..."): + st.markdown(f"**Time:** {item['timestamp'].strftime('%Y-%m-%d 
%H:%M:%S')} UTC") + st.markdown(f"**User:** {item['user']}") + st.markdown(f"**Document(s):** {item['file']}") + st.markdown(f"**Answer:** {item['answer']}") + st.markdown("---") + + +# Main function +def main(): + show_header() + show_system_status() + display_health_status() + show_configuration() + + if st.button("Reset System"): + st.session_state.query_history = [] + st.session_state['rate_limiter'] = RateLimiter(st.session_state['config']['rate_limit']) + st.session_state['query_cache'] = QueryCache() + st.rerun() + + if not st.session_state['ollama_ready']: + if not check_ollama_server(): + st.error("❌ Ollama server is not running") + if st.button("🚀 Start Ollama Server"): + try: + subprocess.Popen(['ollama', 'serve']) + time.sleep(5) + if check_ollama_server(): + st.session_state['ollama_ready'] = True + st.success("✅ Ollama server started successfully") + st.rerun() + except FileNotFoundError: + st.error("❌ Ollama is not installed") + return + else: + st.session_state['ollama_ready'] = True + + # Initialize RAGSystem via cached function + converted_jsons_directory = st.session_state['config']['converted_jsons_directory'] + annotated_scripts_directory = st.session_state['config']['annotated_scripts_directory'] + rag_system = get_rag_system(converted_jsons_directory, annotated_scripts_directory) + + if rag_system is None: + st.error("Failed to initialize RAG system.") + return + + st.markdown("### Query Interface 🔍") + query = st.text_area( + "Enter your query:", + height=100, + placeholder="Enter your question about the documents..." + ) + + col1, col2 = st.columns([1, 5]) + with col1: + submit = st.button("🚀 Submit") + with col2: + if st.button("🗑️ Clear History"): + st.session_state.query_history = [] + st.rerun() + + if submit and query: + # Add validation to ensure query is not empty or just whitespace + if not query.strip(): + st.error("Query cannot be empty. 
Please enter a valid query.") + return + + is_valid, error_message = QueryManager.validate_query(query) + if not is_valid: + st.error(error_message) + return + + if not st.session_state['rate_limiter'].can_make_request(): + wait_time = st.session_state['rate_limiter'].time_until_next_request() + st.warning(f"Please wait {wait_time:.1f} seconds before making another query.") + return + + try: + with st.spinner("Processing query..."): + # Log the query and state before processing + logging.info(f"Processing query: {query!r}") + logging.info(f"RAG system state: initialized={rag_system is not None}") + + relevant_files, answer = process_query_with_progress(query, rag_system) + st.success(f"📄 Selected document(s): {relevant_files}") + st.markdown("### Answer 💡") + st.markdown(answer) + except ValueError as ve: + st.error(f"Invalid query: {str(ve)}") + logging.error(f"ValueError in query processing: {str(ve)}", exc_info=True) + except Exception as e: + st.error(f"An error occurred: {str(e)}") + logging.error("Query processing error", exc_info=True) + + show_query_history() + + +if __name__ == "__main__": + try: + main() + except Exception as e: + logging.error(f"Application error: {str(e)}", exc_info=True) + st.error(f"An unexpected error occurred: {str(e)}") +# --- END OF FILE app.py --- \ No newline at end of file diff --git a/OvStudent/rag_system.py b/OvStudent/rag_system.py new file mode 100644 index 00000000..22e9908b --- /dev/null +++ b/OvStudent/rag_system.py @@ -0,0 +1,350 @@ +# --- START OF FILE rag_system.py --- +import os +import time +import json +from datetime import datetime +from typing import List, Optional, Dict, Any, Union +import logging +from logging.handlers import RotatingFileHandler + +import chromadb +from langchain.chains import RetrievalQA +from langchain.docstore.document import Document +from langchain.prompts import PromptTemplate +from langchain.callbacks.manager import CallbackManager +from langchain.callbacks.streaming_stdout import 
StreamingStdOutCallbackHandler + +# Use the original GPT4AllEmbeddings and Ollama LLM from the initial code. +from langchain_community.embeddings import GPT4AllEmbeddings +from langchain_community.llms import Ollama +from langchain_community.vectorstores import Chroma +from langchain.text_splitter import CharacterTextSplitter + +def setup_logging(): + logger = logging.getLogger('rag_system_code_optimized') + logger.setLevel(logging.INFO) + + os.makedirs('logs', exist_ok=True) + + file_handler = RotatingFileHandler( + 'logs/rag_system_code_optimized.log', + maxBytes=10 * 1024 * 1024, # 10MB + backupCount=5 + ) + file_handler.setFormatter( + logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + ) + logger.addHandler(file_handler) + + console_handler = logging.StreamHandler() + console_handler.setFormatter( + logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + ) + logger.addHandler(console_handler) + + return logger + +logger = setup_logging() + + +class CodeAwareTextSplitter(CharacterTextSplitter): + """ + A custom text splitter that tries to split code more gracefully. + You can enhance this by: + - Splitting on `def `, `class ` boundaries. + - Avoiding splitting in the middle of a function. + For now, this is a placeholder that uses line-based splitting + but could be improved as needed. 
+ """ + def __init__(self, chunk_size=2000, chunk_overlap=200): + super().__init__(separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap) + + +class FirstStageRAG: + def __init__( + self, + converted_jsons_directory: str, + persist_dir: str, + file_selection_model: str, + chroma_client: chromadb.Client, + top_k_files: int = 3 + ): + self.converted_jsons_directory = converted_jsons_directory + self.persist_dir = persist_dir + self.file_selection_model = file_selection_model + self.chroma_client = chroma_client + self.top_k_files = top_k_files + + logger.info("Initializing FirstStageRAG for code retrieval...") + + self.collection_name = "file_descriptions" + self.collection = self._load_or_create_collection() + + if self.collection.count() == 0: + logger.info("Populating first-stage collection with file descriptions...") + self._index_file_descriptions() + logger.info("File descriptions indexed successfully.") + + logger.info("FirstStageRAG initialized successfully.") + + def _load_or_create_collection(self): + try: + collection = self.chroma_client.get_collection(name=self.collection_name) + return collection + except: + logger.info(f"Creating a new Chroma collection: {self.collection_name}") + collection = self.chroma_client.create_collection(name=self.collection_name) + return collection + + def _index_file_descriptions(self): + embeddings = GPT4AllEmbeddings(model=self.file_selection_model) + + docs = [] + metadatas = [] + ids = [] + + for fname in os.listdir(self.converted_jsons_directory): + if fname.endswith(".json"): + fpath = os.path.join(self.converted_jsons_directory, fname) + with open(fpath, 'r', encoding='utf-8') as f: + try: + data = json.load(f) + description = data.get("description", "") + file_name = data.get("file", fname) + + if not description: + continue + + docs.append(description) + metadatas.append({"file": file_name}) + ids.append(file_name) + except Exception as e: + logger.error(f"Error reading {fpath}: {str(e)}", 
exc_info=True) + + if docs: + doc_embeddings = embeddings.embed_documents(docs) + self.collection.add(documents=docs, metadatas=metadatas, ids=ids, embeddings=doc_embeddings) + else: + logger.warning("No documents to index in FirstStageRAG.") + + def find_relevant_files(self, query: Union[str, Dict[str, Any]]) -> List[str]: + if isinstance(query, dict): + query_text = query.get("query", "") + else: + query_text = query + + if not query_text.strip(): + logger.warning("Empty query received in FirstStageRAG.") + return [] + + embeddings = GPT4AllEmbeddings(model=self.file_selection_model) + query_embedding = embeddings.embed_query(query_text) + + results = self.collection.query(query_embeddings=[query_embedding], n_results=self.top_k_files) + + matched_files = [] + if results and "metadatas" in results and results["metadatas"]: + for meta in results["metadatas"][0]: + file_name = meta.get("file", None) + if file_name: + matched_files.append(file_name) + + logger.info(f"Top {self.top_k_files} matched files for query '{query_text}': {matched_files}") + return matched_files + + +class SecondStageRAG: + def __init__( + self, + annotated_scripts_directory: str, + query_processing_model: str, + chroma_client: chromadb.Client, + code_chunk_size: int = 2000, + code_chunk_overlap: int = 200, + top_k_chunks: int = 3 + ): + self.annotated_scripts_directory = annotated_scripts_directory + self.query_processing_model = query_processing_model + self.chroma_client = chroma_client + self.code_chunk_size = code_chunk_size + self.code_chunk_overlap = code_chunk_overlap + self.top_k_chunks = top_k_chunks + + logger.info("Initializing SecondStageRAG for code generation...") + + logger.info("SecondStageRAG initialized successfully.") + + def answer_query_with_annotated_scripts(self, query: str, relevant_files: List[str]) -> str: + if not relevant_files: + logger.info("No relevant files provided to second stage. 
Returning fallback answer.") + return "I could not find relevant code files for your request." + + documents = [] + text_splitter = CodeAwareTextSplitter(chunk_size=self.code_chunk_size, chunk_overlap=self.code_chunk_overlap) + + for file_name in relevant_files: + file_path = os.path.join(self.annotated_scripts_directory, file_name) + if os.path.exists(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + docs = text_splitter.split_text(content) + for i, doc_chunk in enumerate(docs): + documents.append( + Document( + page_content=doc_chunk, + metadata={"source_file": file_name, "chunk_id": i} + ) + ) + else: + logger.warning(f"File {file_name} not found in annotated scripts directory.") + + if not documents: + logger.info("No documents found to answer the query in second stage.") + return "I could not find relevant code content." + + embeddings = GPT4AllEmbeddings(model=self.query_processing_model) + + collection_name = f"annotated_docs_{int(time.time())}" + vectorstore = Chroma.from_documents( + documents=documents, + embedding=embeddings, + collection_name=collection_name, + client=self.chroma_client + ) + + retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": self.top_k_chunks}) + + # System prompt and template for code generation + system_prompt = ( + "You are a Python code generation assistant. " + "You will be provided with context code snippets related to the user query. " + "Please produce clear, correct, and well-commented Python code that addresses the user's request. " + "Follow PEP8 standards. If you propose changes, ensure they run without syntax errors." + ) + + prompt_template = PromptTemplate( + input_variables=["context", "question"], + template=( + "{context}\n\n" + "User Request: {question}\n\n" + "Now generate the best possible Python code solution given the above context. " + "If appropriate, include function definitions, classes, or usage examples. 
" + "Make sure the final answer is strictly Python code." + ) + ) + + llm = Ollama( + model=self.query_processing_model, + callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]) + ) + + chain = RetrievalQA.from_chain_type( + llm=llm, + chain_type="stuff", + retriever=retriever, + return_source_documents=False, + chain_type_kwargs={"prompt": prompt_template} + ) + + logger.info(f"Running retrieval QA for code generation query: {query}") + answer = chain.run(query) + + # Cleanup the temporary collection after use + try: + self.chroma_client.delete_collection(name=collection_name) + except Exception as e: + logger.warning(f"Failed to cleanup temporary collection {collection_name}: {e}") + + return answer + + +class RAGSystem: + def __init__( + self, + converted_jsons_directory: str, + annotated_scripts_directory: str, + persist_dir: str = "./chroma_db_code", + file_selection_model: str = "qwen2.5-coder:3b", + query_processing_model: str = "qwen2.5-coder:7b", + top_k_files: int = 3, + top_k_chunks: int = 3, + code_chunk_size: int = 2000, + code_chunk_overlap: int = 200 + ): + logger.info("Initializing RAG System for Python code generation...") + + self.converted_jsons_directory = converted_jsons_directory + self.annotated_scripts_directory = annotated_scripts_directory + self.persist_directory = persist_dir + self.file_selection_model = file_selection_model + self.query_processing_model = query_processing_model + self.top_k_files = top_k_files + self.top_k_chunks = top_k_chunks + self.code_chunk_size = code_chunk_size + self.code_chunk_overlap = code_chunk_overlap + + self._validate_directories() + + os.makedirs(self.persist_directory, exist_ok=True) + + self.chroma_client = chromadb.Client( + chromadb.config.Settings( + anonymized_telemetry=False, + is_persistent=True, + persist_directory=self.persist_directory + ) + ) + + self.first_stage = FirstStageRAG( + converted_jsons_directory=self.converted_jsons_directory, + persist_dir=self.persist_directory, + 
file_selection_model=self.file_selection_model, + chroma_client=self.chroma_client, + top_k_files=self.top_k_files + ) + + self.second_stage = SecondStageRAG( + annotated_scripts_directory=self.annotated_scripts_directory, + query_processing_model=self.query_processing_model, + chroma_client=self.chroma_client, + top_k_chunks=self.top_k_chunks, + code_chunk_size=self.code_chunk_size, + code_chunk_overlap=self.code_chunk_overlap + ) + + logger.info("RAG System for code generation initialized successfully") + + def _validate_directories(self): + for directory in [self.converted_jsons_directory, self.annotated_scripts_directory]: + if not os.path.exists(directory): + raise ValueError(f"Directory does not exist: {directory}") + if not os.path.isdir(directory): + raise ValueError(f"Path is not a directory: {directory}") + if not os.access(directory, os.R_OK): + raise ValueError(f"Directory is not readable: {directory}") + logger.info("All directories validated successfully.") + + def find_relevant_files(self, query: Union[str, Dict[str, Any]]) -> List[str]: + return self.first_stage.find_relevant_files(query) + + def answer_query_with_annotated_scripts( + self, + query: str, + relevant_files: List[str] + ) -> str: + return self.second_stage.answer_query_with_annotated_scripts(query, relevant_files) + + def cleanup(self): + logger.info("Cleaning up resources...") + try: + self.chroma_client.reset() + if os.path.exists(self.persist_directory): + import shutil + shutil.rmtree(self.persist_directory, ignore_errors=True) + logger.info("Cleanup completed.") + except Exception as e: + logger.error(f"Error during cleanup: {str(e)}", exc_info=True) + raise + +# --- END OF FILE rag_system_code.py --- From aaa341472e7874c8a2e9757aa9330335536acf57 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 21:19:10 -0800 Subject: [PATCH 04/40] Delete rag_engine directory --- rag_engine/PreViewRead101.txt | 80 --- 
rag_engine/__init__.py | 3 - rag_engine/app.py | 388 ------------ rag_engine/config.json | 5 - rag_engine/ovrawmjson/KBI.json | 176 ------ rag_engine/ovrawmjson/t_anno_trans.json | 58 -- rag_engine/ovrawmjson/t_aucell.json | 70 -- rag_engine/ovrawmjson/t_bulk_combat.json | 74 --- rag_engine/ovrawmjson/t_cellanno.json | 126 ---- rag_engine/ovrawmjson/t_cellfate.json | 98 --- rag_engine/ovrawmjson/t_cellfate_gene.json | 186 ------ .../ovrawmjson/t_cellfate_genesets.json | 74 --- rag_engine/ovrawmjson/t_cellphonedb.json | 214 ------- rag_engine/ovrawmjson/t_cluster.json | 98 --- rag_engine/ovrawmjson/t_cluster_space.json | 122 ---- rag_engine/ovrawmjson/t_cnmf.json | 110 ---- rag_engine/ovrawmjson/t_commot_flowsig.json | 110 ---- rag_engine/ovrawmjson/t_cytotrace.json | 26 - rag_engine/ovrawmjson/t_deg.json | 82 --- rag_engine/ovrawmjson/t_deseq2.json | 82 --- rag_engine/ovrawmjson/t_gptanno.json | 98 --- rag_engine/ovrawmjson/t_mapping.json | 46 -- rag_engine/ovrawmjson/t_metacells.json | 94 --- rag_engine/ovrawmjson/t_metatime.json | 42 -- rag_engine/ovrawmjson/t_mofa.json | 78 --- rag_engine/ovrawmjson/t_mofa_glue.json | 98 --- rag_engine/ovrawmjson/t_network.json | 30 - rag_engine/ovrawmjson/t_nocd.json | 38 -- rag_engine/ovrawmjson/t_preprocess.json | 130 ---- rag_engine/ovrawmjson/t_preprocess_cpu.json | 122 ---- rag_engine/ovrawmjson/t_preprocess_gpu.json | 122 ---- rag_engine/ovrawmjson/t_scdeg.json | 122 ---- rag_engine/ovrawmjson/t_scdrug.json | 74 --- rag_engine/ovrawmjson/t_scmulan.json | 82 --- rag_engine/ovrawmjson/t_simba.json | 50 -- rag_engine/ovrawmjson/t_single_batch.json | 138 ---- rag_engine/ovrawmjson/t_slat.json | 130 ---- rag_engine/ovrawmjson/t_spaceflow.json | 42 -- rag_engine/ovrawmjson/t_stagate.json | 90 --- rag_engine/ovrawmjson/t_staligner.json | 46 -- rag_engine/ovrawmjson/t_starfysh.json | 126 ---- rag_engine/ovrawmjson/t_stt.json | 218 ------- rag_engine/ovrawmjson/t_tcga.json | 38 -- rag_engine/ovrawmjson/t_tosica.json | 86 --- 
rag_engine/ovrawmjson/t_traj.json | 98 --- rag_engine/ovrawmjson/t_via.json | 58 -- rag_engine/ovrawmjson/t_via_velo.json | 38 -- rag_engine/ovrawmjson/t_visualize_bulk.json | 34 - .../ovrawmjson/t_visualize_colorsystem.json | 46 -- rag_engine/ovrawmjson/t_visualize_single.json | 90 --- rag_engine/ovrawmjson/t_wgcna.json | 98 --- rag_engine/rag_system.py | 596 ------------------ rag_engine/requirements.txt | 7 - 53 files changed, 5387 deletions(-) delete mode 100644 rag_engine/PreViewRead101.txt delete mode 100644 rag_engine/__init__.py delete mode 100644 rag_engine/app.py delete mode 100644 rag_engine/config.json delete mode 100644 rag_engine/ovrawmjson/KBI.json delete mode 100644 rag_engine/ovrawmjson/t_anno_trans.json delete mode 100644 rag_engine/ovrawmjson/t_aucell.json delete mode 100644 rag_engine/ovrawmjson/t_bulk_combat.json delete mode 100644 rag_engine/ovrawmjson/t_cellanno.json delete mode 100644 rag_engine/ovrawmjson/t_cellfate.json delete mode 100644 rag_engine/ovrawmjson/t_cellfate_gene.json delete mode 100644 rag_engine/ovrawmjson/t_cellfate_genesets.json delete mode 100644 rag_engine/ovrawmjson/t_cellphonedb.json delete mode 100644 rag_engine/ovrawmjson/t_cluster.json delete mode 100644 rag_engine/ovrawmjson/t_cluster_space.json delete mode 100644 rag_engine/ovrawmjson/t_cnmf.json delete mode 100644 rag_engine/ovrawmjson/t_commot_flowsig.json delete mode 100644 rag_engine/ovrawmjson/t_cytotrace.json delete mode 100644 rag_engine/ovrawmjson/t_deg.json delete mode 100644 rag_engine/ovrawmjson/t_deseq2.json delete mode 100644 rag_engine/ovrawmjson/t_gptanno.json delete mode 100644 rag_engine/ovrawmjson/t_mapping.json delete mode 100644 rag_engine/ovrawmjson/t_metacells.json delete mode 100644 rag_engine/ovrawmjson/t_metatime.json delete mode 100644 rag_engine/ovrawmjson/t_mofa.json delete mode 100644 rag_engine/ovrawmjson/t_mofa_glue.json delete mode 100644 rag_engine/ovrawmjson/t_network.json delete mode 100644 rag_engine/ovrawmjson/t_nocd.json 
delete mode 100644 rag_engine/ovrawmjson/t_preprocess.json delete mode 100644 rag_engine/ovrawmjson/t_preprocess_cpu.json delete mode 100644 rag_engine/ovrawmjson/t_preprocess_gpu.json delete mode 100644 rag_engine/ovrawmjson/t_scdeg.json delete mode 100644 rag_engine/ovrawmjson/t_scdrug.json delete mode 100644 rag_engine/ovrawmjson/t_scmulan.json delete mode 100644 rag_engine/ovrawmjson/t_simba.json delete mode 100644 rag_engine/ovrawmjson/t_single_batch.json delete mode 100644 rag_engine/ovrawmjson/t_slat.json delete mode 100644 rag_engine/ovrawmjson/t_spaceflow.json delete mode 100644 rag_engine/ovrawmjson/t_stagate.json delete mode 100644 rag_engine/ovrawmjson/t_staligner.json delete mode 100644 rag_engine/ovrawmjson/t_starfysh.json delete mode 100644 rag_engine/ovrawmjson/t_stt.json delete mode 100644 rag_engine/ovrawmjson/t_tcga.json delete mode 100644 rag_engine/ovrawmjson/t_tosica.json delete mode 100644 rag_engine/ovrawmjson/t_traj.json delete mode 100644 rag_engine/ovrawmjson/t_via.json delete mode 100644 rag_engine/ovrawmjson/t_via_velo.json delete mode 100644 rag_engine/ovrawmjson/t_visualize_bulk.json delete mode 100644 rag_engine/ovrawmjson/t_visualize_colorsystem.json delete mode 100644 rag_engine/ovrawmjson/t_visualize_single.json delete mode 100644 rag_engine/ovrawmjson/t_wgcna.json delete mode 100644 rag_engine/rag_system.py delete mode 100644 rag_engine/requirements.txt diff --git a/rag_engine/PreViewRead101.txt b/rag_engine/PreViewRead101.txt deleted file mode 100644 index 9567797f..00000000 --- a/rag_engine/PreViewRead101.txt +++ /dev/null @@ -1,80 +0,0 @@ -This rag_engine is for preview only. It is not for production use. - -First time users should read the following instructions carefully. - -### Tutorial for RAG Engine Preview - -This tutorial will guide you through the setup and usage of the RAG Engine Preview. Follow the steps below to get started. 
- -#### Prerequisites - -Ensure you have the following installed: -- Python 3.12 -- Required Python packages: `langchain`, `langchain-community`, `sentence-transformers`, `numpy`, `faiss-cpu`, `chromadb`, `requests`, `psutil`, `prometheus_client`, `tenacity`, `streamlit` - -#### Step 1: Install Dependencies - -Install the necessary dependencies using pip: - -```bash -pip install langchain langchain-community sentence-transformers numpy faiss-cpu chromadb requests psutil prometheus_client tenacity streamlit -``` - -#### Step 2: Dive into the rag_engine_preview - -```bash -cd rag_engine -``` - -#### Step 3: Double-Check the Required Files - -Create the following files in your project directory: - -1. `rag_system.py` -2. `app.py` -3. `PreViewRead101.txt` -4. `ovrawjson folder` -5. `__init__.py` -6. `config.json` -7. `requirements.txt` - -Ensure the files are correctly named and placed in the project directory. - -#### Step 4: test the RAG System - -Run the app.py and rag_system.py to test the RAG System. - -#### Step 5: Run the Streamlit Application - -Navigate to the project directory and run the Streamlit application: - -```bash -streamlit run app.py -``` - -#### Step 6: Interact with the Application - -Attention: Your first time to initialize the RAG Engine Preview, you need to wait for a while to load the model. (Up to 30 minutes) - -Open the provided URL in your browser to interact with the RAG Engine Preview. You can enter queries and view the results processed by the RAG system. - -#### Step 7: Review the Logs - -Logs are generated in the `logs` directory. Review these logs to monitor the system's performance and debug any issues. - -#### Additional Information - -- **Configuration**: Modify the configuration settings in the Streamlit sidebar to customize the models and rate limits. -- **Health Checks**: Use the system health and status indicators in the sidebar to ensure the system is running correctly. 
-- **Query History**: View the history of queries processed by the system in the sidebar. - -This completes the setup and usage tutorial for the RAG Engine Preview. -For any issues or feedback, please refer to the provided documentation or contact the support team. - -In the Next Update, we will provide more features and improvements to enhance the user experience. -1. The Local Reasoning Engine has 90% ability to understand the context of the query compared to the OpenAI o1 model in the Bioinformatics domain. -2. The online model API supports the user to query the data from the SOTA model. -3. The Local elastic search engine is able to search the data from the local database. -4. The Local elastic knowledge base is able to update and delete the data from the local database. - -Love from 3910❤️ \ No newline at end of file diff --git a/rag_engine/__init__.py b/rag_engine/__init__.py deleted file mode 100644 index 114a8f88..00000000 --- a/rag_engine/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .rag_system import RAGSystem - -__version__ = "1.0.0" \ No newline at end of file diff --git a/rag_engine/app.py b/rag_engine/app.py deleted file mode 100644 index a13bfd84..00000000 --- a/rag_engine/app.py +++ /dev/null @@ -1,388 +0,0 @@ -import streamlit as st -import json -from datetime import datetime, timezone, timedelta -import os -import subprocess -import time -import requests -import getpass -import psutil -from pathlib import Path -import logging -from logging.handlers import RotatingFileHandler -from collections import OrderedDict -# Import the RAGSystem -from rag_system import RAGSystem, RAGLogger - -# Set up logging with rotating file handler -def setup_logging(): - log_dir = Path("logs") - log_dir.mkdir(exist_ok=True) - - handler = RotatingFileHandler( - log_dir / 'streamlit_app.log', - maxBytes=10*1024*1024, # 10 MB - backupCount=5 - ) - - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - 
logging.StreamHandler(), - handler - ] - ) - -setup_logging() - -# Initialize session state -def initialize_session_state(): - default_state = { - 'ollama_ready': False, - 'models_installed': False, - 'query_history': [], - 'rate_limiter': None, - 'query_cache': None, - 'config': { - 'file_selection_model': 'qwen2.5-coder:3b', - 'query_processing_model': 'qwen2.5-coder:7b', - 'rate_limit': 5, # seconds between queries - }, - 'current_time': datetime(2024, 12, 8, 13, 19, 36, tzinfo=timezone.utc), - 'current_user': 'HendricksJudy' - } - - for key, value in default_state.items(): - if key not in st.session_state: - st.session_state[key] = value - -initialize_session_state() - -# Cache for RAGSystem -@st.cache_resource -def get_rag_system(): - try: - json_directory = os.path.join(os.path.dirname(__file__), "ovrawmjson") - kbi_path = os.path.join(json_directory, "KBI.json") - return RAGSystem(json_directory, kbi_path) - except Exception as e: - logging.error(f"Failed to initialize RAG system: {str(e)}") - return None - -# System Monitor class with enhanced metrics -class SystemMonitor: - @staticmethod - def get_system_stats(): - process = psutil.Process() - memory = psutil.virtual_memory() - return { - 'memory_usage': process.memory_info().rss / 1024 / 1024, # MB - 'cpu_percent': psutil.cpu_percent(interval=1), - 'uptime': time.time() - process.create_time(), - 'system_memory': { - 'total': memory.total / (1024 ** 3), # GB - 'available': memory.available / (1024 ** 3), # GB - 'percent': memory.percent - } - } - - @staticmethod - def format_uptime(seconds): - return str(timedelta(seconds=int(seconds))) - -# RateLimiter class for query rate limiting -class RateLimiter: - def __init__(self, limit_seconds): - self.limit_seconds = limit_seconds - self.last_request_time = None - - def can_make_request(self): - if not self.last_request_time: - return True - time_since_last = time.time() - self.last_request_time - return time_since_last >= self.limit_seconds - - def 
time_until_next_request(self): - if not self.last_request_time: - return 0 - time_since_last = time.time() - self.last_request_time - return max(0, self.limit_seconds - time_since_last) - - def record_request(self): - self.last_request_time = time.time() - -# Initialize RateLimiter -if st.session_state['rate_limiter'] is None: - st.session_state['rate_limiter'] = RateLimiter(st.session_state['config']['rate_limit']) - -# QueryCache class for cache management -class QueryCache: - def __init__(self, max_size=1000): - self.cache = OrderedDict() - self.max_size = max_size - - def get(self, key): - return self.cache.get(key) - - def set(self, key, value): - self.cache[key] = value - self.cache.move_to_end(key) - if len(self.cache) > self.max_size: - self.cache.popitem(last=False) - -# Initialize QueryCache -if st.session_state['query_cache'] is None: - st.session_state['query_cache'] = QueryCache() - -# ConfigManager class for configuration management -class ConfigManager: - CONFIG_PATH = Path('config.json') - - @staticmethod - def load_config(): - if ConfigManager.CONFIG_PATH.exists(): - with open(ConfigManager.CONFIG_PATH, 'r') as f: - return json.load(f) - else: - return st.session_state['config'] - - @staticmethod - def save_config(config): - with open(ConfigManager.CONFIG_PATH, 'w') as f: - json.dump(config, f, indent=2) - -# Load configuration -st.session_state['config'] = ConfigManager.load_config() - -# Function to display the header -def show_header(): - col1, col2, col3 = st.columns([2, 1, 1]) - with col1: - st.title("Agentic OmicVerse 🧬") - with col2: - # Using the specified datetime - st.info(f"📅 UTC: {datetime(2024, 12, 8, 13, 20, 42, tzinfo=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}") - with col3: - # Using the specified username - st.info(f"👤 User: HendricksJudy") - -# Function to display system status -def show_system_status(): - stats = SystemMonitor.get_system_stats() - with st.sidebar: - st.header("System Status 📊") - col1, col2 = st.columns(2) - 
with col1: - st.metric("Memory (MB)", f"{stats['memory_usage']:.1f}") - st.metric("CPU %", f"{stats['cpu_percent']:.1f}") - with col2: - st.metric("Uptime", SystemMonitor.format_uptime(stats['uptime'])) - st.metric("Memory Usage %", f"{stats['system_memory']['percent']:.1f}") - st.progress(stats['system_memory']['percent'] / 100) - -# Function to check if Ollama server is running -def check_ollama_server() -> bool: - try: - response = requests.get("http://localhost:11434/api/version", timeout=5) - return response.status_code == 200 - except requests.RequestException: - return False - -# Function to display health status -def display_health_status(): - healthy, checks = check_system_health() - with st.sidebar: - st.header("System Health ✅" if healthy else "System Health ⚠️") - for component, status in checks.items(): - if status: - st.success(f"{component} is running") - else: - st.error(f"{component} is not running") - -# Function to perform health checks -def check_system_health(): - health_checks = { - 'Ollama Server': check_ollama_server(), - } - all_healthy = all(health_checks.values()) - return all_healthy, health_checks - -# Function to display configuration settings -def show_configuration(): - with st.sidebar: - st.header("Configuration ⚙️") - with st.expander("Model Settings"): - file_selection_model = st.selectbox( - "File Selection Model", - ["qwen2.5-coder:3b", "qwen2.5-coder:7b"], - index=["qwen2.5-coder:3b", "qwen2.5-coder:7b"].index( - st.session_state['config']['file_selection_model'] - ) - ) - query_processing_model = st.selectbox( - "Query Processing Model", - ["qwen2.5-coder:7b", "qwen2.5-coder:3b"], - index=["qwen2.5-coder:7b", "qwen2.5-coder:3b"].index( - st.session_state['config']['query_processing_model'] - ) - ) - rate_limit = st.slider( - "Rate Limit (seconds)", - min_value=1, - max_value=30, - value=st.session_state['config']['rate_limit'] - ) - - if st.button("Save Configuration"): - st.session_state['config'].update({ - 
'file_selection_model': file_selection_model, - 'query_processing_model': query_processing_model, - 'rate_limit': rate_limit - }) - ConfigManager.save_config(st.session_state['config']) - st.session_state['rate_limiter'] = RateLimiter(rate_limit) - st.success("Configuration saved successfully.") - - -# Function to process query with progress tracking -def process_query_with_progress(query, rag_system): - progress_bar = st.progress(0) - status_text = st.empty() - try: - status_text.text("Finding relevant document...") - progress_bar.progress(25) - relevant_file = rag_system.find_relevant_file(query) - status_text.text("Processing query...") - progress_bar.progress(50) - answer = rag_system.process_query(query, relevant_file) - status_text.text("Updating history...") - progress_bar.progress(75) - - # Using the specified datetime for query history - query_time = datetime(2024, 12, 8, 13, 21, 29, tzinfo=timezone.utc) - st.session_state.query_history.append({ - 'query': query, - 'file': relevant_file, - 'answer': answer, - 'timestamp': query_time, - 'user': 'HendricksJudy' - }) - - st.session_state['rate_limiter'].record_request() - progress_bar.progress(100) - status_text.text("Complete!") - time.sleep(1) - progress_bar.empty() - status_text.empty() - return relevant_file, answer - except Exception as e: - logging.error(f"Query processing error: {str(e)}") - progress_bar.empty() - status_text.text(f"Error: {e}") - raise e - - -# QueryManager class -class QueryManager: - @staticmethod - def validate_query(query): - if not query or len(query.strip()) < 3: - return False, "Query must be at least 3 characters long" - if len(query) > 1000: - return False, "Query must be less than 1000 characters" - return True, "" - - -# Function to display query history -def show_query_history(): - with st.sidebar: - st.header("Query History 📜") - for idx, item in enumerate(reversed(st.session_state.query_history[-10:])): - with st.expander(f"Query {len(st.session_state.query_history) - 
idx}: {item['query'][:30]}..."): - st.markdown(f"**Time:** {item['timestamp'].strftime('%Y-%m-%d %H:%M:%S')} UTC") - st.markdown(f"**User:** {item['user']}") - st.markdown(f"**Document:** {item['file']}") - st.markdown(f"**Answer:** {item['answer']}") - st.markdown("---") - - -# Main function -def main(): - show_header() - show_system_status() - display_health_status() - show_configuration() - - if st.button("Reset System"): - st.session_state.query_history = [] - st.session_state['rate_limiter'] = RateLimiter(st.session_state['config']['rate_limit']) - st.rerun() - - if not st.session_state['ollama_ready']: - if not check_ollama_server(): - st.error("❌ Ollama server is not running") - if st.button("🚀 Start Ollama Server"): - try: - subprocess.Popen(['ollama', 'serve']) - time.sleep(5) - if check_ollama_server(): - st.session_state['ollama_ready'] = True - st.success("✅ Ollama server started successfully") - st.rerun() - except FileNotFoundError: - st.error("❌ Ollama is not installed") - return - else: - st.session_state['ollama_ready'] = True - - rag_system = get_rag_system() - if rag_system is None: - st.error("Failed to initialize RAG system.") - return - - st.markdown("### Query Interface 🔍") - query = st.text_area( - "Enter your query:", - height=100, - placeholder="Enter your question about the documents..." 
- ) - - col1, col2 = st.columns([1, 5]) - with col1: - submit = st.button("🚀 Submit") - with col2: - if st.button("🗑️ Clear History"): - st.session_state.query_history = [] - st.rerun() - - if submit and query: - is_valid, error_message = QueryManager.validate_query(query) - if not is_valid: - st.error(error_message) - return - - if not st.session_state['rate_limiter'].can_make_request(): - wait_time = st.session_state['rate_limiter'].time_until_next_request() - st.warning(f"Please wait {wait_time:.1f} seconds before making another query.") - return - - try: - with st.spinner("Processing query..."): - relevant_file, answer = process_query_with_progress(query, rag_system) - st.success(f"📄 Selected document: {relevant_file}") - st.markdown("### Answer 💡") - st.markdown(answer) - except Exception as e: - logging.error(f"Query processing error: {str(e)}") - st.error(f"Error processing query: {str(e)}") - - show_query_history() - - -if __name__ == "__main__": - try: - main() - except Exception as e: - logging.error(f"Application error: {str(e)}") - st.error(f"An unexpected error occurred: {str(e)}") \ No newline at end of file diff --git a/rag_engine/config.json b/rag_engine/config.json deleted file mode 100644 index 28a9e3d5..00000000 --- a/rag_engine/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "file_selection_model": "qwen2.5-coder:3b", - "query_processing_model": "qwen2.5-coder:7b", - "rate_limit": 5 -} \ No newline at end of file diff --git a/rag_engine/ovrawmjson/KBI.json b/rag_engine/ovrawmjson/KBI.json deleted file mode 100644 index 00f38be4..00000000 --- a/rag_engine/ovrawmjson/KBI.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "files": [ - { - "name": "t_anno_trans.json", - "introduction": "This script demonstrates how to transfer cell type annotations between two modalities (RNA and ATAC) using a weighted KNN classifier. 
It loads preprocessed RNA and ATAC data, combines them, performs dimensionality reduction, visualizes the alignment, trains a KNN classifier on the annotated RNA data, transfers labels to the ATAC data, and visualizes the transferred labels and their uncertainty. Finally, it merges the data again and visualizes the combined data with transferred annotations." - }, - { - "name": "t_cluster.json", - "introduction": "This script demonstrates various clustering methods available in omicverse, including Leiden, Louvain, Gaussian Mixture Model (GMM), and Latent Dirichlet Allocation (LDA). It uses the dentategyrus dataset from scvelo and performs preprocessing steps such as normalization, scaling, and PCA. The script then applies each clustering method and visualizes the results using UMAP embeddings. It also includes LDA refinement using a random forest classifier and cNMF analysis for clustering and visualization, along with ARI calculation for evaluating clustering performance." - }, - { - "name": "t_bulk_combat.json", - "introduction": "This script demonstrates batch effect correction on bulk RNA-seq data using Combat. It loads three datasets, combines them, applies Combat, and saves the corrected data. It then visualizes the batch effect before and after correction using boxplots and PCA/UMAP." - }, - { - "name": "t_aucell.json", - "introduction": "This script demonstrates the usage of AUCell for pathway enrichment analysis in omicverse. It loads single cell data, performs necessary preprocessing, prepares pathway database, and then perform AUCell enrichment analysis on one geneset, more than one genesets and test genesets. Also, including differential gene expression analysis, gene expression visualization and pathway enrichment analysis." - }, - { - "name": "t_cellanno.json", - "introduction": "This script demonstrates cell annotation using pySCSA in omicverse. 
It preprocesses single-cell RNA-seq data from 10x Genomics, performs clustering and dimensionality reduction, and then annotates cells using two different databases ('cellmarker' and 'panglaodb'). The script visualizes the annotation results, calculates cell type proportions, visualizes embeddings with cell type annotations, computes and visualizes the ratio of observed to expected cell numbers (Ro/e), and identifies marker genes." - }, - { - "name": "t_cellfate_genesets.json", - "introduction": "This notebook demonstrates the use of CellFateGenie with gene sets for cell fate analysis. It loads spatial transcriptomics data, prepares gene sets, calculates pathway enrichment scores, initializes and trains a CellFateGenie model, performs adaptive threshold regression, plots filtering results, fits the model, visualizes color fitting, filters genes using Kendall's tau, calculates gene trends, plots gene trends and heatmap, and generates a gene set word cloud." - }, - { - "name": "t_cellfate_gene.json", - "introduction": "This script performs cell fate analysis using CellFateGenie at single-gene resolution. It starts by loading and preprocessing single-cell data, followed by dimensionality reduction and visualization. SEACells is employed to identify metacells. Then, pseudotime is computed using pyVIA, and CellFateGenie is applied to analyze gene trends, filter genes based on significance, visualize gene expression dynamics, and identify border and kernel genes associated with specific cell types." - }, - { - "name": "t_cellphonedb.json", - "introduction": "This script demonstrates the use of CellPhoneDB for cell-cell interaction analysis within the Omicverse framework. It includes data loading, preprocessing, cell-cell interaction inference, network visualization (heatmap, chord diagram, network graph), subnetwork analysis, identification of significant interactions, and downstream pathway enrichment analysis." 
- }, - { - "name": "t_cluster_space.json", - "introduction": "This script demonstrates several spatial clustering methods, including GraphST, STAGATE, BINARY, and CAST, using 10x Visium spatial transcriptomics data. It preprocesses the data, calculates spatially variable genes, applies each clustering method, performs cluster refinement using mclust, visualizes spatial distribution of clusters, and evaluates the performance using the Adjusted Rand Index (ARI)." - }, - { - "name": "t_cnmf.json", - "introduction": "This script demonstrates the usage of cNMF (consensus Non-negative Matrix Factorization) for identifying gene expression programs in single-cell data. It loads data, preprocesses it, performs cNMF, visualizes results (including K selection, consensus matrix, usage matrix), refines clusters with a Random Forest Classifier, and identifies marker genes." - }, - { - "name": "t_cytotrace.json", - "introduction": "This script uses the omicverse library to analyze single-cell RNA sequencing data. It loads a dataset, preprocesses it, and then applies CytoTRACE2 to predict cellular potency scores. The script visualizes the results on a UMAP embedding, coloring cells by cluster, CytoTRACE2 score, potency, and relative order." - }, - { - "name": "t_deg.json", - "introduction": "This script demonstrates differential gene expression analysis using omicverse. It loads count data, maps gene IDs, performs DEG analysis with t-test, filters genes, visualizes results with volcano and box plots, performs pathway enrichment analysis, and visualizes enrichment results with multi-geneset plots." - }, - { - "name": "t_deseq2.json", - "introduction": "This script demonstrates differential expression analysis using DESeq2 within the omicverse framework. It covers data loading, preprocessing, DEG analysis, filtering, visualization (volcano plot, boxplot), and pathway enrichment analysis." 
- }, - { - "name": "t_mapping.json", - "introduction": "This script demonstrates cell type mapping from single-cell RNA-seq data to spatial transcriptomics data using Tangram. It loads and preprocesses both datasets, trains the Tangram model, maps cell types to spatial locations, and visualizes the results." - }, - { - "name": "t_metacells.json", - "introduction": "This script demonstrates the use of MetaCell (SEACells) for identifying metacells from single-cell RNA-seq data using the omicverse package. It covers data loading, preprocessing, model training, cell type purity and benchmark evaluation, metacell prediction, visualization (UMAP plots with cell type labels and S_score), highly variable gene identification for the metacells, and visualization of metacell clusters on the UMAP embedding." - }, - { - "name": "t_mofa_glue.json", - "introduction": "This script demonstrates integration of single-cell RNA and ATAC data using MOFA (Multi-Omics Factor Analysis) within Omicverse. It includes pairing cells between RNA and ATAC using GLUE correlation analysis, constructing a MuData object, and selecting a sub group for further analysis. Also includes running MOFA, visualizing results (variance explained, factor correlation, feature weights, UMAP embedding), and analyzing gene weights." - }, - { - "name": "t_metatime.json", - "introduction": "This script demonstrates the usage of MetaTiME for inferring cell types in the tumor microenvironment (TME) from single-cell RNA-seq data. It loads the data, performs dimensionality reduction, initializes and trains the MetaTiME model, and visualizes the predicted cell types." - }, - { - "name": "t_mofa.json", - "introduction": "This script demonstrates Multi-Omics Factor Analysis (MOFA) using the omicverse library. 
It performs MOFA on scRNA-seq and scATAC-seq data, visualizes variance explained by factors, calculates factor correlation with cell types, retrieves gene weights, and performs visualization such as scatter plots of factors, UMAP embeddings colored by factors, and heatmaps of top features." - }, - { - "name": "t_network.json", - "introduction": "This script demonstrates STRING interaction analysis using omicverse for a set of genes in *Saccharomyces cerevisiae*. It retrieves interaction data, creates a pyPPI object, performs interaction analysis, and plots the interaction network." - }, - { - "name": "t_nocd.json", - "introduction": "This script demonstrates the use of scNOCD for non-overlapping community detection in single-cell RNA-seq data using the omicverse package. It reads and preprocesses scRNA-seq data, applies the scNOCD model, and then visualizes the results on a UMAP embedding colored by leiden clustering and nocd groups." - }, - { - "name": "t_preprocess_cpu.json", - "introduction": "This script demonstrates preprocessing of single-cell RNA-seq data using omicverse on a CPU. It includes data loading, quality control, normalization, highly variable gene selection, scaling, PCA, neighborhood graph computation, UMAP and MDE embedding, cell cycle scoring, clustering, visualization, and differential expression analysis." - }, - { - "name": "t_preprocess_gpu.json", - "introduction": "This script demonstrates how to preprocess single-cell RNA-seq data using Omicverse with GPU acceleration. It covers data loading, quality control, normalization, HVG selection, scaling, PCA, neighborhood graph construction, UMAP/MDE embedding, Leiden clustering, and visualizations." - }, - { - "name": "t_preprocess.json", - "introduction": "This script demonstrates the standard preprocessing workflow for single-cell RNA-seq data using Omicverse. 
It includes data loading, quality control, normalization, highly variable gene selection, scaling, PCA, UMAP/MDE embedding, clustering, and visualizations." - }, - { - "name": "t_scdeg.json", - "introduction": "This script demonstrates differential gene expression analysis for single-cell RNA-seq data using omicverse. It loads data, preprocesses, performs DEG analysis between 'Alpha' and 'Beta' cells using t-test, visualizes results (volcano plot, boxplot, UMAP), performs metacell analysis with SEACells, repeats the DEG analysis on metacells, and visualizes metacell DEG results." - }, - { - "name": "t_scdrug.json", - "introduction": "This script demonstrates drug response prediction using CaDRReS-Sc within Omicverse. It loads data, infers CNVs to identify tumor cells, preprocesses tumor cell data, performs clustering and then uses a downloaded CaDRReS model for drug response prediction." - }, - { - "name": "t_scmulan.json", - "introduction": "This script demonstrates cell type annotation using scMulan within Omicverse. It loads data, performs gene symbol transformation, normalizes, predicts cell types using a pretrained scMulan model, visualizes the predictions on a 2D embedding, applies smoothing to refine predictions, and provides functions for visualizing selected cell types." - }, - { - "name": "t_simba.json", - "introduction": "This script demonstrates single-cell integration using SIMBA within the omicverse framework. It includes data loading, preprocessing, graph generation and training for SIMBA model, batch correction, and visualization (MDE/UMAP) of the corrected data." - }, - { - "name": "t_single_batch.json", - "introduction": "This script demonstrates batch correction methods for single-cell RNA-seq data using omicverse. It loads multiple datasets, performs QC and preprocessing, applies batch correction using Harmony, Combat, Scanorama, scVI, MIRA (with LDA topic modeling), and benchmarks the methods using scib metrics." 
- }, - { - "name": "t_slat.json", - "introduction": "This script demonstrates the use of scSLAT for spatially resolved lineage tracing analysis using omicverse. It loads two spatial transcriptomics datasets, calculates spatial networks, runs SLAT to learn cell state embeddings, performs spatial matching between the two datasets, visualizes matching results (3D model, quality index, Sankey diagram), identifies matching cells based on cell type, performs trajectory analysis on a selected cell lineage, and analyzes differentially expressed genes between stages." - }, - { - "name": "t_spaceflow.json", - "introduction": "This script demonstrates the use of SpaceFlow for spatial transcriptomics analysis in omicverse. It loads spatial data, preprocesses it, trains the SpaceFlow model, calculates a pseudo-spatial map (pSM), visualizes the pSM, clusters cells using GMM, and compares predicted clusters with a ground truth." - }, - { - "name": "t_stagate.json", - "introduction": "This script demonstrates spatial transcriptomics analysis using STAGATE within Omicverse. It includes data loading, preprocessing, GraphST training (optional), STAGATE model training, STAGATE prediction, clustering, visualization of spatial clusters and gene expression, pseudo-spatial map calculation, and clustering performance evaluation using ARI." - }, - { - "name": "t_staligner.json", - "introduction": "This script demonstrates spatial transcriptomics alignment using STAligner within Omicverse. It loads multiple spatial datasets, preprocesses them, constructs spatial networks, concatenates datasets, trains the STAligner model, retrieves aligned embeddings, performs clustering and UMAP embedding on the aligned data, and visualizes spatial clustering results." - }, - { - "name": "t_starfysh.json", - "introduction": "This script demonstrates spatial transcriptomics cell type deconvolution using Starfysh. 
It loads spatial data and signature gene sets, preprocesses data and image, visualizes raw data, identifies anchor spots, performs Archetypal Analysis, refines anchor spots, trains the Starfysh model, and visualizes cell type proportions, gene expression, and other inferred features." - }, - { - "name": "t_stt.json", - "introduction": "This notebook demonstrates the use of Spatially resolved Transcript Time (STT) to infer cell lineages from spatial transcriptomic data. It covers data loading, preprocessing, model initialization, stage estimation, model training, visualizations of spatial and cluster patterns, pathway enrichment analysis and visualization, streamline visualization, Sankey diagram generation, and identification of genes with high multistability." - }, - { - "name": "t_tcga.json", - "introduction": "This script demonstrates how to use the pyTCGA class in omicverse for analyzing TCGA (The Cancer Genome Atlas) data. It initializes a pyTCGA object, imports raw count, FPKM, and TPM matrices, performs gene ID conversion, initializes patient metadata, imports survival data, and performs survival analysis for single genes (e.g., 'MYC') and for all genes, saving the updated data." - }, - { - "name": "t_tosica.json", - "introduction": "This script demonstrates the use of TOSICA for cell type prediction using the omicverse package. It loads reference and query datasets, preprocesses the data, trains a TOSICA model on the reference data, predicts cell types in the query data, performs dimensionality reduction and visualization on the query data with predicted labels, and analyzes differentially expressed pathways between predicted cell types." - }, - { - "name": "t_traj.json", - "introduction": "This script demonstrates trajectory inference using various methods available in Omicverse. 
It loads single-cell RNA-seq data, preprocesses it, performs trajectory inference using Diffusion Map, Slingshot, and Palantir, and visualizes pseudotime, PAGA graphs, and gene expression trends along trajectories." - }, - { - "name": "t_via_velo.json", - "introduction": "This notebook showcases trajectory inference using VIA with velocity information in omicverse. It loads data, performs preprocessing and velocity calculation, runs VIA, and visualizes the trajectory using various plots including pie chart graph, trajectory GAMs, stream plot, and lineage probabilities." - }, - { - "name": "t_via.json", - "introduction": "This script demonstrates trajectory inference using VIA (Visualization of RNA velocity in single cells) within Omicverse. It loads single-cell RNA-seq data, performs PCA, runs VIA, extracts pseudotime, and visualizes results including cluster graphs, trajectory GAMs, stream plots, lineage probabilities, and gene trends." - }, - { - "name": "t_visualize_bulk.json", - "introduction": "This script demonstrates visualization techniques for bulk RNA-seq data analysis using omicverse. It includes creating Venn diagrams, volcano plots for visualizing differentially expressed genes, and box plots with p-value annotations." - }, - { - "name": "t_visualize_colorsystem.json", - "introduction": "This script demonstrates the use of Omicverse's color system, particularly the Forbidden City color palette. It visualizes the color palette, retrieves specific colors by name or index, and uses these colors to customize plots, including UMAP embeddings, segmented colormaps, and color gradients." - }, - { - "name": "t_visualize_single.json", - "introduction": "This script demonstrates various visualization techniques for single-cell RNA-seq data analysis using Omicverse. 
It includes embedding plots, cell proportion histograms, stacked area graphs, convex hulls, contour plots, density plots, AUCell visualization, violin plots, bar-dot plots, box plots with statistical tests, complex heatmaps, and marker gene heatmaps." - }, - { - "name": "t_wgcna.json", - "introduction": "This script demonstrates Weighted Gene Co-expression Network Analysis (WGCNA) using the Omicverse library with bulk RNA-seq data. It covers data loading, preprocessing, network construction (including soft-thresholding power calculation, adjacency matrix, and TOM similarity matrix), module detection, visualization of the TOM matrix and sub-networks, module-trait relationship analysis, and identification of hub genes." - } - ] -} \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_anno_trans.json b/rag_engine/ovrawmjson/t_anno_trans.json deleted file mode 100644 index e17ae75e..00000000 --- a/rag_engine/ovrawmjson/t_anno_trans.json +++ /dev/null @@ -1,58 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, matplotlib, and scanpy. Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nimport matplotlib.pyplot as plt\nimport scanpy as sc\nov.ov_plot_set()" - }, - { - "action": "Load RNA and ATAC data from preprocessed files. These files are assumed to be outputs from GLUE cross-modal integration and contain the `obsm['X_glue']` layer. 
The RNA data is already annotated.", - "code": "rna=sc.read(\"data/analysis_lymph/rna-emb.h5ad\")\natac=sc.read(\"data/analysis_lymph/atac-emb.h5ad\")" - }, - { - "action": "Combine RNA and ATAC data into a single AnnData object for visualization.", - "code": "import scanpy as sc\ncombined=sc.concat([rna,atac],merge='same')\ncombined" - }, - { - "action": "Perform Minimum Distortion Embedding (MDE) on the combined data using the `X_glue` layer.", - "code": "combined.obsm['X_mde']=ov.utils.mde(combined.obsm['X_glue'])" - }, - { - "action": "Visualize the combined data using MDE, coloring by the 'domain' (RNA or ATAC) to check alignment.", - "code": "ov.utils.embedding(combined,\n basis='X_mde',\n color='domain',\n title='Layers',\n show=False,\n palette=ov.utils.red_color,\n frameon='small'\n )" - }, - { - "action": "Visualize the RNA data using MDE, coloring by the 'major_celltype' to show existing annotations.", - "code": "ov.utils.embedding(rna,\n basis='X_mde',\n color='major_celltype',\n title='Cell type',\n show=False,\n #palette=ov.utils.red_color,\n frameon='small'\n )" - }, - { - "action": "Train a weighted K-nearest neighbors (KNN) classifier using the `X_glue` features from the annotated RNA data.", - "code": "knn_transformer=ov.utils.weighted_knn_trainer(\n train_adata=rna,\n train_adata_emb='X_glue',\n n_neighbors=15,\n)" - }, - { - "action": "Transfer cell type labels from RNA to ATAC data using the trained KNN classifier. 
Calculate uncertainty for each prediction.", - "code": "labels,uncert=ov.utils.weighted_knn_transfer(\n query_adata=atac,\n query_adata_emb='X_glue',\n label_keys='major_celltype',\n knn_model=knn_transformer,\n ref_adata_obs=rna.obs,\n)" - }, - { - "action": "Assign the transferred cell type labels and uncertainty scores to the ATAC data.", - "code": "atac.obs[\"transf_celltype\"]=labels.loc[atac.obs.index,\"major_celltype\"]\natac.obs[\"transf_celltype_unc\"]=uncert.loc[atac.obs.index,\"major_celltype\"]" - }, - { - "action": "Copy the transferred cell type labels to the 'major_celltype' column in the ATAC data.", - "code": "atac.obs[\"major_celltype\"]=atac.obs[\"transf_celltype\"].copy()" - }, - { - "action": "Visualize the ATAC data using UMAP, coloring by the transferred cell type labels and their uncertainty.", - "code": "ov.utils.embedding(atac,\n basis='X_umap',\n color=['transf_celltype_unc','transf_celltype'],\n #title='Cell type Un',\n show=False,\n palette=ov.palette()[11:],\n frameon='small'\n )" - }, - { - "action": "Merge the RNA and ATAC data again after transferring annotations.", - "code": "import scanpy as sc\ncombined1=sc.concat([rna,atac],merge='same')\ncombined1" - }, - { - "action": "Perform MDE on the merged data after annotation transfer.", - "code": "combined1.obsm['X_mde']=ov.utils.mde(combined1.obsm['X_glue'])" - }, - { - "action": "Visualize the merged data using MDE, coloring by 'domain' and 'major_celltype' to assess the consistency of cell type annotations across modalities.", - "code": "ov.utils.embedding(combined1,\n basis='X_mde',\n color=['domain','major_celltype'],\n title=['Layers','Cell type'],\n show=False,\n palette=ov.palette()[11:],\n frameon='small'\n )" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_aucell.json b/rag_engine/ovrawmjson/t_aucell.json deleted file mode 100644 index 12aff785..00000000 --- a/rag_engine/ovrawmjson/t_aucell.json +++ /dev/null @@ -1,70 +0,0 @@ -[ - { - "action": "Import 
necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.utils.ov_plot_set()`. Download pathway database and gene ID annotation pair using `ov.utils.download_pathway_database()` and `ov.utils.download_geneid_annotation_pair()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\n\nov.utils.ov_plot_set()\n\nov.utils.download_pathway_database()\nov.utils.download_geneid_annotation_pair()" - }, - { - "action": "Load the pancreas dataset using `scv.datasets.pancreas()`. Print the AnnData object to inspect its contents.", - "code": "adata = scv.datasets.pancreas()\nadata" - }, - { - "action": "Check the maximum value in the `adata.X` matrix.", - "code": "adata.X.max()" - }, - { - "action": "Normalize the data to a total count of 1e4 per cell and log-transform it.", - "code": "sc.pp.normalize_total(adata, target_sum=1e4)\nsc.pp.log1p(adata)" - }, - { - "action": "Check the maximum value in the `adata.X` matrix after normalization and log-transformation.", - "code": "adata.X.max()" - }, - { - "action": "Prepare the gene set dictionary from the GO Biological Process 2021 file for the Mouse organism using `ov.utils.geneset_prepare()`.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2021.txt',organism='Mouse')" - }, - { - "action": "Assess the enrichment of a single gene set ('response to vitamin (GO:0033273)') using AUCell and visualize it on a UMAP embedding.", - "code": "##Assest one geneset\ngeneset_name='response to vitamin (GO:0033273)'\nov.single.geneset_aucell(adata,\n geneset_name=geneset_name,\n geneset=pathway_dict[geneset_name])\nsc.pl.embedding(adata,\n basis='umap',\n color=[\"{}_aucell\".format(geneset_name)])" - }, - { - "action": "Assess the enrichment of multiple gene sets ('response to vitamin (GO:0033273)' and 'response to vitamin D (GO:0033280)') using AUCell and visualize them on a UMAP embedding.", - "code": "##Assest more than one 
geneset\ngeneset_names=['response to vitamin (GO:0033273)','response to vitamin D (GO:0033280)']\nov.single.pathway_aucell(adata,\n pathway_names=geneset_names,\n pathways_dict=pathway_dict)\nsc.pl.embedding(adata,\n basis='umap',\n color=[i+'_aucell' for i in geneset_names])" - }, - { - "action": "Assess the enrichment of a custom gene set ('Sox') using AUCell and visualize it on a UMAP embedding.", - "code": "##Assest test geneset\nov.single.geneset_aucell(adata,\n geneset_name='Sox',\n geneset=['Sox17', 'Sox4', 'Sox7', 'Sox18', 'Sox5'])\nsc.pl.embedding(adata,\n basis='umap',\n color=[\"Sox_aucell\"])" - }, - { - "action": "Calculate AUCell enrichment scores for all pathways in the `pathway_dict` using multiple workers. Then, transfer metadata from the original `adata` object to the new `adata_aucs` object.", - "code": "##Assest all pathways\nadata_aucs=ov.single.pathway_aucell_enrichment(adata,\n pathways_dict=pathway_dict,\n num_workers=8)\n\nadata_aucs.obs=adata[adata_aucs.obs.index].obs\nadata_aucs.obsm=adata[adata_aucs.obs.index].obsm\nadata_aucs.obsp=adata[adata_aucs.obs.index].obsp\nadata_aucs" - }, - { - "action": "Save the `adata_aucs` object to an H5AD file and then read it back.", - "code": "adata_aucs.write_h5ad('data/pancreas_auce.h5ad',compression='gzip')\n\nadata_aucs=sc.read('data/pancreas_auce.h5ad')" - }, - { - "action": "Visualize the AUCell enrichment scores for the previously selected gene sets on a UMAP embedding of the `adata_aucs` object.", - "code": "sc.pl.embedding(adata_aucs,\n basis='umap',\n color=geneset_names)" - }, - { - "action": "Perform differential gene expression analysis on the `adata_aucs` object using the t-test method and visualize the top 3 differentially expressed genes per cluster using a dot plot.", - "code": "#adata_aucs.uns['log1p']['base']=None\nsc.tl.rank_genes_groups(adata_aucs, 'clusters', method='t-test',n_genes=100)\nsc.pl.rank_genes_groups_dotplot(adata_aucs,groupby='clusters',\n cmap='Spectral_r',\n 
standard_scale='var',n_genes=3)" - }, - { - "action": "Extract the names of differentially expressed genes in the 'Beta' cluster based on log2 fold change and p-value thresholds.", - "code": "degs = sc.get.rank_genes_groups_df(adata_aucs, group='Beta', key='rank_genes_groups', log2fc_min=2, \n pval_cutoff=0.05)['names'].squeeze()\ndegs" - }, - { - "action": "Visualize the expression of the differentially expressed genes and the cluster assignments on a UMAP embedding using `sc.pl.embedding`.", - "code": "import matplotlib.pyplot as plt\n#fig, axes = plt.subplots(4,3,figsize=(12,9))\naxes=sc.pl.embedding(adata_aucs,ncols=3,\n basis='umap',show=False,return_fig=True,wspace=0.55,hspace=0.65,\n color=['clusters']+degs.values.tolist(),\n title=[ov.utils.plot_text_set(i,3,20)for i in ['clusters']+degs.values.tolist()])\n\naxes.tight_layout()" - }, - { - "action": "Perform differential gene expression analysis on the original `adata` object using the t-test method.", - "code": "adata.uns['log1p']['base']=None\nsc.tl.rank_genes_groups(adata, 'clusters', method='t-test',n_genes=100)" - }, - { - "action": "Perform pathway enrichment analysis using `ov.single.pathway_enrichment` and visualize the results using `ov.single.pathway_enrichment_plot`.", - "code": "res=ov.single.pathway_enrichment(adata,pathways_dict=pathway_dict,organism='Mouse',\n group_by='clusters',plot=True)\n\nax=ov.single.pathway_enrichment_plot(res,plot_title='Enrichment',cmap='Reds',\n xticklabels=True,cbar=False,square=True,vmax=10,\n yticklabels=True,cbar_kws={'label': '-log10(qvalue)','shrink': 0.5,})" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_bulk_combat.json b/rag_engine/ovrawmjson/t_bulk_combat.json deleted file mode 100644 index dc6c061a..00000000 --- a/rag_engine/ovrawmjson/t_bulk_combat.json +++ /dev/null @@ -1,74 +0,0 @@ -[ - { - "action": "Import necessary libraries: anndata, pandas, and omicverse. 
Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import anndata\nimport pandas as pd\nimport omicverse as ov\nov.ov_plot_set()" - }, - { - "action": "Load the first dataset (GSE18520) from a pickle file, create an AnnData object, transpose it, and assign batch label '1'.", - "code": "dataset_1 = pd.read_pickle(\"data/combat/GSE18520.pickle\")\nadata1=anndata.AnnData(dataset_1.T)\nadata1.obs['batch']='1'\nadata1" - }, - { - "action": "Load the second dataset (GSE66957) from a pickle file, create an AnnData object, transpose it, and assign batch label '2'.", - "code": "dataset_2 = pd.read_pickle(\"data/combat/GSE66957.pickle\")\nadata2=anndata.AnnData(dataset_2.T)\nadata2.obs['batch']='2'\nadata2" - }, - { - "action": "Load the third dataset (GSE69428) from a pickle file, create an AnnData object, transpose it, and assign batch label '3'.", - "code": "dataset_3 = pd.read_pickle(\"data/combat/GSE69428.pickle\")\nadata3=anndata.AnnData(dataset_3.T)\nadata3.obs['batch']='3'\nadata3" - }, - { - "action": "Concatenate the three AnnData objects into a single AnnData object, keeping only the common genes.", - "code": "adata=anndata.concat([adata1,adata2,adata3],merge='same')\nadata" - }, - { - "action": "Perform batch effect correction on the combined AnnData object using the `ov.bulk.batch_correction` function, specifying 'batch' as the batch key.", - "code": "ov.bulk.batch_correction(adata,batch_key='batch')" - }, - { - "action": "Convert the raw data to a pandas DataFrame and transpose it.", - "code": "raw_data=adata.to_df().T\nraw_data.head()" - }, - { - "action": "Convert the batch-corrected data to a pandas DataFrame and transpose it.", - "code": "removing_data=adata.to_df(layer='batch_correction').T\nremoving_data.head()" - }, - { - "action": "Save the raw data and batch-corrected data to CSV files.", - "code": "raw_data.to_csv('raw_data.csv')\nremoving_data.to_csv('removing_data.csv')" - }, - { - "action": "Save the AnnData object to an H5AD file with 
gzip compression.", - "code": "adata.write_h5ad('adata_batch.h5ad',compression='gzip')\n#adata=ov.read('adata_batch.h5ad')" - }, - { - "action": "Define a dictionary to map batch labels to colors for visualization.", - "code": "color_dict={\n '1':ov.utils.red_color[1],\n '2':ov.utils.blue_color[1],\n '3':ov.utils.green_color[1],\n}" - }, - { - "action": "Create a boxplot of the raw data, coloring each box by its corresponding batch.", - "code": "fig,ax=plt.subplots( figsize = (20,4))\nbp=plt.boxplot(adata.to_df().T,patch_artist=True)\nfor i,batch in zip(range(adata.shape[0]),adata.obs['batch']):\n bp['boxes'][i].set_facecolor(color_dict[batch])\nax.axis(False)\nplt.show()" - }, - { - "action": "Create a boxplot of the batch-corrected data, coloring each box by its corresponding batch.", - "code": "fig,ax=plt.subplots( figsize = (20,4))\nbp=plt.boxplot(adata.to_df(layer='batch_correction').T,patch_artist=True)\nfor i,batch in zip(range(adata.shape[0]),adata.obs['batch']):\n bp['boxes'][i].set_facecolor(color_dict[batch])\nax.axis(False)\nplt.show()" - }, - { - "action": "Store a copy of the raw data in the 'raw' layer of the AnnData object.", - "code": "adata.layers['raw']=adata.X.copy()" - }, - { - "action": "Calculate principal components (PCs) for the raw data using `ov.pp.pca`.", - "code": "ov.pp.pca(adata,layer='raw',n_pcs=50)\nadata" - }, - { - "action": "Calculate principal components (PCs) for the batch-corrected data using `ov.pp.pca`.", - "code": "ov.pp.pca(adata,layer='batch_correction',n_pcs=50)\nadata" - }, - { - "action": "Create a UMAP embedding of the raw data, colored by batch.", - "code": "ov.utils.embedding(adata,\n basis='raw|original|X_pca',\n color='batch',\n frameon='small')" - }, - { - "action": "Create a UMAP embedding of the batch-corrected data, colored by batch.", - "code": "ov.utils.embedding(adata,\n basis='batch_correction|original|X_pca',\n color='batch',\n frameon='small')" - } -] \ No newline at end of file diff --git 
a/rag_engine/ovrawmjson/t_cellanno.json b/rag_engine/ovrawmjson/t_cellanno.json deleted file mode 100644 index f7df50f1..00000000 --- a/rag_engine/ovrawmjson/t_cellanno.json +++ /dev/null @@ -1,126 +0,0 @@ -[ - { - "action": "Import the omicverse library and print its version. Import the scanpy library and print its version. Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nprint(f'omicverse version:{ov.__version__}')\nimport scanpy as sc\nprint(f'scanpy version:{sc.__version__}')\nov.ov_plot_set()" - }, - { - "action": "Create a directory named 'data'. Download the PBMC3K filtered gene-barcode matrices from 10x Genomics and save them in the 'data' directory. Extract the downloaded tar.gz file in the 'data' directory. Create a directory named 'write' for writing processed data.", - "code": "# !mkdir data\n# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data in Matrix Market format into an AnnData object named `adata`. Use gene symbols for variable names and cache the data for faster subsequent reading.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading" - }, - { - "action": "Perform quality control on the AnnData object `adata` using the `ov.pp.qc` function. 
Filter cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes.", - "code": "#adata=ov.single.scanpy_lazy(adata)\n\n#quantity control\nadata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})" - }, - { - "action": "Preprocess the AnnData object `adata` using the `ov.pp.preprocess` function. Normalize the data using the 'shiftlog|pearson' mode and calculate 2000 highly variable genes (HVGs).", - "code": "#normalize and high variable genes (HVGs) calculated\nadata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)" - }, - { - "action": "Save the whole genes in `adata.raw` and filter out non-highly variable genes from `adata`.", - "code": "#save the whole genes and filter the non-HVGs\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]" - }, - { - "action": "Scale the data in `adata.X` using `ov.pp.scale`.", - "code": "#scale the adata.X\nov.pp.scale(adata)" - }, - { - "action": "Perform Principal Component Analysis (PCA) on the scaled data in `adata` using `ov.pp.pca`. Use the 'scaled' layer and calculate 50 principal components.", - "code": "#Dimensionality Reduction\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Construct a neighborhood graph using `sc.pp.neighbors`. Use 15 neighbors, 50 principal components, and the 'scaled|original|X_pca' representation.", - "code": "#Neighbourhood graph construction\nsc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph using `sc.tl.leiden`.", - "code": "#clusters\nsc.tl.leiden(adata)" - }, - { - "action": "Calculate Minimum Distortion Embedding (MDE) for visualization using `ov.utils.mde` and store the result in `adata.obsm[\"X_mde\"]`. 
Use the 'scaled|original|X_pca' representation as input.", - "code": "#Dimensionality Reduction for visualization(X_mde=X_umap+GPU)\nadata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])\nadata" - }, - { - "action": "Create a pySCSA object named `scsa` from the AnnData object `adata`. Set parameters for fold change, p-value, cell type, target, tissue, and model path.", - "code": "scsa=ov.single.pySCSA(adata=adata,\n foldchange=1.5,\n pvalue=0.01,\n celltype='normal',\n target='cellmarker',\n tissue='All',\n model_path='temp/pySCSA_2023_v2_plus.db' \n)" - }, - { - "action": "Perform cell annotation using the `scsa.cell_anno` method. Specify the clustering type as 'leiden', annotate all clusters, and calculate rank genes groups.", - "code": "anno=scsa.cell_anno(clustertype='leiden',\n cluster='all',rank_rep=True)" - }, - { - "action": "Query and display only the better-annotated results using `scsa.cell_auto_anno` and store the results in adata with the key 'scsa_celltype_cellmarker'.", - "code": "scsa.cell_auto_anno(adata,key='scsa_celltype_cellmarker')" - }, - { - "action": "Create a new pySCSA object named `scsa` with the same parameters as before, but change the target to 'panglaodb'.", - "code": "scsa=ov.single.pySCSA(adata=adata,\n foldchange=1.5,\n pvalue=0.01,\n celltype='normal',\n target='panglaodb',\n tissue='All',\n model_path='temp/pySCSA_2023_v2_plus.db'\n \n)" - }, - { - "action": "Perform cell annotation using the new `scsa` object with 'panglaodb' as the target.", - "code": "res=scsa.cell_anno(clustertype='leiden',\n cluster='all',rank_rep=True)" - }, - { - "action": "Print the cell annotation results using `scsa.cell_anno_print()`.", - "code": "scsa.cell_anno_print()" - }, - { - "action": "Query and display only the better-annotated results using `scsa.cell_auto_anno` and store the results in adata with the key 'scsa_celltype_panglaodb'.", - "code": "scsa.cell_auto_anno(adata,key='scsa_celltype_panglaodb')" - }, - { - "action": 
"Visualize the embeddings using `ov.utils.embedding`. Display the 'leiden' clusters, 'scsa_celltype_cellmarker' annotations, and 'scsa_celltype_panglaodb' annotations on the 'X_mde' embedding. Customize the legend, frame, and color palette.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden','scsa_celltype_cellmarker','scsa_celltype_panglaodb'], \n legend_loc='on data', \n frameon='small',\n legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Assign the first 1000 cells to group 'B' and the rest to group 'A' in a new column named 'group' in `adata.obs`. Visualize the 'group' on the 'X_mde' embedding using `ov.utils.embedding`.", - "code": "#Randomly designate the first 1000 cells as group B and the rest as group A\nadata.obs['group']='A'\nadata.obs.loc[adata.obs.index[:1000],'group']='B'\n#Colored\nov.utils.embedding(adata,\n basis='X_mde',\n color=['group'], \n frameon='small',legend_fontoutline=2,\n palette=ov.utils.red_color,\n )" - }, - { - "action": "Plot the cell type proportions using `ov.utils.plot_cellproportion`. Specify 'scsa_celltype_cellmarker' as the cell type clusters, 'group' as the visual clusters, and set the figure size.", - "code": "ov.utils.plot_cellproportion(adata=adata,celltype_clusters='scsa_celltype_cellmarker',\n visual_clusters='group',\n visual_name='group',figsize=(2,4))" - }, - { - "action": "Visualize the embeddings with cell type annotations using `ov.utils.plot_embedding_celltype`. Specify the 'X_mde' embedding, 'scsa_celltype_cellmarker' as the cell type key, and customize the title and ranges.", - "code": "ov.utils.plot_embedding_celltype(adata,figsize=None,basis='X_mde',\n celltype_key='scsa_celltype_cellmarker',\n title=' Cell type',\n celltype_range=(2,6),\n embedding_range=(4,10),)" - }, - { - "action": "Calculate the ratio of observed to expected cell numbers (Ro/e) for each cluster in different groups using `ov.utils.roe`. 
Specify 'group' as the sample key and 'scsa_celltype_cellmarker' as the cell type key.", - "code": "roe=ov.utils.roe(adata,sample_key='group',cell_type_key='scsa_celltype_cellmarker')" - }, - { - "action": "Create a heatmap to visualize the Ro/e values using `seaborn.heatmap`. Transform the Ro/e values into categorical labels ('+++', '++', '+', '+/-') for annotation. Customize the colormap, axis labels, and title.", - "code": "import seaborn as sns\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(2,4))\n\ntransformed_roe = roe.copy()\ntransformed_roe = transformed_roe.applymap(\n lambda x: '+++' if x >= 2 else ('++' if x >= 1.5 else ('+' if x >= 1 else '+/-')))\n\nsns.heatmap(roe, annot=transformed_roe, cmap='RdBu_r', fmt='', \n cbar=True, ax=ax,vmin=0.5,vmax=1.5,cbar_kws={'shrink':0.5})\nplt.xticks(fontsize=12)\nplt.yticks(fontsize=12)\n\nplt.xlabel('Group',fontsize=13)\nplt.ylabel('Cell type',fontsize=13)\nplt.title('Ro/e',fontsize=13)" - }, - { - "action": "Prepare a dictionary `res_marker_dict` containing marker genes for different cell types.", - "code": "res_marker_dict={\n 'Megakaryocyte':['ITGA2B','ITGB3'],\n 'Dendritic cell':['CLEC10A','IDO1'],\n 'Monocyte' :['S100A8','S100A9','LST1',],\n 'Macrophage':['CSF1R','CD68'],\n 'B cell':['MS4A1','CD79A','MZB1',],\n 'NK/NKT cell':['GNLY','KLRD1'],\n 'CD8+T cell':['CD8A','CD8B'],\n 'Treg':['CD4','CD40LG','IL7R','FOXP3','IL2RA'],\n 'CD4+T cell':['PTPRC','CD3D','CD3E'],\n\n}" - }, - { - "action": "Calculate a dendrogram for the 'leiden' clusters using `sc.tl.dendrogram`. Create a dot plot using `sc.pl.dotplot` to visualize the expression of marker genes from `res_marker_dict` in each 'leiden' cluster. 
Include the dendrogram and standardize the scale by variable.", - "code": "sc.tl.dendrogram(adata,'leiden')\nsc.pl.dotplot(adata, res_marker_dict, 'leiden', \n dendrogram=True,standard_scale='var')" - }, - { - "action": "Create a dictionary `cluster2annotation` to map 'leiden' cluster IDs to manual annotation labels based on the dot plot. Annotate the cells in `adata` using `ov.single.scanpy_cellanno_from_dict` based on the `cluster2annotation` dictionary and 'leiden' clustering.", - "code": "# create a dictionary to map cluster to annotation label\ncluster2annotation = {\n '0': 'T cell',\n '1': 'T cell',\n '2': 'Monocyte',#Germ-cell(Oid)\n '3': 'B cell',#Germ-cell(Oid)\n '4': 'T cell',\n '5': 'Macrophage',\n '6': 'NKT cells',\n '7': 'T cell',\n '8':'Monocyte',\n '9':'Dendritic cell',\n '10':'Megakaryocyte',\n\n}\nov.single.scanpy_cellanno_from_dict(adata,anno_dict=cluster2annotation,\n clustertype='leiden')" - }, - { - "action": "Compare the automatic annotation results ('scsa_celltype_cellmarker') with the manual annotation ('major_celltype') by visualizing them on the 'X_mde' embedding using `ov.utils.embedding`. Customize the legend, frame, and color palette.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color=['major_celltype','scsa_celltype_cellmarker'], \n legend_loc='on data', frameon='small',legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Obtain the marker genes for each cell type using `ov.single.get_celltype_marker`. 
Specify 'scsa_celltype_cellmarker' as the cluster type.", - "code": "marker_dict=ov.single.get_celltype_marker(adata,clustertype='scsa_celltype_cellmarker')\nmarker_dict.keys()" - }, - { - "action": "Print the marker genes for 'B cell' from the `marker_dict`.", - "code": "marker_dict['B cell']" - }, - { - "action": "Retrieve the available tissues in the database using `scsa.get_model_tissue()`.", - "code": "scsa.get_model_tissue()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cellfate.json b/rag_engine/ovrawmjson/t_cellfate.json deleted file mode 100644 index f8de33cd..00000000 --- a/rag_engine/ovrawmjson/t_cellfate.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, pandas, and tqdm. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport pandas as pd\nfrom tqdm.auto import tqdm\nov.plot_set()" - }, - { - "action": "Load the mouse hematopoiesis data from Nestorowa et al. 
(2016) using `ov.single.mouse_hsc_nestorowa16()`.", - "code": "adata = ov.single.mouse_hsc_nestorowa16()\nadata" - }, - { - "action": "Load the human prior interaction network from the 'nichenet' dataset using `ov.single.load_human_prior_interaction_network()`.", - "code": "prior_network = ov.single.load_human_prior_interaction_network(dataset='nichenet')" - }, - { - "action": "Convert the gene symbols in the prior network from human to mouse using `ov.single.convert_human_to_mouse_network()`.", - "code": "prior_network = ov.single.convert_human_to_mouse_network(prior_network,server_name='asia')\nprior_network" - }, - { - "action": "Save the converted prior network to a compressed tab-separated file.", - "code": "prior_network.to_csv('result/combined_network_Mouse.txt.gz',sep='\t')" - }, - { - "action": "Alternatively, read the prior network from the saved file using `ov.read()`.", - "code": "prior_network=ov.read('result/combined_network_Mouse.txt.gz',index_col=0)" - }, - { - "action": "Initialize the CEFCON object with the AnnData object, prior network, number of repeats, and solver.", - "code": "CEFCON_obj = ov.single.pyCEFCON(adata, prior_network, repeats=5, solver='GUROBI')\nCEFCON_obj" - }, - { - "action": "Preprocess the data for CEFCON analysis using `CEFCON_obj.preprocess()`.", - "code": "CEFCON_obj.preprocess()" - }, - { - "action": "Train the CEFCON model using `CEFCON_obj.train()`.", - "code": "CEFCON_obj.train()" - }, - { - "action": "Identify driver regulators for each lineage using `CEFCON_obj.predicted_driver_regulators()`.", - "code": "CEFCON_obj.predicted_driver_regulators()" - }, - { - "action": "Display the top driver regulators for the 'E_pseudotime' lineage.", - "code": "CEFCON_obj.cefcon_results_dict['E_pseudotime'].driver_regulator.head()" - }, - { - "action": "Predict regulon-like gene modules (RGMs) using `CEFCON_obj.predicted_RGM()`.", - "code": "CEFCON_obj.predicted_RGM()" - }, - { - "action": "Access the results for the 'E_pseudotime' 
lineage.", - "code": "CEFCON_obj.cefcon_results_dict['E_pseudotime']" - }, - { - "action": "Store the lineage name and results in variables.", - "code": "lineage = 'E_pseudotime'\nresult = CEFCON_obj.cefcon_results_dict[lineage]" - }, - { - "action": "Create an AnnData object from the gene embeddings.", - "code": "gene_ad=sc.AnnData(result.gene_embedding)" - }, - { - "action": "Compute the neighborhood graph of the gene embeddings.", - "code": "sc.pp.neighbors(gene_ad, n_neighbors=30, use_rep='X')" - }, - { - "action": "Perform Leiden clustering on the gene embeddings.", - "code": "sc.tl.leiden(gene_ad, resolution=1)" - }, - { - "action": "Compute UMAP embeddings for the gene embeddings.", - "code": "sc.tl.umap(gene_ad, n_components=2, min_dist=0.3)" - }, - { - "action": "Plot the Leiden clustering results on the UMAP embeddings.", - "code": "ov.utils.embedding(gene_ad,basis='X_umap',legend_loc='on data',\n legend_fontsize=8, legend_fontoutline=2,\n color='leiden',frameon='small',title='Leiden clustering using CEFCON\\nderived gene embeddings')" - }, - { - "action": "Prepare data for plotting influence scores of driver regulators.", - "code": "import matplotlib.pyplot as plt\nimport seaborn as sns\ndata_for_plot = result.driver_regulator[result.driver_regulator['is_driver_regulator']]\ndata_for_plot = data_for_plot[0:20]" - }, - { - "action": "Create a horizontal bar plot of influence scores for the top 20 driver regulators.", - "code": "plt.figure(figsize=(2, 20 * 0.2))\nsns.set_theme(style='ticks', font_scale=0.5)\n\nax = sns.barplot(x='influence_score', y=data_for_plot.index, data=data_for_plot, orient='h',\n palette=sns.color_palette(f\"ch:start=.5,rot=-.5,reverse=1,dark=0.4\", n_colors=20))\nax.set_title(result.name)\nax.set_xlabel('Influence score')\nax.set_ylabel('Driver regulators')\n\nax.spines['left'].set_position(('outward', 10))\nax.spines['bottom'].set_position(('outward', 
10))\nplt.xticks(fontsize=12)\nplt.yticks(fontsize=12)\n\nplt.grid(False)\nax.spines['top'].set_visible(False)\nax.spines['right'].set_visible(False)\nax.spines['bottom'].set_visible(True)\nax.spines['left'].set_visible(True)\n\nplt.title('E_pseudotime',fontsize=12)\nplt.xlabel('Influence score',fontsize=12)\nplt.ylabel('Driver regulon',fontsize=12)\n\nsns.despine()" - }, - { - "action": "Plot a Venn diagram of driver genes using `result.plot_driver_genes_Venn()`.", - "code": "result.plot_driver_genes_Venn()" - }, - { - "action": "Create a subset of the AnnData object containing cells from the specific lineage.", - "code": "adata_lineage = adata[adata.obs_names[adata.obs[result.name].notna()],:]" - }, - { - "action": "Plot a heatmap of the RGM activity matrix using `result.plot_RGM_activity_heatmap()`.", - "code": "result.plot_RGM_activity_heatmap(cell_label=adata_lineage.obs['cell_type_finely'],\n type='out',col_cluster=True,bbox_to_anchor=(1.48, 0.25))" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cellfate_gene.json b/rag_engine/ovrawmjson/t_cellfate_gene.json deleted file mode 100644 index 2255abde..00000000 --- a/rag_engine/ovrawmjson/t_cellfate_gene.json +++ /dev/null @@ -1,186 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scvelo, and matplotlib. 
Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scvelo as scv\nimport matplotlib.pyplot as plt\nov.ov_plot_set()" - }, - { - "action": "Load the dentategyrus dataset using `scv.datasets.dentategyrus()`.", - "code": "adata = scv.datasets.dentategyrus()\nadata" - }, - { - "action": "Perform quality control on the dataset using `ov.pp.qc()`, filtering cells based on mitochondrial percentage, number of UMIs, and number of detected genes.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.15, 'nUMIs': 500, 'detected_genes': 250},\n )" - }, - { - "action": "Store the raw counts in the 'counts' layer using `ov.utils.store_layers()`.", - "code": "ov.utils.store_layers(adata,layers='counts')\nadata" - }, - { - "action": "Preprocess the dataset using `ov.pp.preprocess()` with 'shiftlog|pearson' mode and selecting 2000 highly variable genes.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',\n n_HVGs=2000)" - }, - { - "action": "Store the raw data in `adata.raw` and subset the data to include only highly variable genes.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Scale the data using `ov.pp.scale()` and perform PCA using `ov.pp.pca()` on the scaled data with 50 principal components. 
Then, apply MDE to the PCA results.", - "code": "ov.pp.scale(adata)\nov.pp.pca(adata,layer='scaled',n_pcs=50)\n\nadata.obsm[\"X_mde_pca\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])" - }, - { - "action": "Convert the raw data back to an AnnData object.", - "code": "adata=adata.raw.to_adata()" - }, - { - "action": "Create an embedding plot using `ov.utils.embedding()` based on 'X_mde_pca' and color the points by 'clusters'.", - "code": "fig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(adata,\n basis='X_mde_pca',frameon='small',\n color=['clusters'],show=False,ax=ax)" - }, - { - "action": "Filter out cells belonging to the 'Endothelial' cluster and initialize a SEACells model.", - "code": "import SEACells\nadata=adata[adata.obs['clusters']!='Endothelial']\nmodel = SEACells.core.SEACells(adata, \n build_kernel_on='scaled|original|X_pca', \n n_SEACells=200, \n n_waypoint_eigs=10,\n convergence_epsilon = 1e-5)" - }, - { - "action": "Construct the kernel matrix using the initialized SEACells model.", - "code": "model.construct_kernel_matrix()\nM = model.kernel_matrix\n# Initialize archetypes\nmodel.initialize_archetypes()" - }, - { - "action": "Fit the SEACells model with a minimum of 10 and a maximum of 50 iterations.", - "code": "model.fit(min_iter=10, max_iter=50)" - }, - { - "action": "Plot the convergence of the SEACells model.", - "code": "# Check for convergence \nget_ipython().run_line_magic('matplotlib', 'inline')\nmodel.plot_convergence()" - }, - { - "action": "Force the model to run additional iterations using the `model.step()` function.", - "code": "# You can force the model to run additional iterations step-wise using the .step() function\nprint(f'Run for {len(model.RSS_iters)} iterations')\nfor _ in range(10):\n model.step()\nprint(f'Run for {len(model.RSS_iters)} iterations')" - }, - { - "action": "Plot the convergence of the SEACells model again.", - "code": "# Check for convergence \nget_ipython().run_line_magic('matplotlib', 
'inline')\nmodel.plot_convergence()" - }, - { - "action": "Plot a 2D representation of the Dentategyrus Metacells using `SEACells.plot.plot_2D()`.", - "code": "get_ipython().run_line_magic('matplotlib', 'inline')\nSEACells.plot.plot_2D(adata, key='X_mde_pca', colour_metacells=False,\n figsize=(4,4),cell_size=20,title='Dentategyrus Metacells',\n )" - }, - { - "action": "Set `adata.raw` to a copy of `adata`.", - "code": "adata.raw=adata.copy()" - }, - { - "action": "Summarize the data by soft SEACells using `SEACells.core.summarize_by_soft_SEACell()`.", - "code": "SEACell_soft_ad = SEACells.core.summarize_by_soft_SEACell(adata, model.A_, \n celltype_label='clusters',\n summarize_layer='raw', minimum_weight=0.05)\nSEACell_soft_ad" - }, - { - "action": "Set `SEACell_soft_ad.raw` to a copy of `SEACell_soft_ad` and identify highly variable genes.", - "code": "import scanpy as sc\nSEACell_soft_ad.raw=SEACell_soft_ad.copy()\nsc.pp.highly_variable_genes(SEACell_soft_ad, n_top_genes=2000, inplace=True)\nSEACell_soft_ad=SEACell_soft_ad[:,SEACell_soft_ad.var.highly_variable]" - }, - { - "action": "Scale the data in `SEACell_soft_ad`, perform PCA, compute neighbors, and generate a UMAP embedding.", - "code": "ov.pp.scale(SEACell_soft_ad)\nov.pp.pca(SEACell_soft_ad,layer='scaled',n_pcs=50)\nsc.pp.neighbors(SEACell_soft_ad, use_rep='scaled|original|X_pca')\nsc.tl.umap(SEACell_soft_ad)" - }, - { - "action": "Set the 'celltype' observation to a categorical type and reorder categories based on `adata.obs['clusters']`. 
Also, set the color palette.", - "code": "SEACell_soft_ad.obs['celltype']=SEACell_soft_ad.obs['celltype'].astype('category')\nSEACell_soft_ad.obs['celltype']=SEACell_soft_ad.obs['celltype'].cat.reorder_categories(adata.obs['clusters'].cat.categories)\nSEACell_soft_ad.uns['celltype_colors']=adata.uns['clusters_colors']" - }, - { - "action": "Create an embedding plot of the metacells using `ov.utils.embedding()` based on 'X_umap' and color the points by 'celltype'.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(SEACell_soft_ad,\n basis='X_umap',\n color=[\"celltype\"],\n title='Meta Celltype',\n frameon='small',\n legend_fontsize=12,\n #palette=ov.utils.palette()[11:],\n ax=ax,\n show=False)" - }, - { - "action": "Calculate pseudotime using pyVIA with manually adjusted parameters.", - "code": "v0 = ov.single.pyVIA(adata=SEACell_soft_ad,adata_key='scaled|original|X_pca',\n adata_ncomps=50, basis='X_umap',\n clusters='celltype',knn=10, root_user=['nIPC','Neuroblast'],\n dataset='group', \n random_seed=112,is_coarse=True, \n preserve_disconnected=True,\n piegraph_arrow_head_width=0.05,piegraph_edgeweight_scalingfactor=2.5,\n gene_matrix=SEACell_soft_ad.X,velo_weight=0.5,\n edgebundle_pruning_twice=False, edgebundle_pruning=0.15, \n jac_std_global=0.05,too_big_factor=0.05,\n cluster_graph_pruning_std=1,\n time_series=False,\n )\n\nv0.run()" - }, - { - "action": "Obtain the pseudotime values and store them in the `SEACell_soft_ad` object.", - "code": "v0.get_pseudotime(SEACell_soft_ad)" - }, - { - "action": "Create an embedding plot using `ov.utils.embedding()` based on 'X_umap' and color the points by 'pt_via' (pseudotime).", - "code": "#v0.get_pseudotime(SEACell_soft_ad)\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(SEACell_soft_ad,\n basis='X_umap',\n color=[\"pt_via\"],\n title='Pseudotime',\n frameon='small',\n cmap='Reds',\n #size=40,\n legend_fontsize=12,\n 
#palette=ov.utils.palette()[11:],\n ax=ax,\n show=False)" - }, - { - "action": "Save the metacell results to an h5ad file.", - "code": "SEACell_soft_ad.write_h5ad('data/tutorial_meta_den.h5ad',compression='gzip')" - }, - { - "action": "Read the metacell results from the h5ad file.", - "code": "SEACell_soft_ad=ov.utils.read('data/tutorial_meta_den.h5ad')" - }, - { - "action": "Initialize the cellfategenie object with the metacell data and pseudotime.", - "code": "cfg_obj=ov.single.cellfategenie(SEACell_soft_ad,pseudotime='pt_via')\ncfg_obj.model_init()" - }, - { - "action": "Perform Adaptive Threshold Regression (ATR) to find the minimum number of genes for accurate regression.", - "code": "cfg_obj.ATR(stop=500,flux=0.01)" - }, - { - "action": "Plot the filtering results from the ATR analysis.", - "code": "fig,ax=cfg_obj.plot_filtering(color='#5ca8dc')\nax.set_title('Dentategyrus Metacells\\nCellFateGenie')" - }, - { - "action": "Fit the model and obtain the results.", - "code": "res=cfg_obj.model_fit()" - }, - { - "action": "Plot the color fitting for the raw data type, colored by cell type.", - "code": "cfg_obj.plot_color_fitting(type='raw',cluster_key='celltype')" - }, - { - "action": "Plot the color fitting for the filtered data type, colored by cell type.", - "code": "cfg_obj.plot_color_fitting(type='filter',cluster_key='celltype')" - }, - { - "action": "Perform Kendalltau test to filter genes based on trend significance.", - "code": "kt_filter=cfg_obj.kendalltau_filter()\nkt_filter.head()" - }, - { - "action": "Extract gene names with p-value less than the mean p-value and calculate gene trends.", - "code": "var_name=kt_filter.loc[kt_filter['pvalue']=0.\n result_precision = 3, # Sets the rounding for the mean values in significan_means.\n pvalue = 0.05, # P-value threshold to employ for significance.\n subsampling = False, # To enable subsampling the data (geometri sketching).\n subsampling_log = False, # (mandatory) enable subsampling log1p for non 
log-transformed data inputs.\n subsampling_num_pc = 100, # Number of componets to subsample via geometric skectching (dafault: 100).\n subsampling_num_cells = 1000, # Number of cells to subsample (integer) (default: 1/3 of the dataset).\n separator = '|', # Sets the string to employ to separate cells in the results dataframes \"cellA|CellB\".\n debug = False, # Saves all intermediate tables employed during the analysis in pkl format.\n output_path = out_path, # Path to save results.\n output_suffix = None # Replaces the timestamp in the output files by a user defined string in the (default: None).\n )" - }, - { - "action": "Save the CellPhoneDB results to a pickle file.", - "code": "ov.utils.save(cpdb_results,'data/cpdb/gex_cpdb_test.pkl')" - }, - { - "action": "Load the CellPhoneDB results from a pickle file.", - "code": "cpdb_results=ov.utils.load('data/cpdb/gex_cpdb_test.pkl')" - }, - { - "action": "Calculate cell-cell interaction network using `ov.single.cpdb_network_cal`.", - "code": "interaction=ov.single.cpdb_network_cal(adata = adata,\n pvals = cpdb_results['pvalues'],\n celltype_key = \"cell_labels\",)" - }, - { - "action": "Display the head of the interaction edges DataFrame.", - "code": "interaction['interaction_edges'].head()" - }, - { - "action": "Set the plotting style using `ov.plot_set()`.", - "code": "ov.plot_set()" - }, - { - "action": "Create and display a heatmap of cell-cell interactions.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_heatmap(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n fontsize=11,\n ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',})" - }, - { - "action": "Create and display a heatmap of cell-cell interactions for specific source cells.", - "code": "fig, ax = plt.subplots(figsize=(2,4)) \nov.pl.cpdb_heatmap(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n source_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'],\n 
ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',})" - }, - { - "action": "Create and display a chord diagram of cell-cell interactions.", - "code": "fig=ov.pl.cpdb_chord(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n count_min=60,fontsize=12,padding=50,radius=100,save=None,)\nfig.show()" - }, - { - "action": "Create and display a network graph of cell-cell interactions.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n counts_min=60,\n nodesize_scale=5,\n ax=ax)" - }, - { - "action": "Create and display a network graph of cell-cell interactions for specific source cells.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n counts_min=60,\n nodesize_scale=5,\n source_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'],\n ax=ax)" - }, - { - "action": "Create and display a network graph of cell-cell interactions for specific target cells.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n counts_min=60,\n nodesize_scale=5,\n target_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'],\n ax=ax)" - }, - { - "action": "Plot a network of cell-cell interactions with customized appearance.", - "code": "ov.single.cpdb_plot_network(adata=adata,\n interaction_edges=interaction['interaction_edges'],\n celltype_key='cell_labels',\n nodecolor_dict=None,title='EVT Network',\n edgeswidth_scale=25,nodesize_scale=10,\n pos_scale=1,pos_size=10,figsize=(6,6),\n legend_ncol=3,legend_bbox=(0.8,0.2),legend_fontsize=10)" - }, - { - "action": "Extract a subnetwork of interactions based on specified cell types.", - "code": 
"sub_i=interaction['interaction_edges']\nsub_i=sub_i.loc[sub_i['SOURCE'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])]\nsub_i=sub_i.loc[sub_i['TARGET'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])]" - }, - { - "action": "Create a sub-anndata object containing only specified cell types.", - "code": "sub_adata=adata[adata.obs['cell_labels'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])]\nsub_adata" - }, - { - "action": "Plot the sub-interaction network between the cells in scRNA-seq for the extracted subnetwork.", - "code": "ov.single.cpdb_plot_network(adata=sub_adata,\n interaction_edges=sub_i,\n celltype_key='cell_labels',\n nodecolor_dict=None,title='Sub-EVT Network',\n edgeswidth_scale=25,nodesize_scale=1,\n pos_scale=1,pos_size=10,figsize=(5,5),\n legend_ncol=3,legend_bbox=(0.8,0.2),legend_fontsize=10)" - }, - { - "action": "Create and display a chord diagram for the subnetwork.", - "code": "fig=ov.pl.cpdb_chord(sub_adata,sub_i,celltype_key='cell_labels',\n count_min=10,fontsize=12,padding=60,radius=100,save=None,)\nfig.show()" - }, - { - "action": "Create and display a network graph for the subnetwork.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(sub_adata,sub_i,celltype_key='cell_labels',\n counts_min=10,\n nodesize_scale=5,\n ax=ax)" - }, - { - "action": "Create and display a heatmap for the subnetwork.", - "code": "fig, ax = plt.subplots(figsize=(3,3)) \nov.pl.cpdb_heatmap(sub_adata,sub_i,celltype_key='cell_labels',\n ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',})" - }, - { - "action": "Extract significant interactions where 'eEVT' and 'iEVT' are targets using `ov.single.cpdb_exact_target`.", - "code": "sub_means=ov.single.cpdb_exact_target(cpdb_results['means'],['eEVT','iEVT'])\nsub_means=ov.single.cpdb_exact_source(sub_means,['dNK1','dNK2','dNK3'])\nsub_means.head() " - }, - { - "action": "Plot a heatmap of interacting proteins between specified source and target cells.", - "code": 
"ov.pl.cpdb_interacting_heatmap(adata=adata,\n celltype_key='cell_labels',\n means=cpdb_results['means'],\n pvalues=cpdb_results['pvalues'],\n source_cells=['dNK1','dNK2','dNK3'],\n target_cells=['eEVT','iEVT'],\n plot_secret=True,\n min_means=3,\n nodecolor_dict=None,\n ax=None,\n figsize=(2,6),\n fontsize=10,)" - }, - { - "action": "Plot a grouped heatmap showing the expression of ligands in source cells and receptors in target cells.", - "code": "ov.pl.cpdb_group_heatmap(adata=adata,\n celltype_key='cell_labels',\n means=cpdb_results['means'],\n cmap={'Target':'Blues','Source':'Reds'},\n source_cells=['dNK1','dNK2','dNK3'],\n target_cells=['eEVT','iEVT'],\n plot_secret=True,\n min_means=3,\n nodecolor_dict=None,\n ax=None,\n figsize=(2,6),\n fontsize=10,)" - }, - { - "action": "Plot an interacting network graph showing connections between ligands, receptors, source, and target cells.", - "code": "ov.pl.cpdb_interacting_network(adata=adata,\n celltype_key='cell_labels',\n means=cpdb_results['means'],\n source_cells=['dNK1','dNK2','dNK3'],\n target_cells=['eEVT','iEVT'],\n means_min=1,\n means_sum_min=1, \n nodecolor_dict=None,\n ax=None,\n figsize=(6,6),\n fontsize=10)" - }, - { - "action": "Filter out rows with missing gene_a or gene_b, and combine gene_a and gene_b into a single list for enrichment analysis.", - "code": "sub_means=sub_means.loc[~sub_means['gene_a'].isnull()]\nsub_means=sub_means.loc[~sub_means['gene_b'].isnull()]\nenrichr_genes=sub_means['gene_a'].tolist()+sub_means['gene_b'].tolist()" - }, - { - "action": "Prepare a pathway dictionary for gene set enrichment analysis using human organism data.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Human')" - }, - { - "action": "Perform gene set enrichment analysis on the list of genes using the prepared pathway dictionary.", - "code": 
"#deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()\nenr=ov.bulk.geneset_enrichment(gene_list=enrichr_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='human')" - }, - { - "action": "Set the plotting style and create a gene set enrichment plot with specified parameters.", - "code": "ov.plot_set()\nov.bulk.geneset_plot(enr,figsize=(2,4),fig_title='GO-Bio(EVT)',\n cax_loc=[2, 0.45, 0.5, 0.02],num=8,\n bbox_to_anchor_used=(-0.25, -13),custom_ticks=[10,100],\n cmap='Greens')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cluster.json b/rag_engine/ovrawmjson/t_cluster.json deleted file mode 100644 index 01e9bb21..00000000 --- a/rag_engine/ovrawmjson/t_cluster.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\nov.plot_set()" - }, - { - "action": "Import the scvelo library and load the dentategyrus dataset using `scv.datasets.dentategyrus()`. The dataset is stored in the `adata` variable.", - "code": "import scvelo as scv\nadata=scv.datasets.dentategyrus()\nadata" - }, - { - "action": "Preprocess the `adata` object using `ov.pp.preprocess()`. The preprocessing steps include shifting and logging the data, applying Pearson residuals, and selecting the top 3000 highly variable genes. The preprocessed data is stored back in `adata`. The raw data is saved in `adata.raw`, and only the highly variable genes are kept. 
Finally, the data is scaled and PCA is performed using `ov.pp.scale()` and `ov.pp.pca()`.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=3000,)\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nov.pp.scale(adata)\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Plot the variance ratio explained by each principal component using `ov.utils.plot_pca_variance_ratio()`.", - "code": "ov.utils.plot_pca_variance_ratio(adata)" - }, - { - "action": "Compute the k-nearest neighbor graph using `sc.pp.neighbors()`. The number of neighbors is set to 15, the number of principal components is set to 50, and the representation used is 'scaled|original|X_pca'. Then, perform Leiden clustering using `ov.utils.cluster()` with `method='leiden'` and `resolution=1`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')\nov.utils.cluster(adata,method='leiden',resolution=1)" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and Leiden clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','leiden'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','leiden'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Compute the k-nearest neighbor graph using `sc.pp.neighbors()`. The number of neighbors is set to 15, the number of principal components is set to 50, and the representation used is 'scaled|original|X_pca'. Then, perform Louvain clustering using `ov.utils.cluster()` with `method='louvain'` and `resolution=1`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')\nov.utils.cluster(adata,method='louvain',resolution=1)" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and Louvain clustering results using `ov.utils.embedding()`. 
The `basis` is set to 'X_umap', `color` is set to ['clusters','louvain'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','louvain'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Perform Gaussian Mixture Model (GMM) clustering using `ov.utils.cluster()`. The representation used is 'scaled|original|X_pca', `method` is set to 'GMM', `n_components` is set to 21, `covariance_type` is set to 'full', `tol` is set to 1e-9, and `max_iter` is set to 1000.", - "code": "ov.utils.cluster(adata,use_rep='scaled|original|X_pca',\n method='GMM',n_components=21,\n covariance_type='full',tol=1e-9, max_iter=1000, )" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and GMM clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','gmm_cluster'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','gmm_cluster'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Initialize the Latent Dirichlet Allocation (LDA) model using `ov.utils.LDA_topic()`. 
The `feature_type` is set to 'expression', `highly_variable_key` is set to 'highly_variable_features', `layers` is set to 'counts', `batch_key` is set to None, and `learning_rate` is set to 1e-3.", - "code": "LDA_obj=ov.utils.LDA_topic(adata,feature_type='expression',\n highly_variable_key='highly_variable_features',\n layers='counts',batch_key=None,learning_rate=1e-3)" - }, - { - "action": "Plot the topic contributions for the first 6 topics using `LDA_obj.plot_topic_contributions()`.", - "code": "LDA_obj.plot_topic_contributions(6)" - }, - { - "action": "Predict the topic distribution for each cell using 13 topics with `LDA_obj.predicted()`.", - "code": "LDA_obj.predicted(13)" - }, - { - "action": "Generate UMAP embeddings and visualize the distribution of topics across cells using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to `LDA_obj.model.topic_cols`, `cmap` is set to 'BuPu', `ncols` is set to 4, `add_outline` is set to True, and `frameon` is set to 'small'.", - "code": "ov.plot_set()\nov.utils.embedding(adata, basis='X_umap',color = LDA_obj.model.topic_cols, cmap='BuPu', ncols=4,\n add_outline=True, frameon='small',)" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and LDA clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','LDA_cluster'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','LDA_cluster'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Refine the LDA clustering results using a random forest classifier. Cells with LDA greater than 0.4 are used as a primitive class. The random forest model is trained on these cells and then used to classify cells with LDA less than 0.4. 
This is done using `LDA_obj.get_results_rfc()`.", - "code": "LDA_obj.get_results_rfc(adata,use_rep='scaled|original|X_pca',\n LDA_threshold=0.4,num_topics=13)" - }, - { - "action": "Generate UMAP embeddings and visualize the refined LDA clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['LDA_cluster_rfc','LDA_cluster_clf'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['LDA_cluster_rfc','LDA_cluster_clf'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Convert the sparse matrix `adata.X` to a dense array using `adata.X.toarray()`.", - "code": "adata.X.toarray()" - }, - { - "action": "Initialize and run cNMF analysis. This includes initializing the `cnmf_obj` with specified parameters, factorizing the data, combining results, and generating a k-selection plot.", - "code": "import numpy as np\n## Initialize the cnmf object that will be used to run analyses\ncnmf_obj = ov.single.cNMF(adata,components=np.arange(5,11), n_iter=20, seed=14, num_highvar_genes=2000,\n output_dir='example_dg1/cNMF', name='dg_cNMF')\n## Specify that the jobs are being distributed over a single worker (total_workers=1) and then launch that worker\ncnmf_obj.factorize(worker_i=0, total_workers=4)\ncnmf_obj.combine(skip_missing_files=True)\ncnmf_obj.k_selection_plot(close_fig=False)" - }, - { - "action": "Perform consensus clustering with a selected number of components (K=7) and a density threshold of 2.00. 
The results are then loaded and used to annotate the `adata` object.", - "code": "selected_K = 7\ndensity_threshold = 2.00\ncnmf_obj.consensus(k=selected_K, \n density_threshold=density_threshold, \n show_clustering=True, \n close_clustergram_fig=False)\nresult_dict = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)\ncnmf_obj.get_results(adata,result_dict)" - }, - { - "action": "Generate UMAP embeddings and visualize the cNMF usage scores using `ov.pl.embedding()`. The `basis` is set to 'X_umap', `color` is set to the columns of `result_dict['usage_norm']`, `use_raw` is set to False, `ncols` is set to 3, `vmin` is set to 0, `vmax` is set to 1, and `frameon` is set to 'small'.", - "code": "ov.pl.embedding(adata, basis='X_umap',color=result_dict['usage_norm'].columns,\n use_raw=False, ncols=3, vmin=0, vmax=1,frameon='small')" - }, - { - "action": "Refine the cNMF clustering results using a random forest classifier, similar to the LDA refinement. Cells with cNMF usage greater than 0.5 are used as a primitive class, and the random forest model is trained on these cells to classify cells with cNMF usage less than 0.5. This is done using `cnmf_obj.get_results_rfc()`.", - "code": "cnmf_obj.get_results_rfc(adata,result_dict,\n use_rep='scaled|original|X_pca',\n cNMF_threshold=0.5)" - }, - { - "action": "Generate UMAP embeddings and visualize the refined cNMF clustering results using `ov.pl.embedding()`. 
The `basis` is set to 'X_umap', `color` is set to ['cNMF_cluster_rfc','cNMF_cluster_clf'], `frameon` is set to 'small', and other plotting parameters are specified.", - "code": "ov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['cNMF_cluster_rfc','cNMF_cluster_clf'],\n frameon='small',\n #title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Calculate the Adjusted Rand Index (ARI) for different clustering methods (Leiden, Louvain, GMM, LDA, LDA_rfc, LDA_clf, cNMF_rfc, cNMF_clf) compared to the 'clusters' annotation in `adata.obs`. The ARI values are printed for each method.", - "code": "from sklearn.metrics.cluster import adjusted_rand_score\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['leiden'])\nprint('Leiden, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['louvain'])\nprint('Louvain, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['gmm_cluster'])\nprint('GMM, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster'])\nprint('LDA, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster_rfc'])\nprint('LDA_rfc, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster_clf'])\nprint('LDA_clf, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['cNMF_cluster_rfc'])\nprint('cNMF_rfc, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['cNMF_cluster_clf'])\nprint('cNMF_clf, Adjusted rand index = %.2f' %ARI)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cluster_space.json 
b/rag_engine/ovrawmjson/t_cluster_space.json deleted file mode 100644 index 0377be68..00000000 --- a/rag_engine/ovrawmjson/t_cluster_space.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from the specified path and count file. Make variable names unique.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics. Filter out genes with total counts less than 100. Identify spatially variable genes (SVGs) using the `prost` method and other specified parameters.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)" - }, - { - "action": "Write the processed AnnData object to an H5AD file with gzip compression.", - "code": "adata.write('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "Read the processed AnnData object from the H5AD file with gzip decompression.", - "code": "adata=ov.read('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "(Optional) Read ground truth annotations from a TSV file and add them to the AnnData object's observation metadata. 
Visualize the spatial distribution of the ground truth annotations.", - "code": "import pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0)\nAnn_df.columns = ['Ground Truth']\nadata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])" - }, - { - "action": "Define parameters for the GraphST clustering method, including device, number of principal components. Apply GraphST clustering to the AnnData object using specified parameters and log-normalization.", - "code": "methods_kwargs={}\nmethods_kwargs['GraphST']={ \n 'device':'cuda:0',\n 'n_pcs':30\n}\n\nadata=ov.space.clusters(adata,\n methods=['GraphST'],\n methods_kwargs=methods_kwargs,\n lognorm=1e4)" - }, - { - "action": "Perform mclust clustering on the GraphST representation, refine the labels, and convert the refined labels to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust',n_components=10,\n modelNames='EEV', random_state=112,\n )\nadata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust') \nadata.obs['mclust_GraphST']=adata.obs['mclust_GraphST'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclust_GraphST' labels using a specified threshold and visualize the merging process.", - "code": "res=ov.space.merge_cluster(adata,groupby='mclust_GraphST',use_rep='graphst|original|X_pca',\n threshold=0.2,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_GraphST', 'mclust_GraphST_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_GraphST','mclust_GraphST_tree','mclust','Ground Truth'])" - }, - { - "action": "Perform mclust_R clustering on the GraphST representation, refine the labels, convert them to categorical type, and merge clusters based on the refined labels.", - "code": 
"ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust_R',n_components=10,\n random_state=42,\n )\nadata.obs['mclust_R_GraphST'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') \nadata.obs['mclust_R_GraphST']=adata.obs['mclust_R_GraphST'].astype('category')\nres=ov.space.merge_cluster(adata,groupby='mclust_R_GraphST',use_rep='graphst|original|X_pca',\n threshold=0.2,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_R_GraphST', 'mclust_R_GraphST_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_R_GraphST','mclust_R_GraphST_tree','mclust','Ground Truth'])" - }, - { - "action": "Define parameters for the BINARY clustering method. Apply BINARY clustering to the AnnData object using specified parameters.", - "code": "methods_kwargs={}\nmethods_kwargs['BINARY']={ \n 'use_method':'KNN',\n 'cutoff':6,\n 'obs_key':'BINARY_sample',\n 'use_list':None,\n 'pos_weight':10,\n 'device':'cuda:0',\n 'hidden_dims':[512, 30],\n 'n_epochs': 1000,\n 'lr': 0.001,\n 'key_added': 'BINARY',\n 'gradient_clipping': 5,\n 'weight_decay': 0.0001,\n 'verbose': True,\n 'random_seed':0,\n 'lognorm':1e4,\n 'n_top_genes':2000,\n}\nadata=ov.space.clusters(adata,\n methods=['BINARY'],\n methods_kwargs=methods_kwargs)" - }, - { - "action": "Perform mclust_R clustering on the BINARY representation, refine the labels, and convert them to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='BINARY',method='mclust_R',n_components=10,\n random_state=42,\n )\nadata.obs['mclust_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') \nadata.obs['mclust_BINARY']=adata.obs['mclust_BINARY'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclust_BINARY' labels using a specified threshold and visualize the merging process.", - "code": "res=ov.space.merge_cluster(adata,groupby='mclust_BINARY',use_rep='BINARY',\n threshold=0.01,plot=True)" - }, - { - "action": "Visualize 
the spatial distribution of the 'mclust_BINARY', 'mclust_BINARY_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_BINARY','mclust_BINARY_tree','mclust','Ground Truth'])" - }, - { - "action": "Perform mclust clustering on the BINARY representation using Python's implementation, refine the labels, and convert them to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='BINARY',method='mclust',n_components=10,\n modelNames='EEV', random_state=42,\n )\nadata.obs['mclustpy_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust') \nadata.obs['mclustpy_BINARY']=adata.obs['mclustpy_BINARY'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclustpy_BINARY' labels using a specified threshold and visualize the merging process.", - "code": "adata.obs['mclustpy_BINARY']=adata.obs['mclustpy_BINARY'].astype('category')\nres=ov.space.merge_cluster(adata,groupby='mclustpy_BINARY',use_rep='BINARY',\n threshold=0.013,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclustpy_BINARY', 'mclustpy_BINARY_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclustpy_BINARY','mclustpy_BINARY_tree','mclust','Ground Truth'])" - }, - { - "action": "Define parameters for the STAGATE clustering method. 
Apply STAGATE clustering to the AnnData object using specified parameters.", - "code": "methods_kwargs={}\nmethods_kwargs['STAGATE']={ \n 'num_batch_x':3,'num_batch_y':2,\n 'spatial_key':['X','Y'],'rad_cutoff':200,\n 'num_epoch':1000,'lr':0.001,\n 'weight_decay':1e-4,'hidden_dims':[512, 30],\n 'device':'cuda:0',\n #'n_top_genes':2000,\n}\n\nadata=ov.space.clusters(adata,\n methods=['STAGATE'],\n methods_kwargs=methods_kwargs)" - }, - { - "action": "Perform mclust_R clustering on the STAGATE representation, refine the labels, convert them to categorical type, and merge clusters based on the refined labels.", - "code": "ov.utils.cluster(adata,use_rep='STAGATE',method='mclust_R',n_components=10,\n random_state=112,\n )\nadata.obs['mclust_R_STAGATE'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') \nadata.obs['mclust_R_STAGATE']=adata.obs['mclust_R_STAGATE'].astype('category')\nres=ov.space.merge_cluster(adata,groupby='mclust_R_STAGATE',use_rep='STAGATE',\n threshold=0.005,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_R_STAGATE', 'mclust_R_STAGATE_tree', 'mclust_R', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_R_STAGATE','mclust_R_STAGATE_tree','mclust_R','Ground Truth'])" - }, - { - "action": "Sort genes by their 'PI' values in descending order and display the top 5 genes.", - "code": "adata.var.sort_values('PI',ascending=False).head(5)" - }, - { - "action": "Visualize the spatial expression of a specific gene ('MBP') in both raw and STAGATE-denoised data.", - "code": "plot_gene = 'MBP'\nimport matplotlib.pyplot as plt\nfig, axs = plt.subplots(1, 2, figsize=(8, 4))\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[0], title='RAW_'+plot_gene, vmax='p99')\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[1], title='STAGATE_'+plot_gene, layer='STAGATE_ReX', vmax='p99')" - }, - { - "action": "Define parameters for the CAST clustering method. 
Apply CAST clustering to the AnnData object using specified parameters.", - "code": "methods_kwargs={}\nmethods_kwargs['CAST']={ \n 'output_path_t':'result/CAST_gas/output',\n 'device':'cuda:0',\n 'gpu_t':0\n}\nadata=ov.space.clusters(adata,\n methods=['CAST'],\n methods_kwargs=methods_kwargs)" - }, - { - "action": "Perform mclust clustering on the CAST representation, refine the labels, and convert them to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='X_cast',method='mclust',n_components=10,\n modelNames='EEV', random_state=42,\n )\nadata.obs['mclust_CAST'] = ov.utils.refine_label(adata, radius=50, key='mclust') \nadata.obs['mclust_CAST']=adata.obs['mclust_CAST'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclust_CAST' labels using a specified threshold and visualize the merging process.", - "code": "res=ov.space.merge_cluster(adata,groupby='mclust_CAST',use_rep='X_cast',\n threshold=0.1,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_CAST', 'mclust_CAST_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_CAST','mclust_CAST_tree','mclust','Ground Truth'])" - }, - { - "action": "Display the AnnData object.", - "code": "adata" - }, - { - "action": "Calculate and print the Adjusted Rand Index (ARI) for each clustering method compared to the ground truth.", - "code": "from sklearn.metrics.cluster import adjusted_rand_score\n\nobs_df = adata.obs.dropna()\n#GraphST\nARI = adjusted_rand_score(obs_df['mclust_GraphST'], obs_df['Ground Truth'])\nprint('mclust_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_R_GraphST'], obs_df['Ground Truth'])\nprint('mclust_R_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_R_STAGATE'], obs_df['Ground Truth'])\nprint('mclust_STAGATE: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_BINARY'], obs_df['Ground 
Truth'])\nprint('mclust_BINARY: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclustpy_BINARY'], obs_df['Ground Truth'])\nprint('mclustpy_BINARY: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_CAST'], obs_df['Ground Truth'])\nprint('mclust_CAST: Adjusted rand index = %.2f' %ARI)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cnmf.json b/rag_engine/ovrawmjson/t_cnmf.json deleted file mode 100644 index 1384f0da..00000000 --- a/rag_engine/ovrawmjson/t_cnmf.json +++ /dev/null @@ -1,110 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy, omicverse, and scvelo. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()\nimport scvelo as scv" - }, - { - "action": "Load the dentategyrus dataset using scvelo.", - "code": "adata=scv.datasets.dentategyrus()" - }, - { - "action": "Preprocess the AnnData object using omicverse. The preprocessing steps include shiftlog normalization, Pearson residual scaling, and selecting the top 2000 highly variable genes.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Scale the data and perform Principal Component Analysis (PCA) on the preprocessed AnnData object.", - "code": "ov.pp.scale(adata)\nov.pp.pca(adata)" - }, - { - "action": "Plot a UMAP embedding of the cells, colored by their cluster assignments.", - "code": "import matplotlib.pyplot as plt\nfrom matplotlib import patheffects\nfig, ax = plt.subplots(figsize=(4,4))\nov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['clusters'],\n frameon='small',\n title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n ax=ax,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Initialize 
a cNMF object with specified parameters, including the range of components (K values) to explore, number of iterations, random seed, number of highly variable genes, output directory, and name.", - "code": "import numpy as np\n## Initialize the cnmf object that will be used to run analyses\ncnmf_obj = ov.single.cNMF(adata,components=np.arange(5,11), n_iter=20, seed=14, num_highvar_genes=2000,\n output_dir='example_dg/cNMF', name='dg_cNMF')" - }, - { - "action": "Run the cNMF factorization on the specified worker.", - "code": "## Specify that the jobs are being distributed over a single worker (total_workers=1) and then launch that worker\ncnmf_obj.factorize(worker_i=0, total_workers=2)" - }, - { - "action": "Combine the results from different workers, skipping missing files.", - "code": "cnmf_obj.combine(skip_missing_files=True)" - }, - { - "action": "Generate a K selection plot to visualize the stability and error at each choice of K.", - "code": "cnmf_obj.k_selection_plot(close_fig=False)" - }, - { - "action": "Set the selected K value and density threshold for consensus clustering.", - "code": "selected_K = 7\ndensity_threshold = 2.00" - }, - { - "action": "Perform consensus clustering with the specified K value and density threshold, and visualize the clustering results.", - "code": "cnmf_obj.consensus(k=selected_K, \n density_threshold=density_threshold, \n show_clustering=True, \n close_clustergram_fig=False)" - }, - { - "action": "Update the density threshold based on the initial consensus clustering results.", - "code": "density_threshold = 0.10" - }, - { - "action": "Perform consensus clustering again with the updated density threshold.", - "code": "cnmf_obj.consensus(k=selected_K, \n density_threshold=density_threshold, \n show_clustering=True, \n close_clustergram_fig=False)" - }, - { - "action": "Visualize the distance matrix of the consensus spectra using a heatmap.", - "code": "import seaborn as sns\nimport matplotlib.pyplot as plt\nfrom matplotlib 
import patheffects\n\nfrom matplotlib import gridspec\nimport matplotlib.pyplot as plt\n\nwidth_ratios = [0.2, 4, 0.5, 10, 1]\nheight_ratios = [0.2, 4]\nfig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios)))\ngs = gridspec.GridSpec(len(height_ratios), len(width_ratios), fig,\n 0.01, 0.01, 0.98, 0.98,\n height_ratios=height_ratios,\n width_ratios=width_ratios,\n wspace=0, hspace=0)\n \nD = cnmf_obj.topic_dist[cnmf_obj.spectra_order, :][:, cnmf_obj.spectra_order]\ndist_ax = fig.add_subplot(gs[1,1], xscale='linear', yscale='linear',\n xticks=[], yticks=[],xlabel='', ylabel='',\n frameon=True)\ndist_im = dist_ax.imshow(D, interpolation='none', cmap='viridis',\n aspect='auto', rasterized=True)\n\nleft_ax = fig.add_subplot(gs[1,0], xscale='linear', yscale='linear', xticks=[], yticks=[],\n xlabel='', ylabel='', frameon=True)\nleft_ax.imshow(cnmf_obj.kmeans_cluster_labels.values[cnmf_obj.spectra_order].reshape(-1, 1),\n interpolation='none', cmap='Spectral', aspect='auto',\n rasterized=True)\n\ntop_ax = fig.add_subplot(gs[0,1], xscale='linear', yscale='linear', xticks=[], yticks=[],\n xlabel='', ylabel='', frameon=True)\ntop_ax.imshow(cnmf_obj.kmeans_cluster_labels.values[cnmf_obj.spectra_order].reshape(1, -1),\n interpolation='none', cmap='Spectral', aspect='auto',\n rasterized=True)\n\ncbar_gs = gridspec.GridSpecFromSubplotSpec(3, 3, subplot_spec=gs[1, 2],\n wspace=0, hspace=0)\ncbar_ax = fig.add_subplot(cbar_gs[1,2], xscale='linear', yscale='linear',\n xlabel='', ylabel='', frameon=True, title='Euclidean\\nDistance')\ncbar_ax.set_title('Euclidean\\nDistance',fontsize=12)\nvmin = D.min().min()\nvmax = D.max().max()\nfig.colorbar(dist_im, cax=cbar_ax,\n ticks=np.linspace(vmin, vmax, 3),\n )\ncbar_ax.set_yticklabels(cbar_ax.get_yticklabels(),fontsize=12)\n" - }, - { - "action": "Plot a histogram of the local density values and indicate the filtering threshold.", - "code": "density_filter = cnmf_obj.local_density.iloc[:, 0] < density_threshold\nfig, hist_ax = 
plt.subplots(figsize=(4,4))\n\n#hist_ax = fig.add_subplot(hist_gs[0,0], xscale='linear', yscale='linear',\n # xlabel='', ylabel='', frameon=True, title='Local density histogram')\nhist_ax.hist(cnmf_obj.local_density.values, bins=np.linspace(0, 1, 50))\nhist_ax.yaxis.tick_right()\n\nxlim = hist_ax.get_xlim()\nylim = hist_ax.get_ylim()\nif density_threshold < xlim[1]:\n hist_ax.axvline(density_threshold, linestyle='--', color='k')\n hist_ax.text(density_threshold + 0.02, ylim[1] * 0.95, 'filtering\\nthreshold\\n\\n', va='top')\nhist_ax.set_xlim(xlim)\nhist_ax.set_xlabel('Mean distance to k nearest neighbors\\n\\n%d/%d (%.0f%%) spectra above threshold\\nwere removed prior to clustering'%(sum(~density_filter), len(density_filter), 100*(~density_filter).mean()))\nhist_ax.set_title('Local density histogram')" - }, - { - "action": "Load the cNMF results for the selected K value and density threshold.", - "code": "result_dict = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)" - }, - { - "action": "Display the head of the normalized usage matrix.", - "code": "result_dict['usage_norm'].head()" - }, - { - "action": "Display the head of the GEP scores matrix.", - "code": "result_dict['gep_scores'].head()" - }, - { - "action": "Display the head of the GEP TPM matrix.", - "code": "result_dict['gep_tpm'].head()" - }, - { - "action": "Display the head of the top genes matrix.", - "code": "result_dict['top_genes'].head()" - }, - { - "action": "Assign cNMF cluster labels to cells in the AnnData object based on the loaded results.", - "code": "cnmf_obj.get_results(adata,result_dict)" - }, - { - "action": "Plot UMAP embeddings of the cells, colored by the cNMF usage values for each program.", - "code": "ov.pl.embedding(adata, basis='X_umap',color=result_dict['usage_norm'].columns,\n use_raw=False, ncols=3, vmin=0, vmax=1,frameon='small')" - }, - { - "action": "Plot a UMAP embedding of the cells, colored by their assigned cNMF cluster labels.", - "code": 
"ov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['cNMF_cluster'],\n frameon='small',\n #title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Assign cNMF cluster labels using a random forest classifier (RFC) based on the usage values and a specified threshold.", - "code": "cnmf_obj.get_results_rfc(adata,result_dict,\n use_rep='scaled|original|X_pca',\n cNMF_threshold=0.5)" - }, - { - "action": "Plot UMAP embeddings of the cells, colored by their assigned cNMF cluster labels from both the direct assignment and the RFC-based assignment.", - "code": "ov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['cNMF_cluster_rfc','cNMF_cluster_clf'],\n frameon='small',\n #title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Create a list of top genes for plotting.", - "code": "plot_genes=[]\nfor i in result_dict['top_genes'].columns:\n plot_genes+=result_dict['top_genes'][i][:3].values.reshape(-1).tolist()" - }, - { - "action": "Generate a dot plot of the top genes, grouped by cNMF cluster.", - "code": "sc.pl.dotplot(adata,plot_genes,\n \"cNMF_cluster\", dendrogram=False,standard_scale='var',)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_commot_flowsig.json b/rag_engine/ovrawmjson/t_commot_flowsig.json deleted file mode 100644 index 242a5220..00000000 --- a/rag_engine/ovrawmjson/t_commot_flowsig.json +++ /dev/null @@ -1,110 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. 
Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\n#print(f\"omicverse version: {ov.__version__}\")\nimport scanpy as sc\n#print(f\"scanpy version: {sc.__version__}\")\nov.plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from the specified path and count file, and make variable names unique.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics and filter genes with total counts less than 100. Then, identify spatially variable genes using the `ov.space.svg` function with the 'prost' method.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)" - }, - { - "action": "Write the processed AnnData object to a compressed H5AD file.", - "code": "adata.write('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "Load ligand-receptor database from CellChat for secreted signaling in humans using `ov.externel.commot.pp.ligand_receptor_database`.", - "code": "df_cellchat = ov.externel.commot.pp.ligand_receptor_database(species='human', \n signaling_type='Secreted Signaling', \n database='CellChat')\nprint(df_cellchat.shape)" - }, - { - "action": "Filter the ligand-receptor database to include only pairs where both ligand and receptor are expressed in at least 5% of the spots using `ov.externel.commot.pp.filter_lr_database`.", - "code": "df_cellchat_filtered = ov.externel.commot.pp.filter_lr_database(df_cellchat, \n adata, \n min_cell_pct=0.05)\nprint(df_cellchat_filtered.shape)" - }, - { - "action": "Perform spatial communication inference using `ov.externel.commot.tl.spatial_communication` with specified parameters, including distance threshold and handling of heteromeric complexes.", - "code": 
"ov.externel.commot.tl.spatial_communication(adata,\n database_name='cellchat', \n df_ligrec=df_cellchat_filtered, \n dis_thr=500, heteromeric=True, \n pathway_sum=True)" - }, - { - "action": "Read ground truth annotations from a file and add them to the AnnData object. Visualize the spatial distribution of ground truth annotations using `sc.pl.spatial`.", - "code": "# read the annotation\nimport pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\\t', header=None, index_col=0)\nAnn_df.columns = ['Ground_Truth']\nadata.obs['Ground_Truth'] = Ann_df.loc[adata.obs_names, 'Ground_Truth']\nLayer_color=['#283b5c', '#d8e17b', '#838e44', '#4e8991', '#d08c35', '#511a3a',\n '#c2c2c2', '#dfc648']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground_Truth\"],palette=Layer_color)" - }, - { - "action": "Create a dictionary mapping ground truth categories to their corresponding colors.", - "code": "ct_color_dict=dict(zip(adata.obs['Ground_Truth'].cat.categories,\n adata.uns['Ground_Truth_colors']))" - }, - { - "action": "Display the head of the ligand-receptor dataframe from the CellChat database information.", - "code": "adata.uns['commot-cellchat-info']['df_ligrec'].head()" - }, - { - "action": "Determine the spatial direction of the FGF signaling pathway using `ov.externel.commot.tl.communication_direction`. 
Visualize the cell communication for the FGF pathway using `ov.externel.commot.pl.plot_cell_communication` with specified parameters.", - "code": "import matplotlib.pyplot as plt\nscale=0.000008\nk=5\ngoal_pathway='FGF'\nov.externel.commot.tl.communication_direction(adata, database_name='cellchat', pathway_name=goal_pathway, k=k)\nov.externel.commot.pl.plot_cell_communication(adata, database_name='cellchat', \n pathway_name='FGF', plot_method='grid', \n background_legend=True,\n scale=scale, ndsize=8, grid_density=0.4, \n summary='sender', background='cluster', \n clustering='Ground_Truth', \n cluster_cmap=ct_color_dict,\n cmap='Alphabet',\n normalize_v = True, normalize_v_quantile=0.995)\nplt.title(f'Pathway:{goal_pathway}',fontsize=13)\n#plt.savefig('figures/TLE/TLE_cellchat_all_FGF.png',dpi=300,bbox_inches='tight')\n#fig.savefig('pdf/TLE/control_cellchat_all_FGF.pdf',dpi=300,bbox_inches='tight')" - }, - { - "action": "Write the AnnData object with COMMOT results to a compressed H5AD file.", - "code": "adata.write('data/151676_commot.h5ad',compression='gzip')" - }, - { - "action": "Read the AnnData object with COMMOT results from the H5AD file.", - "code": "adata=ov.read('data/151676_commot.h5ad')" - }, - { - "action": "Copy the normalized data to a new layer in the AnnData object.", - "code": "adata.layers['normalized'] = adata.X.copy()" - }, - { - "action": "Construct gene expression modules (GEMs) using non-negative matrix factorization (NMF) with `ov.externel.flowsig.pp.construct_gems_using_nmf`.", - "code": "# We construct 10 gene expression modules using the raw cell count.\nov.externel.flowsig.pp.construct_gems_using_nmf(adata,\n n_gems = 10,\n layer_key = 'counts',\n )" - }, - { - "action": "Retrieve the top genes for a specific GEM using `ov.externel.flowsig.ul.get_top_gem_genes`.", - "code": "goal_gem='GEM-5'\ngem_gene=ov.externel.flowsig.ul.get_top_gem_genes(adata=adata,\n gems=[goal_gem],\n n_genes=100,\n gene_type='all',\n method = 'nmf',\n 
)\ngem_gene.head()" - }, - { - "action": "Construct flow expression matrices using `ov.externel.flowsig.pp.construct_flows_from_commot` with specified parameters.", - "code": "commot_output_key = 'commot-cellchat'\n# We first construct the potential cellular flows from the commot output\nov.externel.flowsig.pp.construct_flows_from_commot(adata,\n commot_output_key,\n gem_expr_key = 'X_gem',\n scale_gem_expr = True,\n flowsig_network_key = 'flowsig_network',\n flowsig_expr_key = 'X_flow')" - }, - { - "action": "Determine informative variables for spatial data using `ov.externel.flowsig.pp.determine_informative_variables` with a Moran's I threshold.", - "code": "# Then we subset for \"spatially flowing\" inflows and outflows\nov.externel.flowsig.pp.determine_informative_variables(adata, \n flowsig_expr_key = 'X_flow',\n flowsig_network_key = 'flowsig_network',\n spatial = True,\n moran_threshold = 0.15,\n coord_type = 'grid',\n n_neighbours = 8,\n library_key = None)" - }, - { - "action": "Perform k-means clustering on spatial coordinates and add the cluster labels to the AnnData object.", - "code": "from sklearn.cluster import KMeans\nimport pandas as pd\n\nkmeans = KMeans(n_clusters=10, random_state=0).fit(adata.obsm['spatial'])\nadata.obs['spatial_kmeans'] = pd.Series(kmeans.labels_, dtype='category').values" - }, - { - "action": "Learn intercellular flows using spatial block bootstrapping with `ov.externel.flowsig.tl.learn_intercellular_flows`.", - "code": "# # Now we are ready to learn the network\nov.externel.flowsig.tl.learn_intercellular_flows(adata,\n flowsig_key = 'flowsig_network',\n flow_expr_key = 'X_flow',\n use_spatial = True,\n block_key = 'spatial_kmeans',\n n_jobs = 4,\n n_bootstraps = 500)" - }, - { - "action": "Apply biological flow constraints to the network using `ov.externel.flowsig.tl.apply_biological_flow`.", - "code": "# This part is key for reducing false positives\nov.externel.flowsig.tl.apply_biological_flow(adata,\n flowsig_network_key = 
'flowsig_network',\n adjacency_key = 'adjacency',\n validated_key = 'validated')" - }, - { - "action": "Filter low-confidence edges based on bootstrapped frequencies using `ov.externel.flowsig.tl.filter_low_confidence_edges`.", - "code": "edge_threshold = 0.7\n\nov.externel.flowsig.tl.filter_low_confidence_edges(adata,\n edge_threshold = edge_threshold,\n flowsig_network_key = 'flowsig_network',\n adjacency_key = 'adjacency_validated',\n filtered_key = 'filtered')" - }, - { - "action": "Write the AnnData object with COMMOT and flowsig results to a compressed H5AD file.", - "code": "adata.write('data/cortex_commot_flowsig.h5ad',compression='gzip')" - }, - { - "action": "Construct the directed NetworkX DiGraph object from the filtered adjacency matrix using `ov.externel.flowsig.tl.construct_intercellular_flow_network`.", - "code": "flow_network = ov.externel.flowsig.tl.construct_intercellular_flow_network(adata,\n flowsig_network_key = 'flowsig_network',\n adjacency_key = 'adjacency_validated_filtered')" - }, - { - "action": "Create a subset of the AnnData object containing only GEM expression data and corresponding metadata.", - "code": "flowsig_expr_key='X_gem'\nX_flow = adata.obsm[flowsig_expr_key]\nadata_subset = sc.AnnData(X=X_flow)\nadata_subset.obs = adata.obs\nadata_subset.var.index =[f'GEM-{i}' for i in range(1,len(adata_subset.var)+1)]" - }, - { - "action": "Visualize the expression of GEMs in different cell types using a dotplot with `sc.pl.dotplot`.", - "code": "import matplotlib.pyplot as plt\nax=sc.pl.dotplot(adata_subset, adata_subset.var.index, groupby='Ground_Truth', \n dendrogram=True,standard_scale='var',cmap='Reds',show=False)\ncolor_dict=dict(zip(adata.obs['Ground_Truth'].cat.categories,adata.uns['Ground_Truth_colors']))" - }, - { - "action": "Visualize the flowsig network using `ov.pl.plot_flowsig_network` with specified parameters for node shapes, curve arguments, and axis limits.", - "code": 
"ov.pl.plot_flowsig_network(flow_network=flow_network,\n gem_plot=['GEM-2','GEM-7','GEM-1','GEM-3','GEM-4','GEM-5'],\n figsize=(8,4),\n curve_awarg={'eps':2},\n node_shape={'GEM':'^','Sender':'o','Receptor':'o'},\n ylim=(-0.5,0.5),xlim=(-3,3))" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cytotrace.json b/rag_engine/ovrawmjson/t_cytotrace.json deleted file mode 100644 index 69910109..00000000 --- a/rag_engine/ovrawmjson/t_cytotrace.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters.", - "code": "import omicverse as ov\nov.plot_set()" - }, - { - "action": "Import the scvelo library and load the dentategyrus dataset into an AnnData object.", - "code": "import scvelo as scv\nadata=scv.datasets.dentategyrus()\nadata" - }, - { - "action": "Preprocess the AnnData object using the `ov.pp.preprocess` function with specified parameters, including mode, number of highly variable genes (n_HVGs), and timing the execution.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Predict CytoTRACE 2 scores using the `ov.single.cytotrace2` function with specified parameters, including the path to the pre-trained model directory, species, batch sizes, parallelization settings, maximum number of principal components, random seed, and output directory.", - "code": "results = ov.single.cytotrace2(adata,\n use_model_dir=\"cymodels/5_models_weights\",\n species=\"mouse\",\n batch_size = 10000,\n smooth_batch_size = 1000,\n disable_parallelization = False,\n max_cores = None,\n max_pcs = 200,\n seed = 14,\n output_dir = 'cytotrace2_results'\n)" - }, - { - "action": "Visualize the UMAP embeddings of the AnnData object, colored by cell clusters and CytoTRACE2 scores, with specified parameters for frame, colormap, and whitespace.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n 
color=['clusters','CytoTRACE2_Score'],\n frameon='small',cmap='Reds',wspace=0.55)" - }, - { - "action": "Visualize the UMAP embeddings of the AnnData object, colored by CytoTRACE2 potency and relative order, with specified parameters for frame, colormap, and whitespace.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['CytoTRACE2_Potency','CytoTRACE2_Relative'],\n frameon='small',cmap='Reds',wspace=0.55)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_deg.json b/rag_engine/ovrawmjson/t_deg.json deleted file mode 100644 index ac2f5dfd..00000000 --- a/rag_engine/ovrawmjson/t_deg.json +++ /dev/null @@ -1,82 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and matplotlib.pyplot. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport matplotlib.pyplot as plt\n\nov.plot_set()" - }, - { - "action": "Download gene ID annotation pair using `ov.utils.download_geneid_annotation_pair()`. This is necessary for converting gene IDs.", - "code": "ov.utils.download_geneid_annotation_pair()" - }, - { - "action": "Read the data from a file named 'counts.txt' (or from a URL, commented out). The data is assumed to be a tab-separated file with the first column as index and the second row as header. 
The `.bam` suffix is removed from column names.", - "code": "#data=pd.read_csv('https://raw.githubusercontent.com/Starlitnightly/omicverse/master/sample/counts.txt',index_col=0,sep='\\t',header=1)\ndata=ov.read('data/counts.txt',index_col=0,header=1)\n#replace the columns `.bam` to `` \ndata.columns=[i.split('/')[-1].replace('.bam','') for i in data.columns]\ndata.head()" - }, - { - "action": "Perform gene ID mapping on the data using the downloaded annotation pair file for 'GRCm39'.", - "code": "data=ov.bulk.Matrix_ID_mapping(data,'genesets/pair_GRCm39.tsv')\ndata.head()" - }, - { - "action": "Initialize a pyDEG object for differential expression analysis using the `omicverse` library.", - "code": "dds=ov.bulk.pyDEG(data)" - }, - { - "action": "Drop duplicate indices in the pyDEG object, keeping only the highest expressed genes.", - "code": "dds.drop_duplicates_index()\nprint('... drop_duplicates_index success')" - }, - { - "action": "Normalize the data using the `estimateSizeFactors` method from DEseq2, likely to remove batch effects.", - "code": "dds.normalize()\nprint('... estimateSizeFactors and normalize success')" - }, - { - "action": "Perform differential expression gene analysis using the t-test method. The treatment groups are '4-3' and '4-4', and the control groups are '1--1' and '1--2'.", - "code": "treatment_groups=['4-3','4-4']\ncontrol_groups=['1--1','1--2']\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')\nresult.head()" - }, - { - "action": "Filter out genes with low expression (log2(BaseMean) <= 1).", - "code": "print(result.shape)\nresult=result.loc[result['log2(BaseMean)']>1]\nprint(result.shape)" - }, - { - "action": "Set the threshold for fold change. The threshold is calculated automatically (-1) based on the log2FC distribution. 
The p-value threshold is set to 0.05, and the maximum log p-value is set to 6.", - "code": "# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=6)" - }, - { - "action": "Plot a volcano plot to visualize the results of the differential expression analysis. The plot includes the top 8 differentially expressed genes and sets the font size for gene labels to 12.", - "code": "dds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot a boxplot for the genes 'Ckap2' and 'Lef1' to visualize their expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2','Lef1'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Plot a boxplot for the gene 'Ckap2' to visualize its expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Download pathway database using `ov.utils.download_pathway_database()`.", - "code": "ov.utils.download_pathway_database()" - }, - { - "action": "Prepare a pathway dictionary from the 'WikiPathways_2019_Mouse.txt' file for mouse.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt',organism='Mouse')" - }, - { - "action": "Perform gene set enrichment analysis using the prepared pathway dictionary. The `pvalue_type` is set to 'auto' to automatically determine whether to use adjusted or raw p-values. 
The organism is set to 'mouse'.", - "code": "deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()\nenr=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot the gene set enrichment results using a custom plot function `geneset_plot`.", - "code": "ov.bulk.geneset_plot(enr,figsize=(2,5),fig_title='Wiki Pathway enrichment',\n cax_loc=[2, 0.45, 0.5, 0.02],\n bbox_to_anchor_used=(-0.25, -13),node_diameter=10,\n custom_ticks=[5,7],text_knock=3,\n cmap='Reds')" - }, - { - "action": "Prepare pathway dictionaries for GO Biological Process, GO Molecular Function, and GO Cellular Component for mouse and perform gene set enrichment analysis for each.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Mouse')\nenr_go_bp=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Molecular_Function_2023.txt',organism='Mouse')\nenr_go_mf=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Cellular_Component_2023.txt',organism='Mouse')\nenr_go_cc=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot multiple gene set enrichment results together using `geneset_plot_multi`.", - "code": "enr_dict={'BP':enr_go_bp,\n 'MF':enr_go_mf,\n 'CC':enr_go_cc}\ncolors_dict={\n 'BP':ov.pl.red_color[1],\n 'MF':ov.pl.green_color[1],\n 'CC':ov.pl.blue_color[1],\n}\n \nov.bulk.geneset_plot_multi(enr_dict,colors_dict,num=3,\n figsize=(2,5),\n text_knock=3,fontsize=8,\n cmap='Reds'\n )" - }, - { - "action": "Define a function `geneset_plot_multi` to plot multiple gene set enrichment results.", - "code": "def 
geneset_plot_multi(enr_dict,colors_dict,num:int=5,fontsize=10,\n fig_title:str='',fig_xlabel:str='Fractions of genes',\n figsize:tuple=(2,4),cmap:str='YlGnBu',\n text_knock:int=5,text_maxsize:int=20,ax=None,\n ):\n from PyComplexHeatmap import HeatmapAnnotation,DotClustermapPlotter,anno_label,anno_simple,AnnotationBase\n for key in enr_dict.keys():\n enr_dict[key]['Type']=key\n enr_all=pd.concat([enr_dict[i].iloc[:num] for i in enr_dict.keys()],axis=0)\n enr_all['Term']=[ov.utils.plot_text_set(i.split('(')[0],text_knock=text_knock,text_maxsize=text_maxsize) for i in enr_all.Term.tolist()]\n enr_all.index=enr_all.Term\n enr_all['Term1']=[i for i in enr_all.index.tolist()]\n del enr_all['Term']\n\n colors=colors_dict\n\n left_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Type, merge=True,rotation=0,colors=colors,relpos=(1,0.8)),\n Category=anno_simple(enr_all.Type,cmap='Set1',\n add_text=False,legend=False,colors=colors),\n axis=0,verbose=0,label_kws={'rotation':45,'horizontalalignment':'left','visible':False})\n right_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Term1, merge=True,rotation=0,relpos=(0,0.5),arrowprops=dict(visible=True),\n colors=enr_all.assign(color=enr_all.Type.map(colors)).set_index('Term1').color.to_dict(),\n fontsize=fontsize,luminance=0.8,height=2),\n axis=0,verbose=0,#label_kws={'rotation':45,'horizontalalignment':'left'},\n orientation='right')\n if ax==None:\n fig, ax = plt.subplots(figsize=figsize) \n else:\n ax=ax\n #plt.figure(figsize=figsize)\n cm = DotClustermapPlotter(data=enr_all, x='fraction',y='Term1',value='logp',c='logp',s='num',\n cmap=cmap,\n row_cluster=True,#col_cluster=True,#hue='Group',\n #cmap={'Group1':'Greens','Group2':'OrRd'},\n vmin=-1*np.log10(0.1),vmax=-1*np.log10(1e-10),\n #colors={'Group1':'yellowgreen','Group2':'orange'},\n #marker={'Group1':'*','Group2':'$\\ast$'},\n show_rownames=True,show_colnames=False,row_dendrogram=False,\n col_names_side='top',row_names_side='right',\n 
xticklabels_kws={'labelrotation': 30, 'labelcolor': 'blue','labelsize':fontsize},\n #yticklabels_kws={'labelsize':10},\n #top_annotation=col_ha,left_annotation=left_ha,right_annotation=right_ha,\n left_annotation=left_ha,right_annotation=right_ha,\n spines=False,\n row_split=enr_all.Type,# row_split_gap=1,\n #col_split=df_col.Group,col_split_gap=0.5,\n verbose=1,legend_gap=10,\n #dot_legend_marker='*',\n \n xlabel='Fractions of genes',xlabel_side=\"bottom\",\n xlabel_kws=dict(labelpad=8,fontweight='normal',fontsize=fontsize+2),\n # xlabel_bbox_kws=dict(facecolor=facecolor)\n )\n tesr=plt.gcf().axes\n for ax in plt.gcf().axes:\n if hasattr(ax, 'get_xlabel'):\n if ax.get_xlabel() == 'Fractions of genes': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.grid(False)\n if ax.get_ylabel() == 'logp': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.tick_params(labelsize=fontsize+2)\n cbar.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2)\n cbar.grid(False)\n return ax" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_deseq2.json b/rag_engine/ovrawmjson/t_deseq2.json deleted file mode 100644 index a180ed11..00000000 --- a/rag_engine/ovrawmjson/t_deseq2.json +++ /dev/null @@ -1,82 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and matplotlib.pyplot. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport matplotlib.pyplot as plt\n\nov.plot_set()" - }, - { - "action": "Download gene ID annotation pair using `ov.utils.download_geneid_annotation_pair()`. This is necessary for converting gene IDs.", - "code": "ov.utils.download_geneid_annotation_pair()" - }, - { - "action": "Read the data from a file named 'counts.txt' (or from a URL, commented out). The data is assumed to be a tab-separated file with the first column as index and the second row as header. 
The `.bam` suffix is removed from column names.", - "code": "#data=pd.read_csv('https://raw.githubusercontent.com/Starlitnightly/omicverse/master/sample/counts.txt',index_col=0,sep='\\t',header=1)\ndata=ov.read('data/counts.txt',index_col=0,header=1)\n#replace the columns `.bam` to `` \ndata.columns=[i.split('/')[-1].replace('.bam','') for i in data.columns]\ndata.head()" - }, - { - "action": "Perform gene ID mapping on the data using the downloaded annotation pair file for 'GRCm39'.", - "code": "data=ov.bulk.Matrix_ID_mapping(data,'genesets/pair_GRCm39.tsv')\ndata.head()" - }, - { - "action": "Initialize a pyDEG object for differential expression analysis using the `omicverse` library.", - "code": "dds=ov.bulk.pyDEG(data)" - }, - { - "action": "Drop duplicate indices in the pyDEG object, keeping only the highest expressed genes.", - "code": "dds.drop_duplicates_index()\nprint('... drop_duplicates_index success')" - }, - { - "action": "Normalize the data using the `estimateSizeFactors` method from DEseq2, likely to remove batch effects.", - "code": "dds.normalize()\nprint('... estimateSizeFactors and normalize success')" - }, - { - "action": "Perform differential expression gene analysis using the t-test method. The treatment groups are '4-3' and '4-4', and the control groups are '1--1' and '1--2'.", - "code": "treatment_groups=['4-3','4-4']\ncontrol_groups=['1--1','1--2']\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')\nresult.head()" - }, - { - "action": "Filter out genes with low expression (log2(BaseMean) <= 1).", - "code": "print(result.shape)\nresult=result.loc[result['log2(BaseMean)']>1]\nprint(result.shape)" - }, - { - "action": "Set the threshold for fold change. The threshold is calculated automatically (-1) based on the log2FC distribution. 
The p-value threshold is set to 0.05, and the maximum log p-value is set to 6.", - "code": "# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=6)" - }, - { - "action": "Plot a volcano plot to visualize the results of the differential expression analysis. The plot includes the top 8 differentially expressed genes and sets the font size for gene labels to 12.", - "code": "dds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot a boxplot for the genes 'Ckap2' and 'Lef1' to visualize their expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2','Lef1'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Plot a boxplot for the gene 'Ckap2' to visualize its expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Download pathway database using `ov.utils.download_pathway_database()`.", - "code": "ov.utils.download_pathway_database()" - }, - { - "action": "Prepare a pathway dictionary from the 'WikiPathways_2019_Mouse.txt' file for mouse.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt',organism='Mouse')" - }, - { - "action": "Perform gene set enrichment analysis using the prepared pathway dictionary. The `pvalue_type` is set to 'auto' to automatically determine whether to use adjusted or raw p-values. 
The organism is set to 'mouse'.", - "code": "deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()\nenr=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot the gene set enrichment results using a custom plot function `geneset_plot`.", - "code": "ov.bulk.geneset_plot(enr,figsize=(2,5),fig_title='Wiki Pathway enrichment',\n cax_loc=[2, 0.45, 0.5, 0.02],\n bbox_to_anchor_used=(-0.25, -13),node_diameter=10,\n custom_ticks=[5,7],text_knock=3,\n cmap='Reds')" - }, - { - "action": "Prepare pathway dictionaries for GO Biological Process, GO Molecular Function, and GO Cellular Component for mouse and perform gene set enrichment analysis for each.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Mouse')\nenr_go_bp=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Molecular_Function_2023.txt',organism='Mouse')\nenr_go_mf=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Cellular_Component_2023.txt',organism='Mouse')\nenr_go_cc=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot multiple gene set enrichment results together using `geneset_plot_multi`.", - "code": "enr_dict={'BP':enr_go_bp,\n 'MF':enr_go_mf,\n 'CC':enr_go_cc}\ncolors_dict={\n 'BP':ov.pl.red_color[1],\n 'MF':ov.pl.green_color[1],\n 'CC':ov.pl.blue_color[1],\n}\n \nov.bulk.geneset_plot_multi(enr_dict,colors_dict,num=3,\n figsize=(2,5),\n text_knock=3,fontsize=8,\n cmap='Reds'\n )" - }, - { - "action": "Define a function `geneset_plot_multi` to plot multiple gene set enrichment results. 
This function takes a dictionary of enrichment results and a dictionary of colors, and plots them in a combined dot plot. It allows customization of the number of top pathways to display, font size, figure title, x-axis label, figure size, colormap, text knock, and maximum text size. It uses the `PyComplexHeatmap` library to create the plot.", - "code": "def geneset_plot_multi(enr_dict,colors_dict,num:int=5,fontsize=10,\n fig_title:str='',fig_xlabel:str='Fractions of genes',\n figsize:tuple=(2,4),cmap:str='YlGnBu',\n text_knock:int=5,text_maxsize:int=20,ax=None,\n ):\n from PyComplexHeatmap import HeatmapAnnotation,DotClustermapPlotter,anno_label,anno_simple,AnnotationBase\n for key in enr_dict.keys():\n enr_dict[key]['Type']=key\n enr_all=pd.concat([enr_dict[i].iloc[:num] for i in enr_dict.keys()],axis=0)\n enr_all['Term']=[ov.utils.plot_text_set(i.split('(')[0],text_knock=text_knock,text_maxsize=text_maxsize) for i in enr_all.Term.tolist()]\n enr_all.index=enr_all.Term\n enr_all['Term1']=[i for i in enr_all.index.tolist()]\n del enr_all['Term']\n\n colors=colors_dict\n\n left_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Type, merge=True,rotation=0,colors=colors,relpos=(1,0.8)),\n Category=anno_simple(enr_all.Type,cmap='Set1',\n add_text=False,legend=False,colors=colors),\n axis=0,verbose=0,label_kws={'rotation':45,'horizontalalignment':'left','visible':False})\n right_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Term1, merge=True,rotation=0,relpos=(0,0.5),arrowprops=dict(visible=True),\n colors=enr_all.assign(color=enr_all.Type.map(colors)).set_index('Term1').color.to_dict(),\n fontsize=fontsize,luminance=0.8,height=2),\n axis=0,verbose=0,#label_kws={'rotation':45,'horizontalalignment':'left'},\n orientation='right')\n if ax==None:\n fig, ax = plt.subplots(figsize=figsize) \n else:\n ax=ax\n #plt.figure(figsize=figsize)\n cm = DotClustermapPlotter(data=enr_all, x='fraction',y='Term1',value='logp',c='logp',s='num',\n cmap=cmap,\n 
row_cluster=True,#col_cluster=True,#hue='Group',\n #cmap={'Group1':'Greens','Group2':'OrRd'},\n vmin=-1*np.log10(0.1),vmax=-1*np.log10(1e-10),\n #colors={'Group1':'yellowgreen','Group2':'orange'},\n #marker={'Group1':'*','Group2':'$\\ast$'},\n show_rownames=True,show_colnames=False,row_dendrogram=False,\n col_names_side='top',row_names_side='right',\n xticklabels_kws={'labelrotation': 30, 'labelcolor': 'blue','labelsize':fontsize},\n #yticklabels_kws={'labelsize':10},\n #top_annotation=col_ha,left_annotation=left_ha,right_annotation=right_ha,\n left_annotation=left_ha,right_annotation=right_ha,\n spines=False,\n row_split=enr_all.Type,# row_split_gap=1,\n #col_split=df_col.Group,col_split_gap=0.5,\n verbose=1,legend_gap=10,\n #dot_legend_marker='*',\n \n xlabel='Fractions of genes',xlabel_side=\"bottom\",\n xlabel_kws=dict(labelpad=8,fontweight='normal',fontsize=fontsize+2),\n # xlabel_bbox_kws=dict(facecolor=facecolor)\n )\n tesr=plt.gcf().axes\n for ax in plt.gcf().axes:\n if hasattr(ax, 'get_xlabel'):\n if ax.get_xlabel() == 'Fractions of genes': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.grid(False)\n if ax.get_ylabel() == 'logp': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.tick_params(labelsize=fontsize+2)\n cbar.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2)\n cbar.grid(False)\n return ax" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_gptanno.json b/rag_engine/ovrawmjson/t_gptanno.json deleted file mode 100644 index 4b760066..00000000 --- a/rag_engine/ovrawmjson/t_gptanno.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nprint(f'omicverse version:{ov.__version__}')\nimport scanpy as sc\nprint(f'scanpy version:{sc.__version__}')\nov.ov_plot_set()" - }, - { - "action": "Create a directory named 'data', download the PBMC3K dataset from 10x Genomics, and unpack it. 
Then, create a directory named 'write' for storing processed data.", - "code": "# !mkdir data\n# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the count matrix from the 10x Genomics data into an AnnData object, using gene symbols for variable names and caching the data for faster reading.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading" - }, - { - "action": "Perform quality control on the AnnData object, filtering cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})" - }, - { - "action": "Preprocess the data by normalizing and identifying highly variable genes (HVGs) using the 'shiftlog|pearson' mode, selecting the top 2000 HVGs.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)" - }, - { - "action": "Store the raw data in `adata.raw` and filter the AnnData object to keep only the highly variable genes.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]" - }, - { - "action": "Scale the data in `adata.X`.", - "code": "ov.pp.scale(adata)" - }, - { - "action": "Perform Principal Component Analysis (PCA) on the scaled data, reducing the dimensionality to 50 principal components.", - "code": "ov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Construct a neighborhood graph using the top 50 principal components, considering 15 nearest neighbors.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n 
use_rep='scaled|original|X_pca')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph.", - "code": "sc.tl.leiden(adata)" - }, - { - "action": "Calculate a dendrogram for the Leiden clusters and identify marker genes for each cluster using the Wilcoxon rank-sum test.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='wilcoxon',use_raw=False,)" - }, - { - "action": "Perform dimensionality reduction for visualization using Minimum Distortion Embedding (MDE) based on the PCA results.", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])" - }, - { - "action": "Plot the MDE embedding, coloring cells by their Leiden cluster assignments, with the legend placed on the data points and a custom color palette.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'], \n legend_loc='on data', \n frameon='small',\n legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Manually define a dictionary of marker genes for two clusters and use `ov.single.gptcelltype` to annotate cell types using the Qwen model through its API, specifying 'PBMC' as the tissue and 'human' as the species.", - "code": "import os\nall_markers={'cluster1':['CD3D','CD3E'],\n 'cluster2':['MS4A1']}\n\nos.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='qwen-plus', provider='qwen',\n topgenenumber=5)\nresult" - }, - { - "action": "Automatically identify marker genes for each cluster in the AnnData object using `ov.single.get_celltype_marker`, considering genes with a fold change greater than 2 and selecting the top 5 genes.", - "code": "all_markers=ov.single.get_celltype_marker(adata,clustertype='leiden',rank=True,\n key='rank_genes_groups',\n foldchange=2,topgenenumber=5)\nall_markers" - }, 
- { - "action": "Use `ov.single.gptcelltype` to annotate cell types using the Qwen model through its API, specifying 'PBMC' as the tissue and 'human' as the species, based on automatically identified marker genes.", - "code": "import os\nos.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='qwen-plus', provider='qwen',\n topgenenumber=5)\nresult" - }, - { - "action": "Extract the cell type annotations from the `gptcelltype` output, removing extra information and keeping only the cell type names.", - "code": "new_result={}\nfor key in result.keys():\n new_result[key]=result[key].split(': ')[-1].split(' (')[0].split('. ')[1]\nnew_result" - }, - { - "action": "Map the extracted cell type annotations to the 'leiden' clusters in the AnnData object and store them in a new observation called 'gpt_celltype'.", - "code": "adata.obs['gpt_celltype'] = adata.obs['leiden'].map(new_result).astype('category')" - }, - { - "action": "Plot the MDE embedding, coloring cells by both their 'leiden' cluster assignments and the new 'gpt_celltype' annotations, with the legend placed on the data points and a custom color palette.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden','gpt_celltype'], \n legend_loc='on data', \n frameon='small',\n legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Use `ov.single.gptcelltype` with the OpenAI API to annotate cell types for a given set of marker genes, specifying 'gpt-4o' as the model and 'openai' as the provider.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='gpt-4o', provider='openai',\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype` with the Qwen API to annotate cell types for a given set of marker genes, specifying 'qwen-plus' 
as the model and 'qwen' as the provider.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='qwen-plus', provider='qwen',\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype` with the Kimi API to annotate cell types for a given set of marker genes, specifying 'moonshot-v1-8k' as the model and 'kimi' as the provider.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='moonshot-v1-8k', provider='kimi',\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype` with a custom `base_url` to annotate cell types, demonstrating the flexibility to use other models that support the OpenAI API format.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='moonshot-v1-8k', base_url=\"https://api.moonshot.cn/v1\",\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype_local` to annotate cell types using a local large language model (LLM), specifying the path to the local model.", - "code": "anno_model = 'path/to/your/local/LLM' # '~/models/Qwen2-7B-Instruct'\n\nresult = ov.single.gptcelltype_local(all_markers, tissuename='PBMC', speciename='human', \n model_name=anno_model, topgenenumber=5)\nresult" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_mapping.json b/rag_engine/ovrawmjson/t_mapping.json deleted file mode 100644 index 723497e6..00000000 --- a/rag_engine/ovrawmjson/t_mapping.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" 
- }, - { - "action": "Read single-cell data from a file, then create and display a UMAP plot colored by 'Subset' to visualize the different subsets within the data.", - "code": "adata_sc=ov.read('data/sc.h5ad')\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(\n adata_sc,\n basis=\"X_umap\",\n color=['Subset'],\n title='Subset',\n frameon='small',\n wspace=0.65,\n show=False,\n ax=ax\n)" - }, - { - "action": "Print the maximum value of the raw data, preprocess the single-cell data using shiftlog and Pearson residuals, select the top 3000 highly variable genes, normalize the data to a target sum of 1e4, and then print the maximum value of the normalized data.", - "code": "print(\"RAW\",adata_sc.X.max())\nadata_sc=ov.pp.preprocess(adata_sc,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4)\nadata_sc.raw = adata_sc\nadata_sc = adata_sc[:, adata_sc.var.highly_variable_features]\nprint(\"Normalize\",adata_sc.X.max())" - }, - { - "action": "Load spatial transcriptomics data from 10X Genomics for the 'V1_Human_Lymph_Node' sample, assign sample ID, and ensure unique variable names.", - "code": "adata = sc.datasets.visium_sge(sample_id=\"V1_Human_Lymph_Node\")\nadata.obs['sample'] = list(adata.uns['spatial'].keys())[0]\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics for the spatial data, filter out genes with total counts less than 100, compute spatially variable genes using the 'prost' method, select the top 3000 spatially variable genes, normalize the data, and create a copy for further analysis.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)\nadata.raw = adata\nadata = adata[:, adata.var.space_variable_features]\nadata_sp=adata.copy()\nadata_sp" - }, - { - "action": "Initialize the Tangram model with single-cell and spatial data, using 
'Subset' as the clustering key.", - "code": "tg=ov.space.Tangram(adata_sc,adata_sp,clusters='Subset')" - }, - { - "action": "Train the Tangram model in 'clusters' mode for 500 epochs using a CUDA device.", - "code": "tg.train(mode=\"clusters\",num_epochs=500,device=\"cuda:0\")" - }, - { - "action": "Use the trained Tangram model to infer cell locations in spatial spots and store the result in `adata_plot`.", - "code": "adata_plot=tg.cell2location()\nadata_plot.obs.columns" - }, - { - "action": "Create a spatial plot showing the distribution of specified cell types using the 'magma' colormap.", - "code": "annotation_list=['B_Cycling', 'B_GC_LZ', 'T_CD4+_TfH_GC', 'FDC',\n 'B_naive', 'T_CD4+_naive', 'B_plasma', 'Endo']\n\nsc.pl.spatial(adata_plot, cmap='magma',\n color=annotation_list,\n ncols=4, size=1.3,\n img_key='hires'\n )" - }, - { - "action": "Create a dictionary mapping cell type categories to their corresponding colors from the single-cell data.", - "code": "color_dict=dict(zip(adata_sc.obs['Subset'].cat.categories,\n adata_sc.uns['Subset_colors']))" - }, - { - "action": "Create a spatial plot of the first 5 cell types from `annotation_list`, using specified colors and adjusting the color scale and circle size.", - "code": "import matplotlib as mpl\nclust_labels = annotation_list[:5]\nclust_col = ['' + str(i) for i in clust_labels]\n\nwith mpl.rc_context({'figure.figsize': (8, 8),'axes.grid': False}):\n fig = ov.pl.plot_spatial(\n adata=adata_plot,\n color=clust_col, labels=clust_labels,\n show_img=True,\n style='fast',\n max_color_quantile=0.992,\n circle_diameter=3,\n reorder_cmap = [1,2,3,4,6],\n colorbar_position='right',\n palette=color_dict\n )" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_metacells.json b/rag_engine/ovrawmjson/t_metacells.json deleted file mode 100644 index d9b7fa15..00000000 --- a/rag_engine/ovrawmjson/t_metacells.json +++ /dev/null @@ -1,94 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, 
scanpy, and scvelo. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\n\nov.plot_set()" - }, - { - "action": "Load the pancreas dataset using `scv.datasets.pancreas()`.", - "code": "adata = scv.datasets.pancreas()\nadata" - }, - { - "action": "Perform quality control on the AnnData object `adata` using `ov.pp.qc()`, filtering cells based on mitochondrial percentage, number of UMIs, and number of detected genes.", - "code": "#quantity control\nadata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.20, 'nUMIs': 500, 'detected_genes': 250},\n mt_startswith='mt-')" - }, - { - "action": "Preprocess the AnnData object `adata` using `ov.pp.preprocess()`, normalizing and calculating highly variable genes (HVGs).", - "code": "#normalize and high variable genes (HVGs) calculated\nadata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)" - }, - { - "action": "Save the whole genes in `adata.raw` and filter out non-HVGs from `adata`.", - "code": "#save the whole genes and filter the non-HVGs\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]" - }, - { - "action": "Scale the `adata.X` using `ov.pp.scale()`.", - "code": "#scale the adata.X\nov.pp.scale(adata)" - }, - { - "action": "Perform dimensionality reduction using PCA on the scaled data with `ov.pp.pca()`, keeping the top 50 principal components.", - "code": "#Dimensionality Reduction\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Construct a metacellular object using `ov.single.MetaCell` with specified parameters.", - "code": "meta_obj=ov.single.MetaCell(adata,use_rep='scaled|original|X_pca',\n n_metacells=None,\n use_gpu='cuda:0')" - }, - { - "action": "Initialize archetypes for the metacellular object.", - "code": "get_ipython().run_cell_magic('time', '', 'meta_obj.initialize_archetypes()\\n')" - }, - { - "action": "Train the SEACells model with specified minimum and maximum iterations.", - "code": 
"get_ipython().run_cell_magic('time', '', 'meta_obj.train(min_iter=10, max_iter=50)\\n')" - }, - { - "action": "Save the trained model to a file.", - "code": "meta_obj.save('seacells/model.pkl')" - }, - { - "action": "Load the trained model from a file.", - "code": "meta_obj.load('seacells/model.pkl')" - }, - { - "action": "Predict metacells using the `predicted` method with 'soft' aggregation and summarize the 'lognorm' layer.", - "code": "ad=meta_obj.predicted(method='soft',celltype_label='clusters',\n summarize_layer='lognorm')" - }, - { - "action": "Compute cell type purity, separation, and compactness for benchmarking.", - "code": "SEACell_purity = meta_obj.compute_celltype_purity('clusters')\nseparation = meta_obj.separation(use_rep='scaled|original|X_pca',nth_nbr=1)\ncompactness = meta_obj.compactness(use_rep='scaled|original|X_pca')" - }, - { - "action": "Create box plots to visualize cell type purity, compactness, and separation using `seaborn` and `matplotlib`.", - "code": "import seaborn as sns\nimport matplotlib.pyplot as plt\nov.plot_set()\nfig, axes = plt.subplots(1,3,figsize=(4,4))\nsns.boxplot(data=SEACell_purity, y='clusters_purity',ax=axes[0],\n color=ov.utils.blue_color[3])\nsns.boxplot(data=compactness, y='compactness',ax=axes[1],\n color=ov.utils.blue_color[4])\nsns.boxplot(data=separation, y='separation',ax=axes[2],\n color=ov.utils.blue_color[4])\nplt.tight_layout()\nplt.suptitle('Evaluate of MetaCells',fontsize=13,y=1.05)\nfor ax in axes:\n ax.grid(False)\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.spines['bottom'].set_visible(True)\n ax.spines['left'].set_visible(True)" - }, - { - "action": "Plot UMAP embedding of metacells colored by cluster labels and overlay metacell centers.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\nov.pl.embedding(\n meta_obj.adata,\n basis=\"X_umap\",\n color=['clusters'],\n frameon='small',\n title=\"Meta cells\",\n #legend_loc='on 
data',\n legend_fontsize=14,\n legend_fontoutline=2,\n size=10,\n ax=ax,\n alpha=0.2,\n #legend_loc='', \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n #palette=ov.utils.blue_color[:],\n #legend_fontweight='normal'\n)\nov.single.plot_metacells(ax,meta_obj.adata,color='#CB3E35',\n )" - }, - { - "action": "Get the mean value of 'S_score' from the original `adata` and store it in the metacell AnnData object `ad`.", - "code": "ov.single.get_obs_value(ad,adata,groupby='S_score',\n type='mean')\nad.obs.head()" - }, - { - "action": "Identify highly variable genes in the metacell AnnData object `ad`.", - "code": "import scanpy as sc\nad.raw=ad.copy()\nsc.pp.highly_variable_genes(ad, n_top_genes=2000, inplace=True)\nad=ad[:,ad.var.highly_variable]" - }, - { - "action": "Scale the metacell data and perform PCA.", - "code": "ov.pp.scale(ad)\nov.pp.pca(ad,layer='scaled',n_pcs=30)" - }, - { - "action": "Compute nearest neighbors for the metacell data.", - "code": "ov.pp.neighbors(ad, n_neighbors=15, n_pcs=20,\n use_rep='scaled|original|X_pca')" - }, - { - "action": "Compute UMAP for the metacell data.", - "code": "ov.pp.umap(ad)" - }, - { - "action": "Set the 'celltype' observation to be categorical and reorder categories to match the original data. 
Also, set the color palette for 'celltype' to match the original data.", - "code": "ad.obs['celltype']=ad.obs['celltype'].astype('category')\nad.obs['celltype']=ad.obs['celltype'].cat.reorder_categories(adata.obs['clusters'].cat.categories)\nad.uns['celltype_colors']=adata.uns['clusters_colors']" - }, - { - "action": "Plot UMAP embedding of metacells colored by 'celltype' and 'S_score'.", - "code": "ov.pl.embedding(ad, basis='X_umap',\n color=[\"celltype\",\"S_score\"],\n frameon='small',cmap='RdBu_r',\n wspace=0.5)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_metatime.json b/rag_engine/ovrawmjson/t_metatime.json deleted file mode 100644 index abcae38b..00000000 --- a/rag_engine/ovrawmjson/t_metatime.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Import the scanpy library and read the 'TiME_adata_scvi.h5ad' file into an AnnData object named `adata`.", - "code": "import scanpy as sc\nadata=sc.read('TiME_adata_scvi.h5ad')\nadata" - }, - { - "action": "Calculate the neighborhood graph of the cells in `adata` using the 'X_scVI' representation.", - "code": "sc.pp.neighbors(adata, use_rep=\"X_scVI\")" - }, - { - "action": "Calculate the Minimum Distortion Embedding (MDE) of the 'X_scVI' representation and store it in `adata.obsm[\"X_mde\"]`.", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"X_scVI\"])" - }, - { - "action": "Plot the MDE embedding, colored by the 'patient' variable.", - "code": "sc.pl.embedding(\n adata,\n basis=\"X_mde\",\n color=[\"patient\"],\n frameon=False,\n ncols=1,\n)" - }, - { - "action": "Initialize a MetaTiME object with the AnnData object `adata` and mode set to 'table'.", - "code": "TiME_object=ov.single.MetaTiME(adata,mode='table')" - }, - { - "action": "Overcluster the cells in the `TiME_object` with a resolution of 8 and 
store the cluster labels in `adata.obs['overcluster']`.", - "code": "TiME_object.overcluster(resolution=8,clustercol = 'overcluster',)" - }, - { - "action": "Predict the cell types in the tumor microenvironment (TME) using `TiME_object.predictTiME()` and store the results in `adata.obs['MetaTiME']` and `adata.obs['Major_MetaTiME']`.", - "code": "TiME_object.predictTiME(save_obs_name='MetaTiME')" - }, - { - "action": "Plot the predicted cell types on the MDE embedding using `TiME_object.plot()`.", - "code": "fig,ax=TiME_object.plot(cluster_key='MetaTiME',basis='X_mde',dpi=80)" - }, - { - "action": "Plot the major cell types on the MDE embedding using `sc.pl.embedding()`.", - "code": "sc.pl.embedding(\n adata,\n basis=\"X_mde\",\n color=[\"Major_MetaTiME\"],\n frameon=False,\n ncols=1,\n)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_mofa.json b/rag_engine/ovrawmjson/t_mofa.json deleted file mode 100644 index a3166f4d..00000000 --- a/rag_engine/ovrawmjson/t_mofa.json +++ /dev/null @@ -1,78 +0,0 @@ -[ - { - "action": "Import the omicverse library and read scRNA-seq and scATAC-seq data from specified files.", - "code": "import omicverse as ov\nrna=ov.utils.read('data/sample/rna_p_n_raw.h5ad')\natac=ov.utils.read('data/sample/atac_p_n_raw.h5ad')" - }, - { - "action": "Display the loaded scRNA-seq and scATAC-seq data.", - "code": "rna,atac" - }, - { - "action": "Create a MOFA model using the omicverse library, incorporating both scRNA-seq and scATAC-seq data, and assigning names to each omics type.", - "code": "test_mofa=ov.single.pyMOFA(omics=[rna,atac],\n omics_name=['RNA','ATAC'])" - }, - { - "action": "Preprocess the MOFA model and run it, saving the output to a specified HDF5 file.", - "code": "test_mofa.mofa_preprocess()\ntest_mofa.mofa_run(outfile='models/brac_rna_atac.hdf5')" - }, - { - "action": "Import the omicverse library and set plotting parameters using `ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" 
- }, - { - "action": "Read scRNA-seq data from a specified file.", - "code": "rna=ov.utils.read('data/sample/rna_test.h5ad')" - }, - { - "action": "Extract factor values from a pre-computed MOFA model (stored in an HDF5 file) and add them to the scRNA-seq AnnData object.", - "code": "rna=ov.single.factor_exact(rna,hdf5_path='data/sample/MOFA_POS.hdf5')\nrna" - }, - { - "action": "Calculate and display the correlation between factors and cell types in the scRNA-seq data.", - "code": "ov.single.factor_correlation(adata=rna,cluster='cell_type',factor_list=[1,2,3,4,5])" - }, - { - "action": "Retrieve and display the gene/feature weights for a specific factor and view from the MOFA model.", - "code": "ov.single.get_weights(hdf5_path='data/sample/MOFA_POS.hdf5',view='RNA',factor=1)" - }, - { - "action": "Initialize a MOFA visualization object using a pre-computed MOFA model from a specified HDF5 file.", - "code": "pymofa_obj=ov.single.pyMOFAART(model_path='data/sample/MOFA_POS.hdf5')" - }, - { - "action": "Extract the factor values for each cell in the scRNA-seq data using the MOFA visualization object.", - "code": "pymofa_obj.get_factors(rna)\nrna" - }, - { - "action": "Plot the variance explained (R-squared) for each factor in each view of the MOFA model.", - "code": "pymofa_obj.plot_r2()" - }, - { - "action": "Retrieve and display the R-squared values for each factor in each view.", - "code": "pymofa_obj.get_r2()" - }, - { - "action": "Plot the correlation between factors and cell types using the MOFA visualization object.", - "code": "pymofa_obj.plot_cor(rna,'cell_type')" - }, - { - "action": "Plot the values of two specified factors against each other, colored by a specific cell type ('Epi').", - "code": "pymofa_obj.plot_factor(rna,'cell_type','Epi',figsize=(3,3),\n factor1=6,factor2=10,)" - }, - { - "action": "Calculate and visualize UMAP embeddings of the scRNA-seq data, colored by 'factor6' and 'cell_type'.", - "code": "import scanpy as 
sc\nsc.pp.neighbors(rna)\nsc.tl.umap(rna)\nsc.pl.embedding(\n rna,\n basis=\"X_umap\",\n color=[\"factor6\",\"cell_type\"],\n frameon=False,\n ncols=2,\n #palette=ov.utils.pyomic_palette(),\n show=False,\n cmap='Greens',\n vmin=0,\n)\n#plt.savefig(\"figures/umap_factor6.png\",dpi=300,bbox_inches = 'tight')" - }, - { - "action": "Plot the weights of genes/features for two specified factors in a scatter plot, highlighting the top weighted genes.", - "code": "pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=6,factor2=10,)" - }, - { - "action": "Plot the weights of genes/features for a specific factor, ordered by weight and colored.", - "code": "pymofa_obj.plot_weights(view='RNA',factor=6,color='#5de25d',\n ascending=True)" - }, - { - "action": "Plot a heatmap showing the top weighted features for each factor in a specific view ('RNA').", - "code": "pymofa_obj.plot_top_feature_heatmap(view='RNA')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_mofa_glue.json b/rag_engine/ovrawmjson/t_mofa_glue.json deleted file mode 100644 index 53b79b02..00000000 --- a/rag_engine/ovrawmjson/t_mofa_glue.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Load RNA and ATAC data from 'h5ad' files using `ov.utils.read()`.", - "code": "rna=ov.utils.read(\"chen_rna-emb.h5ad\")\natac=ov.utils.read(\"chen_atac-emb.h5ad\")" - }, - { - "action": "Create a GLUE_pair object to pair cells between RNA and ATAC data based on the Pearson correlation coefficient of their feature vectors (X_glue).", - "code": "pair_obj=ov.single.GLUE_pair(rna,atac)\npair_obj.correlation()" - }, - { - "action": "Find the top 20 highly correlated cells in the other omics layer for each cell, with a default minimum correlation threshold of 0.9. 
Save the results to a CSV file.", - "code": "res_pair=pair_obj.find_neighbor_cell(depth=20)\nres_pair.to_csv('models/chen_pair_res.csv')" - }, - { - "action": "Filter the original RNA and ATAC datasets to keep only the paired cells identified in the previous step. Rename the index of the filtered datasets to match the paired cell indices.", - "code": "rna1=rna[res_pair['omic_1']]\natac1=atac[res_pair['omic_2']]\nrna1.obs.index=res_pair.index\natac1.obs.index=res_pair.index\nrna1,atac1" - }, - { - "action": "Create a MuData object to store the paired RNA and ATAC data.", - "code": "from mudata import MuData\n\nmdata = MuData({'rna': rna1, 'atac': atac1})\nmdata" - }, - { - "action": "Write the MuData object to a compressed 'h5mu' file.", - "code": "mdata.write(\"chen_mu.h5mu\",compression='gzip')" - }, - { - "action": "Filter the RNA and ATAC data to keep only highly variable genes.", - "code": "rna1=mdata['rna']\nrna1=rna1[:,rna1.var['highly_variable']==True]\natac1=mdata['atac']\natac1=atac1[:,atac1.var['highly_variable']==True]\nrna1.obs.index=res_pair.index\natac1.obs.index=res_pair.index" - }, - { - "action": "Randomly select 5000 cells from rna1 data", - "code": "import random\nrandom_obs_index=random.sample(list(rna1.obs.index),5000)" - }, - { - "action": "Calculate the adjusted rand index (ARI) between the cell types of the randomly selected cells and all cells in the paired RNA and ATAC data.", - "code": "from sklearn.metrics import adjusted_rand_score as ari\nari_random=ari(rna1[random_obs_index].obs['cell_type'], atac1[random_obs_index].obs['cell_type'])\nari_raw=ari(rna1.obs['cell_type'], atac1.obs['cell_type'])\nprint('raw ari:{}, random ari:{}'.format(ari_raw,ari_random))" - }, - { - "action": "Construct a MOFA model using the paired RNA and ATAC data.", - "code": "test_mofa=ov.single.pyMOFA(omics=[rna1,atac1],\n omics_name=['RNA','ATAC'])" - }, - { - "action": "Preprocess the data for MOFA and run the MOFA algorithm, saving the results to an HDF5 
file.", - "code": "test_mofa.mofa_preprocess()\ntest_mofa.mofa_run(outfile='models/chen_rna_atac.hdf5')" - }, - { - "action": "Create a pyMOFAART object to analyze the MOFA results.", - "code": "pymofa_obj=ov.single.pyMOFAART(model_path='models/chen_rna_atac.hdf5')" - }, - { - "action": "Extract the learned factors from the MOFA model and add them to the RNA AnnData object.", - "code": "pymofa_obj.get_factors(rna1)\nrna1" - }, - { - "action": "Plot the variance explained (R^2) by each factor for each view.", - "code": "pymofa_obj.plot_r2()" - }, - { - "action": "Get the R^2 values for each factor and view.", - "code": "pymofa_obj.get_r2()" - }, - { - "action": "Plot the correlation between factors and a specified metadata column ('cell_type') in the RNA AnnData object.", - "code": "pymofa_obj.plot_cor(rna1,'cell_type',figsize=(4,6))" - }, - { - "action": "Get the correlation values between factors and the specified metadata column.", - "code": "pymofa_obj.get_cor(rna1,'cell_type')" - }, - { - "action": "Plot a scatter plot of two specified factors, colored by a specified metadata column and highlighting a specific cell type.", - "code": "pymofa_obj.plot_factor(rna1,'cell_type','Ast',figsize=(3,3),\n factor1=1,factor2=3,)" - }, - { - "action": "Calculate and store the Minimum Description Length (MDE) embedding of the data using the 'X_glue' representation.", - "code": "from scvi.model.utils import mde\nimport scanpy as sc\nsc.pp.neighbors(rna1, use_rep=\"X_glue\", metric=\"cosine\")\nrna1.obsm[\"X_mde\"] = mde(rna1.obsm[\"X_glue\"])" - }, - { - "action": "Plot the MDE embedding, colored by specified factors and cell type.", - "code": "sc.pl.embedding(\n rna1,\n basis=\"X_mde\",\n color=[\"factor1\",\"factor3\",\"cell_type\"],\n frameon=False,\n ncols=3,\n #palette=ov.utils.pyomic_palette(),\n show=False,\n cmap='Greens',\n vmin=0,\n)" - }, - { - "action": "Plot the weights of genes for two specified factors in a specified view.", - "code": 
"pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=1,factor2=3,)" - }, - { - "action": "Plot the weights of genes for a specified factor in a specified view, sorted in ascending or descending order.", - "code": "pymofa_obj.plot_weights(view='RNA',factor=1,\n ascending=False)" - }, - { - "action": "Plot a heatmap of the top features for each factor in a specified view.", - "code": "pymofa_obj.plot_top_feature_heatmap(view='RNA')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_network.json b/rag_engine/ovrawmjson/t_network.json deleted file mode 100644 index a8626fad..00000000 --- a/rag_engine/ovrawmjson/t_network.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Define a list of genes to be analyzed, representing FAA4 and its ten most confident interactors in Saccharomyces cerevisiae.", - "code": "gene_list=['FAA4','POX1','FAT1','FAS2','FAS1','FAA1','OLE1','YJU3','TGL3','INA1','TGL5']" - }, - { - "action": "Create dictionaries to store gene type and color information for visualization. The top 5 genes are assigned 'Type1' and a specific color, while the rest are assigned 'Type2' and another color.", - "code": "gene_type_dict=dict(zip(gene_list,['Type1']*5+['Type2']*6))\ngene_color_dict=dict(zip(gene_list,['#F7828A']*5+['#9CCCA4']*6))" - }, - { - "action": "Perform STRING interaction analysis using `ov.bulk.string_interaction()`. This function retrieves protein-protein interaction data from the STRING database for the given gene list and species (4932 for Saccharomyces cerevisiae). The result is stored in the `G_res` variable.", - "code": "G_res=ov.bulk.string_interaction(gene_list,4932)\nG_res.head()" - }, - { - "action": "Initialize a `pyPPI` object from `omicverse.bulk` to handle protein-protein interaction network analysis. 
The object is configured with the gene list, gene type dictionary, gene color dictionary, and species ID.", - "code": "ppi=ov.bulk.pyPPI(gene=gene_list,\n gene_type_dict=gene_type_dict,\n gene_color_dict=gene_color_dict,\n species=4932)" - }, - { - "action": "Connect to the STRING database and calculate the protein-protein interactions using the `interaction_analysis()` method of the `pyPPI` object.", - "code": "ppi.interaction_analysis()" - }, - { - "action": "Plot the protein-protein interaction network using the `plot_network()` method of the `pyPPI` object. This function visualizes the network based on the calculated interactions and the provided gene type and color information.", - "code": "ppi.plot_network()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_nocd.json b/rag_engine/ovrawmjson/t_nocd.json deleted file mode 100644 index d73002ca..00000000 --- a/rag_engine/ovrawmjson/t_nocd.json +++ /dev/null @@ -1,38 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, anndata, scanpy, matplotlib.pyplot, numpy, and pandas. 
Also, enable inline plotting for matplotlib.", - "code": "import omicverse as ov\nimport anndata\nimport scanpy as sc\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')" - }, - { - "action": "Set scanpy settings for verbosity and figure parameters.", - "code": "sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\nsc.settings.set_figure_params(dpi=80, facecolor='white')" - }, - { - "action": "Define a custom colormap for visualizations.", - "code": "from matplotlib.colors import LinearSegmentedColormap\nsc_color=['#7CBB5F','#368650','#A499CC','#5E4D9A','#78C2ED','#866017', '#9F987F','#E0DFED',\n '#EF7B77', '#279AD7','#F0EEF0', '#1F577B', '#A56BA7', '#E0A7C8', '#E069A6', '#941456', '#FCBC10',\n '#EAEFC5', '#01A0A7', '#75C8CC', '#F0D7BC', '#D5B26C', '#D5DA48', '#B6B812', '#9DC3C3', '#A89C92', '#FEE00C', '#FEF2A1']\nsc_color_cmap = LinearSegmentedColormap.from_list('Custom', sc_color, len(sc_color))" - }, - { - "action": "Read the single-cell RNA sequencing data from an h5ad file.", - "code": "adata = anndata.read('sample/rna.h5ad')\nadata" - }, - { - "action": "Apply lazy preprocessing using omicverse's scanpy_lazy function.", - "code": "adata=ov.single.scanpy_lazy(adata)" - }, - { - "action": "Initialize, configure, and run the scNOCD model for overlapping community detection.", - "code": "scbrca=ov.single.scnocd(adata)\nscbrca.matrix_transform()\nscbrca.matrix_normalize()\nscbrca.GNN_configure()\nscbrca.GNN_preprocess()\nscbrca.GNN_model()\nscbrca.GNN_result()\nscbrca.GNN_plot()\n#scbrca.calculate_nocd()\nscbrca.cal_nocd()" - }, - { - "action": "Calculate the non-overlapping community detection (NOCD) results.", - "code": "scbrca.calculate_nocd()" - }, - { - "action": "Visualize the UMAP embeddings colored by Leiden clustering and NOCD results.", - "code": "sc.pl.umap(scbrca.adata, color=['leiden','nocd'],wspace=0.4,palette=sc_color)" - }, - { - "action": 
"Visualize the UMAP embeddings colored by Leiden clustering and the number of communities each cell belongs to (nocd_n).", - "code": "sc.pl.umap(scbrca.adata, color=['leiden','nocd_n'],wspace=0.4,palette=sc_color)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_preprocess.json b/rag_engine/ovrawmjson/t_preprocess.json deleted file mode 100644 index 684f6504..00000000 --- a/rag_engine/ovrawmjson/t_preprocess.json +++ /dev/null @@ -1,130 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.ov_plot_set()" - }, - { - "action": "Create directories for data storage and download the 10x Genomics PBMC3k dataset.", - "code": "# !mkdir data\n# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data into an AnnData object using `sc.read_10x_mtx()`.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading\nadata" - }, - { - "action": "Make variable and observation names unique.", - "code": "adata.var_names_make_unique()\nadata.obs_names_make_unique()" - }, - { - "action": "Perform quality control on the AnnData object using `ov.pp.qc()`, filtering cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})\nadata" - }, - { - "action": "Store the original counts in `adata.uns['layers_counts']` using `ov.utils.store_layers()`.", - "code": 
"ov.utils.store_layers(adata,layers='counts')\nadata" - }, - { - "action": "Preprocess the data using `ov.pp.preprocess()`, applying `shiftlog` normalization and Pearson residuals for highly variable gene detection.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\nadata" - }, - { - "action": "Set the `.raw` attribute of the AnnData object to the normalized and logarithmized raw gene expression.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Retrieve the original counts from `adata.uns['layers_counts']` and compare the maximum values of normalized and raw count data.", - "code": "adata_counts=adata.copy()\nov.utils.retrieve_layers(adata_counts,layers='counts')\nprint('normalize adata:',adata.X.max())\nprint('raw count adata:',adata_counts.X.max())" - }, - { - "action": "Display the AnnData object with raw counts.", - "code": "adata_counts" - }, - { - "action": "Retrieve the original count matrix at the whole gene level.", - "code": "adata_counts=adata.raw.to_adata().copy()\nov.utils.retrieve_layers(adata_counts,layers='counts')\nprint('normalize adata:',adata.X.max())\nprint('raw count adata:',adata_counts.X.max())\nadata_counts" - }, - { - "action": "Scale the data and store the results in a layer using `ov.pp.scale()`.", - "code": "ov.pp.scale(adata)\nadata" - }, - { - "action": "Perform principal component analysis (PCA) on the scaled data using `ov.pp.pca()`.", - "code": "ov.pp.pca(adata,layer='scaled',n_pcs=50)\nadata" - }, - { - "action": "Visualize the PCA embeddings using `ov.utils.embedding()`, coloring by the 'CST3' gene.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']\nov.utils.embedding(adata,\n basis='X_pca',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute the neighborhood graph using `sc.pp.neighbors()`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')" - }, - { - 
"action": "Calculate Minimum Distortion Embedding (MDE) using `ov.utils.mde()`.", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])\nadata" - }, - { - "action": "Visualize the MDE embeddings using `ov.utils.embedding()`, coloring by the 'CST3' gene.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute UMAP embeddings using `sc.tl.umap()`.", - "code": "sc.tl.umap(adata)" - }, - { - "action": "Visualize the UMAP embeddings using `ov.utils.embedding()`, coloring by the 'CST3' gene.", - "code": "ov.utils.embedding(adata,\n basis='X_umap',\n color='CST3',\n frameon='small')" - }, - { - "action": "Perform Leiden clustering using `sc.tl.leiden()`.", - "code": "sc.tl.leiden(adata)" - }, - { - "action": "Visualize the MDE embeddings using `ov.utils.embedding()`, coloring by 'leiden', 'CST3', and 'NKG7'.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden', 'CST3', 'NKG7'],\n frameon='small')" - }, - { - "action": "Visualize specific clusters using `ov.utils.plot_ConvexHull()`.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots( figsize = (4,4))\n\nov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.utils.plot_ConvexHull(adata,\n basis='X_mde',\n cluster_key='leiden',\n hull_cluster='0',\n ax=ax)" - }, - { - "action": "Generate and display labels for Leiden clusters using `ov.utils.gen_mpl_labels()` with custom styling.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.utils.gen_mpl_labels(\n adata,\n 'leiden',\n exclude=(\"None\",), \n basis='X_mde',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n 
text_kwargs=dict(fontsize= 12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Define a list of marker genes.", - "code": "marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',\n 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',\n 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']" - }, - { - "action": "Create a dot plot of the marker genes using `sc.pl.dotplot()`.", - "code": "sc.pl.dotplot(adata, marker_genes, groupby='leiden',\n standard_scale='var');" - }, - { - "action": "Calculate a dendrogram and rank genes using t-test with `sc.tl.dendrogram()` and `sc.tl.rank_genes_groups()`, then visualize the results with `sc.pl.rank_genes_groups_dotplot()`.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='leiden_ttest')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_ttest',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Rank genes using t-test and COSG with `sc.tl.rank_genes_groups()` and `ov.single.cosg()`, then visualize the results with `sc.pl.rank_genes_groups_dotplot()`.", - "code": "sc.tl.rank_genes_groups(adata, groupby='leiden', \n method='t-test',use_rep='scaled|original|X_pca',)\nov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_cosg',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Prepare a dictionary of DataFrames for the stacked volcano plot, where each DataFrame contains gene names, log fold changes, and adjusted p-values for each Leiden cluster.", - "code": "data_dict={}\nfor i in adata.obs['leiden'].cat.categories:\n data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest',\n pval_cutoff=None,log2fc_min=None)" - }, - { - "action": "Print keys of the 
data_dict", - "code": "data_dict.keys()" - }, - { - "action": "Print the head of the DataFrame for a specific cluster", - "code": "data_dict[i].head()" - }, - { - "action": "Prepare a dictionary mapping Leiden cluster names to colors.", - "code": "type_color_dict=dict(zip(adata.obs['leiden'].cat.categories,\n adata.uns['leiden_colors']))\ntype_color_dict" - }, - { - "action": "Create a stacked volcano plot using `ov.utils.stacking_vol()` with specified parameters.", - "code": "fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict,\n pval_threshold=0.01,\n log2fc_threshold=2,\n figsize=(8,4),\n sig_color='#a51616',\n normal_color='#c7c7c7',\n plot_genes_num=2,\n plot_genes_fontsize=6,\n plot_genes_weight='bold',\n )\n\n#The following code will be removed in future\ny_min,y_max=0,0\nfor i in data_dict.keys():\n y_min=min(y_min,data_dict[i]['logfoldchanges'].min())\n y_max=max(y_max,data_dict[i]['logfoldchanges'].max())\nfor i in adata.obs['leiden'].cat.categories:\n axes[i].set_ylim(y_min,y_max)\nplt.suptitle('Stacking_vol',fontsize=12) " - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_preprocess_cpu.json b/rag_engine/ovrawmjson/t_preprocess_cpu.json deleted file mode 100644 index eae87fbd..00000000 --- a/rag_engine/ovrawmjson/t_preprocess_cpu.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy and omicverse. 
Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Download and unpack the PBMC3k dataset from 10x Genomics.", - "code": "# !mkdir data\nget_ipython().system('wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz')\nget_ipython().system('cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz')\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data into an AnnData object.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading\nadata" - }, - { - "action": "Make variable and observation names unique.", - "code": "adata.var_names_make_unique()\nadata.obs_names_make_unique()" - }, - { - "action": "Perform quality control on the AnnData object.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.qc(adata,\\n tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},\\n doublets_method='sccomposite',\\n batch_key=None)\\nadata\\n\")" - }, - { - "action": "Preprocess the AnnData object, including normalization and highly variable gene detection.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Store the normalized and logarithmized raw gene expression in the .raw attribute of the AnnData object.", - "code": "get_ipython().run_cell_magic('time', '', 'adata.raw = adata\\nadata = adata[:, adata.var.highly_variable_features]\\nadata\\n')" - }, - { - "action": "Scale the data for principal component analysis.", - "code": "get_ipython().run_cell_magic('time', '', 'ov.pp.scale(adata)\\nadata\\n')" - }, - { - "action": "Perform principal 
component analysis (PCA) on the scaled data.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.pca(adata,layer='scaled',n_pcs=50)\\nadata\\n\")" - }, - { - "action": "Visualize the PCA embedding, coloring cells by the expression of the CST3 gene.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']\nov.pl.embedding(adata,\n basis='X_pca',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute the neighborhood graph of the cells.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\\n use_rep='scaled|original|X_pca')\\n\")" - }, - { - "action": "Embed the neighborhood graph using UMAP.", - "code": "get_ipython().run_cell_magic('time', '', 'ov.pp.umap(adata)\\n')" - }, - { - "action": "Visualize the UMAP embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color='CST3',\n frameon='small')" - }, - { - "action": "Calculate mde embeddings", - "code": "ov.pp.mde(adata,embedding_dim=2,n_neighbors=15, basis='X_mde',\n n_pcs=50, use_rep='scaled|original|X_pca',)" - }, - { - "action": "Visualize the mde embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color='CST3',\n frameon='small')" - }, - { - "action": "Score cell cycle genes in the AnnData object.", - "code": "adata_raw=adata.raw.to_adata()\nov.pp.score_genes_cell_cycle(adata_raw,species='human')" - }, - { - "action": "Visualize the mde embedding, coloring cells by cell cycle phase.", - "code": "ov.pl.embedding(adata_raw,\n basis='X_mde',\n color='phase',\n frameon='small')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph.", - "code": "ov.pp.leiden(adata,resolution=1)" - }, - { - "action": "Visualize the mde embedding, coloring cells by Leiden cluster, CST3 expression, and NKG7 expression.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden', 'CST3', 
'NKG7'],\n frameon='small')" - }, - { - "action": "Visualize specific clusters using a convex hull.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots( figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.ConvexHull(adata,\n basis='X_mde',\n cluster_key='leiden',\n hull_cluster='0',\n ax=ax)" - }, - { - "action": "Generate labels for the mde embedding, improving text overlap.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.utils.gen_mpl_labels(\n adata,\n 'leiden',\n exclude=(\"None\",), \n basis='X_mde',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize= 12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Define a list of marker genes.", - "code": "marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',\n 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',\n 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']" - }, - { - "action": "Create a dot plot of the marker genes, grouped by Leiden cluster.", - "code": "sc.pl.dotplot(adata, marker_genes, groupby='leiden',\n standard_scale='var');" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using a t-test.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='leiden_ttest')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_ttest',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Compute a ranking of differentially expressed genes for 
each Leiden cluster using the COSG method.", - "code": "sc.tl.rank_genes_groups(adata, groupby='leiden', \n method='t-test',use_rep='scaled|original|X_pca',)\nov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_cosg',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Prepare data for the Stacked Volcano Chart by creating a dictionary of DataFrames, each containing gene names, log fold changes, and adjusted p-values for a specific Leiden cluster.", - "code": "data_dict={}\nfor i in adata.obs['leiden'].cat.categories:\n data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest',\n pval_cutoff=None,log2fc_min=None)" - }, - { - "action": "Display the keys of data_dict", - "code": "data_dict.keys()" - }, - { - "action": "Show the head of the DataFrame for a specific cluster.", - "code": "data_dict[i].head()" - }, - { - "action": "Create a dictionary mapping Leiden cluster names to their corresponding colors.", - "code": "type_color_dict=dict(zip(adata.obs['leiden'].cat.categories,\n adata.uns['leiden_colors']))\ntype_color_dict" - }, - { - "action": "Generate and display a Stacked Volcano Chart.", - "code": "fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict,\n pval_threshold=0.01,\n log2fc_threshold=2,\n figsize=(8,4),\n sig_color='#a51616',\n normal_color='#c7c7c7',\n plot_genes_num=2,\n plot_genes_fontsize=6,\n plot_genes_weight='bold',\n )\n\n#The following code will be removed in future\ny_min,y_max=0,0\nfor i in data_dict.keys():\n y_min=min(y_min,data_dict[i]['logfoldchanges'].min())\n y_max=max(y_max,data_dict[i]['logfoldchanges'].max())\nfor i in adata.obs['leiden'].cat.categories:\n axes[i].set_ylim(y_min,y_max)\nplt.suptitle('Stacking_vol',fontsize=12) " - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_preprocess_gpu.json b/rag_engine/ovrawmjson/t_preprocess_gpu.json deleted file mode 100644 index 
f3940d6f..00000000 --- a/rag_engine/ovrawmjson/t_preprocess_gpu.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.plot_set()`. Initialize GPU settings using `ov.settings.gpu_init()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()\nov.settings.gpu_init()" - }, - { - "action": "Download and unpack the PBMC3k dataset from 10x Genomics.", - "code": "# !mkdir data\n#!wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n#!cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data into an AnnData object.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading\nadata" - }, - { - "action": "Make variable and observation names unique.", - "code": "adata.var_names_make_unique()\nadata.obs_names_make_unique()" - }, - { - "action": "Convert the AnnData object to a GPU-compatible format.", - "code": "ov.pp.anndata_to_GPU(adata)" - }, - { - "action": "Perform quality control on the AnnData object.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.qc(adata,\\n tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},\\n batch_key=None)\\nadata\\n\")" - }, - { - "action": "Preprocess the AnnData object, including normalization and highly variable gene detection.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Set the .raw attribute of the AnnData object to the normalized and logarithmized raw gene expression.", - "code": "adata.raw = adata\nadata = adata[:, 
adata.var.highly_variable_features]\nadata" - }, - { - "action": "Scale the data for principal component analysis.", - "code": "get_ipython().run_cell_magic('time', '', 'ov.pp.scale(adata)\\nadata\\n')" - }, - { - "action": "Perform principal component analysis (PCA) on the scaled data.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.pca(adata,layer='scaled',n_pcs=50)\\nadata\\n\")" - }, - { - "action": "Visualize the PCA embedding, coloring cells by the expression of the CST3 gene.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']\nov.utils.embedding(adata,\n basis='X_pca',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute the neighborhood graph of the cells using cagra method.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\\n use_rep='scaled|original|X_pca',method='cagra')\\n\")" - }, - { - "action": "Calculate mde embeddings", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])\nadata" - }, - { - "action": "Visualize the mde embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color='CST3',\n frameon='small')" - }, - { - "action": "Embed the neighborhood graph using UMAP.", - "code": "ov.pp.umap(adata)" - }, - { - "action": "Visualize the UMAP embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color='CST3',\n frameon='small')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph.", - "code": "ov.pp.leiden(adata)" - }, - { - "action": "Convert the AnnData object back to a CPU-compatible format.", - "code": "ov.pp.anndata_to_CPU(adata)" - }, - { - "action": "Visualize the mde embedding, coloring cells by Leiden cluster, CST3 expression, and NKG7 expression.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden', 'CST3', 'NKG7'],\n frameon='small')" - }, - { - 
"action": "Visualize specific clusters using a convex hull.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots( figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.ConvexHull(adata,\n basis='X_mde',\n cluster_key='leiden',\n hull_cluster='0',\n ax=ax)" - }, - { - "action": "Generate labels for the mde embedding, improving text overlap.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.utils.gen_mpl_labels(\n adata,\n 'leiden',\n exclude=(\"None\",), \n basis='X_mde',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize= 12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Define a list of marker genes.", - "code": "marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',\n 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',\n 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']" - }, - { - "action": "Create a dot plot of the marker genes, grouped by Leiden cluster.", - "code": "sc.pl.dotplot(adata, marker_genes, groupby='leiden',\n standard_scale='var');" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using a t-test.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='leiden_ttest')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_ttest',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using the COSG 
method.", - "code": "sc.tl.rank_genes_groups(adata, groupby='leiden', \n method='t-test',use_rep='scaled|original|X_pca',)\nov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_cosg',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Prepare data for the Stacked Volcano Chart by creating a dictionary of DataFrames, each containing gene names, log fold changes, and adjusted p-values for a specific Leiden cluster.", - "code": "data_dict={}\nfor i in adata.obs['leiden'].cat.categories:\n data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest',\n pval_cutoff=None,log2fc_min=None)" - }, - { - "action": "Display the keys of data_dict", - "code": "data_dict.keys()" - }, - { - "action": "Show the head of the DataFrame for a specific cluster.", - "code": "data_dict[i].head()" - }, - { - "action": "Create a dictionary mapping Leiden cluster names to their corresponding colors.", - "code": "type_color_dict=dict(zip(adata.obs['leiden'].cat.categories,\n adata.uns['leiden_colors']))\ntype_color_dict" - }, - { - "action": "Generate and display a Stacked Volcano Chart.", - "code": "fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict,\n pval_threshold=0.01,\n log2fc_threshold=2,\n figsize=(8,4),\n sig_color='#a51616',\n normal_color='#c7c7c7',\n plot_genes_num=2,\n plot_genes_fontsize=6,\n plot_genes_weight='bold',\n )\n\n#The following code will be removed in future\ny_min,y_max=0,0\nfor i in data_dict.keys():\n y_min=min(y_min,data_dict[i]['logfoldchanges'].min())\n y_max=max(y_max,data_dict[i]['logfoldchanges'].max())\nfor i in adata.obs['leiden'].cat.categories:\n axes[i].set_ylim(y_min,y_max)\nplt.suptitle('Stacking_vol',fontsize=12) " - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_scdeg.json b/rag_engine/ovrawmjson/t_scdeg.json deleted file mode 100644 index 5580f874..00000000 --- a/rag_engine/ovrawmjson/t_scdeg.json 
+++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\n\nov.utils.ov_plot_set()" - }, - { - "action": "Load the pancreas dataset using `scv.datasets.pancreas()`.", - "code": "adata = scv.datasets.pancreas()\nadata" - }, - { - "action": "Check the maximum value in the `adata.X` matrix.", - "code": "adata.X.max()" - }, - { - "action": "Perform quality control, normalization, and calculate highly variable genes (HVGs). Save the whole genes and filter non-HVGs. Scale the `adata.X` matrix and perform dimensionality reduction using PCA.", - "code": "#quantity control\nadata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})\n#normalize and high variable genes (HVGs) calculated\nadata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\n\n#save the whole genes and filter the non-HVGs\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\n\n#scale the adata.X\nov.pp.scale(adata)\n\n#Dimensionality Reduction\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Check the maximum value in the scaled `adata.X` matrix.", - "code": "adata.X.max()" - }, - { - "action": "Select target cells ('Alpha' and 'Beta') for analysis, derive the expression matrix using `to_df()`, and build the differential expression analysis module using `pyDEG`.", - "code": "test_adata=adata[adata.obs['clusters'].isin(['Alpha','Beta'])]\ntest_adata\n\n\ndds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T)" - }, - { - "action": "Drop duplicate indices in the `dds` object.", - "code": "dds.drop_duplicates_index()\nprint('... 
drop_duplicates_index success')" - }, - { - "action": "Set up treatment and control groups based on cell types ('Alpha' and 'Beta') and perform differential expression analysis using the t-test method.", - "code": "treatment_groups=test_adata.obs[test_adata.obs['clusters']=='Alpha'].index.tolist()\ncontrol_groups=test_adata.obs[test_adata.obs['clusters']=='Beta'].index.tolist()\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')" - }, - { - "action": "Display the top differentially expressed genes sorted by q-value.", - "code": "result.sort_values('qvalue').head()" - }, - { - "action": "Set fold change threshold, p-value threshold, and maximum -log10(p-value) for visualization.", - "code": "# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=10)" - }, - { - "action": "Plot a volcano plot of the differential expression analysis results.", - "code": "dds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot box plots for specific genes ('Irx1' and 'Adra2a') in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Irx1','Adra2a'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Visualize cell clusters and gene expression ('Irx1' and 'Adra2a') on a UMAP embedding.", - "code": "ov.utils.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=['clusters','Irx1','Adra2a'])" - }, - { - "action": "Create a MetaCell object using `ov.single.MetaCell` for constructing metacells.", - "code": "meta_obj=ov.single.MetaCell(adata,use_rep='scaled|original|X_pca',n_metacells=150,\n use_gpu=True)" - }, - { - "action": "Initialize archetypes for the MetaCell object.", - "code": "meta_obj.initialize_archetypes()" - }, - { - "action": "Train the MetaCell model.", - "code": "meta_obj.train(min_iter=10, max_iter=50)" - }, - { - 
"action": "Save the trained MetaCell model.", - "code": "meta_obj.save('seacells/model.pkl')" - }, - { - "action": "Load a saved MetaCell model.", - "code": "meta_obj.load('seacells/model.pkl')" - }, - { - "action": "Predict metacells using the trained model with the 'soft' method and summarize the 'lognorm' layer.", - "code": "ad=meta_obj.predicted(method='soft',celltype_label='clusters',\n summarize_layer='lognorm')" - }, - { - "action": "Check the minimum and maximum values of the predicted metacell matrix.", - "code": "ad.X.min(),ad.X.max()" - }, - { - "action": "Plot the metacells on the UMAP embedding of the original data.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\nov.utils.embedding(\n meta_obj.adata,\n basis=\"X_umap\",\n color=['clusters'],\n frameon='small',\n title=\"Meta cells\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n size=10,\n ax=ax,\n alpha=0.2,\n #legend_loc='', \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n #palette=ov.utils.blue_color[:],\n #legend_fontweight='normal'\n)\nov.single._metacell.plot_metacells(ax,meta_obj.adata,color='#CB3E35',\n )" - }, - { - "action": "Select metacells of types 'Alpha' and 'Beta' for differential expression analysis.", - "code": "test_adata=ad[ad.obs['celltype'].isin(['Alpha','Beta'])]\ntest_adata" - }, - { - "action": "Create a `pyDEG` object for differential expression analysis using metacell data.", - "code": "dds_meta=ov.bulk.pyDEG(test_adata.to_df().T)" - }, - { - "action": "Drop duplicate indices in the `dds_meta` object.", - "code": "dds_meta.drop_duplicates_index()\nprint('... 
drop_duplicates_index success')" - }, - { - "action": "Set up treatment and control groups based on metacell types ('Alpha' and 'Beta') and perform differential expression analysis using the t-test method.", - "code": "treatment_groups=test_adata.obs[test_adata.obs['celltype']=='Alpha'].index.tolist()\ncontrol_groups=test_adata.obs[test_adata.obs['celltype']=='Beta'].index.tolist()\nresult=dds_meta.deg_analysis(treatment_groups,control_groups,method='ttest')" - }, - { - "action": "Display the top differentially expressed genes in metacells sorted by q-value.", - "code": "result.sort_values('qvalue').head()" - }, - { - "action": "Set fold change threshold, p-value threshold, and maximum -log10(p-value) for visualization in metacell analysis.", - "code": "# -1 means automatically calculates\ndds_meta.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=10)" - }, - { - "action": "Plot a volcano plot of the differential expression analysis results for metacells.", - "code": "dds_meta.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot box plots for specific genes ('Ctxn2' and 'Mnx1') in the treatment and control metacell groups.", - "code": "dds_meta.plot_boxplot(genes=['Ctxn2','Mnx1'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Visualize cell clusters and gene expression ('Ctxn2' and 'Mnx1') on a UMAP embedding for the original data.", - "code": "ov.utils.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=['clusters','Ctxn2','Mnx1'])" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_scdrug.json b/rag_engine/ovrawmjson/t_scdrug.json deleted file mode 100644 index c62be4e4..00000000 --- a/rag_engine/ovrawmjson/t_scdrug.json +++ /dev/null @@ -1,74 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, infercnvpy, and matplotlib. 
Set plotting parameters and verbosity level.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport infercnvpy as cnv\nimport matplotlib.pyplot as plt\nimport os\n\nsc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\nsc.settings.set_figure_params(dpi=80, facecolor='white')" - }, - { - "action": "Load the maynard2020_3k dataset using infercnvpy's datasets module.", - "code": "adata = cnv.datasets.maynard2020_3k()" - }, - { - "action": "Annotate gene coordinates using a GTF file. This step adds chromosomal location information to the AnnData object.", - "code": "ov.utils.get_gene_annotation(\n adata, gtf=\"gencode.v43.basic.annotation.gtf.gz\",\n gtf_by=\"gene_name\"\n)" - }, - { - "action": "Filter out genes with missing chromosome information and add chromosome, start, end, and ensg information to the var attribute of the AnnData object.", - "code": "adata=adata[:,~adata.var['chrom'].isnull()]\nadata.var['chromosome']=adata.var['chrom']\nadata.var['start']=adata.var['chromStart']\nadata.var['end']=adata.var['chromEnd']\nadata.var['ensg']=adata.var['gene_id']\nadata.var.loc[:, [\"ensg\", \"chromosome\", \"start\", \"end\"]].head()" - }, - { - "action": "Display the structure and content of the AnnData object, showing the number of cells, genes, and other associated data.", - "code": "adata" - }, - { - "action": "Infer copy number variations (CNVs) using infercnvpy. 
This step identifies potential tumor cells based on CNV profiles.", - "code": "# We provide all immune cell types as \"normal cells\".\ncnv.tl.infercnv(\n adata,\n reference_key=\"cell_type\",\n reference_cat=[\n \"B cell\",\n \"Macrophage\",\n \"Mast cell\",\n \"Monocyte\",\n \"NK cell\",\n \"Plasma cell\",\n \"T cell CD4\",\n \"T cell CD8\",\n \"T cell regulatory\",\n \"mDC\",\n \"pDC\",\n ],\n window_size=250,\n)\ncnv.tl.pca(adata)\ncnv.pp.neighbors(adata)\ncnv.tl.leiden(adata)\ncnv.tl.umap(adata)\ncnv.tl.cnv_score(adata)" - }, - { - "action": "Visualize the CNV score on a UMAP plot. This helps in identifying cells with high CNV scores, which are likely tumor cells.", - "code": "sc.pl.umap(adata, color=\"cnv_score\", show=False)" - }, - { - "action": "Annotate cells as 'normal' or 'tumor' based on their CNV score. A threshold of 0.03 is used to classify cells as tumor.", - "code": "adata.obs[\"cnv_status\"] = \"normal\"\nadata.obs.loc[\n adata.obs[\"cnv_score\"]>0.03, \"cnv_status\"\n] = \"tumor\"" - }, - { - "action": "Visualize the CNV status ('normal' or 'tumor') on a UMAP plot.", - "code": "sc.pl.umap(adata, color=\"cnv_status\", show=False)" - }, - { - "action": "Subset the AnnData object to include only tumor cells for further analysis.", - "code": "tumor=adata[adata.obs['cnv_status']=='tumor']\ntumor.X.max()" - }, - { - "action": "Preprocess the tumor AnnData object. 
This includes filtering cells and genes, identifying mitochondrial genes, calculating QC metrics, and identifying highly variable genes.", - "code": "adata=tumor\nprint('Preprocessing...')\nsc.pp.filter_cells(adata, min_genes=200)\nsc.pp.filter_genes(adata, min_cells=3)\nadata.var['mt'] = adata.var_names.str.startswith('MT-')\nsc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\nif not (adata.obs.pct_counts_mt == 0).all():\n adata = adata[adata.obs.pct_counts_mt < 30, :]\n\nadata.raw = adata.copy()\n\nsc.pp.highly_variable_genes(adata)\nadata = adata[:, adata.var.highly_variable]\nsc.pp.scale(adata)\nsc.tl.pca(adata, svd_solver='arpack')" - }, - { - "action": "Perform dimensionality reduction using PCA and UMAP, and compute nearest neighbors for clustering.", - "code": "sc.pp.neighbors(adata, n_pcs=20)\nsc.tl.umap(adata)" - }, - { - "action": "Download necessary data for drug response prediction, including the GDSC drug database and CaDRReS model.", - "code": "ov.utils.download_GDSC_data()\nov.utils.download_CaDRReS_model()" - }, - { - "action": "Apply single-cell data analysis to perform sub-clustering on the tumor clusters at an automatically determined resolution.", - "code": "adata, res,plot_df = ov.single.autoResolution(adata,cpus=4)" - }, - { - "action": "Save the AnnData object to an H5AD file.", - "code": "results_file = os.path.join('./', 'scanpyobj.h5ad')\nadata.write(results_file)" - }, - { - "action": "Reload the AnnData object from the H5AD file.", - "code": "results_file = os.path.join('./', 'scanpyobj.h5ad')\nadata=sc.read(results_file)" - }, - { - "action": "Clone the CaDRReS-Sc repository from GitHub. This repository contains the code for drug response prediction.", - "code": "get_ipython().system('git clone https://github.com/CSB5/CaDRReS-Sc')" - }, - { - "action": "Initialize and run the drug response prediction using the `ov.single.Drug_Response` function. 
This step predicts the IC50 values for each cell cluster.", - "code": "import ov\njob=ov.single.Drug_Response(adata,scriptpath='CaDRReS-Sc',\n modelpath='models/',\n output='result')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_scmulan.json b/rag_engine/ovrawmjson/t_scmulan.json deleted file mode 100644 index f8a94de0..00000000 --- a/rag_engine/ovrawmjson/t_scmulan.json +++ /dev/null @@ -1,82 +0,0 @@ -[ - { - "action": "Import necessary libraries and set plotting parameters.", - "code": "import os\nimport scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Load the liver dataset from an h5ad file.", - "code": "adata = sc.read('./data/liver_test.h5ad')" - }, - { - "action": "Display the AnnData object to inspect its structure.", - "code": "adata" - }, - { - "action": "Convert the sparse matrix format of `adata.X` to Compressed Sparse Column format for compatibility.", - "code": "from scipy.sparse import csc_matrix\nadata.X = csc_matrix(adata.X)" - }, - { - "action": "Transform the gene symbols in the input AnnData object to a uniform set of 42117 gene symbols, matching the pre-trained scMulan model.", - "code": "adata_GS_uniformed = ov.externel.scMulan.GeneSymbolUniform(input_adata=adata,\n output_dir=\"./data\",\n output_prefix='liver')" - }, - { - "action": "Load the uniformed AnnData object from the saved file.", - "code": "adata_GS_uniformed=sc.read_h5ad('./data/liver_uniformed.h5ad')" - }, - { - "action": "Display the uniformed AnnData object.", - "code": "adata_GS_uniformed" - }, - { - "action": "Normalize and log-transform the count matrix if the maximum value is greater than 10.", - "code": "if adata_GS_uniformed.X.max() > 10:\n sc.pp.normalize_total(adata_GS_uniformed, target_sum=1e4) \n sc.pp.log1p(adata_GS_uniformed)" - }, - { - "action": "Specify the path to the pre-trained scMulan model checkpoint.", - "code": "ckp_path = './ckpt/ckpt_scMulan.pt'" - }, - { - "action": "Initialize the scMulan model for 
inference and prepare it for CUDA processing.", - "code": "scml = ov.externel.scMulan.model_inference(ckp_path, adata_GS_uniformed)\nbase_process = scml.cuda_count()" - }, - { - "action": "Predict cell types and obtain cell embeddings using the scMulan model, with optional parallel processing.", - "code": "scml.get_cell_types_and_embds_for_adata(parallel=True, n_process = 1)" - }, - { - "action": "Copy the AnnData object with scMulan results for further analysis.", - "code": "adata_mulan = scml.adata.copy()" - }, - { - "action": "Scale the data, perform PCA, and then compute a 2-D embedding using pyMDE for visualization.", - "code": "ov.pp.scale(adata_mulan)\nov.pp.pca(adata_mulan)\nov.pp.mde(adata_mulan,embedding_dim=2,n_neighbors=15, basis='X_mde',\n n_pcs=10, use_rep='scaled|original|X_pca',)" - }, - { - "action": "Visualize the cell type annotations from scMulan using the computed 2-D embedding.", - "code": "ov.pl.embedding(adata_mulan,basis='X_mde',\n color=[\"cell_type_from_scMulan\",],\n ncols=1,frameon='small')" - }, - { - "action": "Copy the 'X_mde' embeddings to 'X_umap' for compatibility with other functions.", - "code": "adata_mulan.obsm['X_umap']=adata_mulan.obsm['X_mde']" - }, - { - "action": "Apply a smoothing function to filter false positives in the cell type predictions.", - "code": "ov.externel.scMulan.cell_type_smoothing(adata_mulan, threshold=0.1)" - }, - { - "action": "Visualize both the smoothed cell type predictions and the original annotations on the 2-D embedding.", - "code": "ov.pl.embedding(adata_mulan,basis='X_mde',\n color=[\"cell_type_from_mulan_smoothing\",\"cell_type\"],\n ncols=1,frameon='small')" - }, - { - "action": "Display the AnnData object with smoothed cell type annotations.", - "code": "adata_mulan" - }, - { - "action": "Get the top 20 most frequent cell types from scMulan's predictions.", - "code": "top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20]" - }, - { - "action": "Visualize selected 
cell types on the UMAP embedding, optionally with smoothing.", - "code": "selected_cell_types = top_celltypes\nov.externel.scMulan.visualize_selected_cell_types(adata_mulan,selected_cell_types,smoothing=True)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_simba.json b/rag_engine/ovrawmjson/t_simba.json deleted file mode 100644 index b13d90e5..00000000 --- a/rag_engine/ovrawmjson/t_simba.json +++ /dev/null @@ -1,50 +0,0 @@ -[ - { - "action": "Import necessary libraries and set up the working directory. `ov.utils.ov_plot_set()` sets default plotting parameters.", - "code": "import omicverse as ov\nfrom omicverse.utils import mde\nworkdir = 'result_human_pancreas'\nov.utils.ov_plot_set()" - }, - { - "action": "Installation instructions for SIMBA, a tool for single-cell data integration and batch correction.", - "code": "# We need to install simba at first\n# \n# ```\n# conda install -c bioconda simba\n# ```\n# \n# or\n# \n# ```\n# pip install git+https://github.com/huidongchen/simba\n# pip install git+https://github.com/pinellolab/simba_pbg\n# ```" - }, - { - "action": "Read the combined AnnData object from a file. This object contains three scRNA-seq human pancreas datasets.", - "code": "adata=ov.utils.read('simba_adata_raw.h5ad')" - }, - { - "action": "Initialize a pySIMBA object with the AnnData object and the working directory.", - "code": "simba_object=ov.single.pySIMBA(adata,workdir)" - }, - { - "action": "Preprocess the data using default parameters. This includes filtering cells, normalizing library sizes, selecting highly variable genes, and binning genes.", - "code": "simba_object.preprocess(batch_key='batch',min_n_cells=3,\n method='lib_size',n_top_genes=3000,n_bins=5)" - }, - { - "action": "Generate a graph for training. 
The graph represents cells and genes as nodes, with edges connecting them based on relationships in the data.", - "code": "simba_object.gen_graph()" - }, - { - "action": "Train the PyTorch BigGraph (PBG) model using the generated graph. The `num_workers` parameter specifies the number of CPU cores to use for training.", - "code": "simba_object.train(num_workers=6)" - }, - { - "action": "Load a pre-trained model from a specified directory.", - "code": "simba_object.load('result_human_pancreas/pbg/graph0')" - }, - { - "action": "Perform batch correction using the `batch_correction()` method. This aligns the datasets to reduce batch effects.", - "code": "adata=simba_object.batch_correction()\nadata" - }, - { - "action": "Visualize the batch-corrected data using Minimum Distortion Embedding (MDE) instead of UMAP.", - "code": "adata.obsm[\"X_mde\"] = mde(adata.obsm[\"X_simba\"])" - }, - { - "action": "Plot the MDE visualization, coloring cells by cell type and batch.", - "code": "sc.pl.embedding(adata,basis='X_mde',color=['cell_type1','batch'])" - }, - { - "action": "Visualize the batch-corrected data using UMAP.", - "code": "import scanpy as sc\nsc.pp.neighbors(adata, use_rep=\"X_simba\")\nsc.tl.umap(adata)\nsc.pl.umap(adata,color=['cell_type1','batch'])" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_single_batch.json b/rag_engine/ovrawmjson/t_single_batch.json deleted file mode 100644 index 230e4890..00000000 --- a/rag_engine/ovrawmjson/t_single_batch.json +++ /dev/null @@ -1,138 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. 
Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Read individual datasets (s1d3, s2d1, s3d7) from H5AD files, assigning batch labels.", - "code": "adata1=ov.read('neurips2021_s1d3.h5ad')\nadata1.obs['batch']='s1d3'\nadata2=ov.read('neurips2021_s2d1.h5ad')\nadata2.obs['batch']='s2d1'\nadata3=ov.read('neurips2021_s3d7.h5ad')\nadata3.obs['batch']='s3d7'" - }, - { - "action": "Concatenate the three AnnData objects into a single object, merging common variables.", - "code": "adata=sc.concat([adata1,adata2,adata3],merge='same')\nadata" - }, - { - "action": "Display the unique batch labels present in the combined dataset.", - "code": "adata.obs['batch'].unique()" - }, - { - "action": "Convert the data type of the `.X` attribute (gene expression matrix) to `np.int64`.", - "code": "import numpy as np\nadata.X=adata.X.astype(np.int64)" - }, - { - "action": "Perform quality control (QC) on the AnnData object, filtering cells based on mitochondrial percentage, number of UMIs, and detected genes. 
Considers batch information during QC.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},\n batch_key='batch')\nadata" - }, - { - "action": "Preprocess the data using shiftlog and pearson normalization, selecting the top 3000 highly variable genes (HVGs).", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',\n n_HVGs=3000,batch_key=None)\nadata" - }, - { - "action": "Store the raw counts in `adata.raw` and subset the data to include only highly variable genes.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Save the preprocessed data to an H5AD file with gzip compression.", - "code": "adata.write_h5ad('neurips2021_batch_normlog.h5ad',compression='gzip')" - }, - { - "action": "Scale the data, perform Principal Component Analysis (PCA) on the scaled data, and compute Minimum Distortion Embedding (MDE) based on the PCA results.", - "code": "ov.pp.scale(adata)\nov.pp.pca(adata,layer='scaled',n_pcs=50,mask_var='highly_variable_features')\n\nadata.obsm[\"X_mde_pca\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])" - }, - { - "action": "Visualize the data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_pca',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using Harmony, specifying 'batch' as the batch key and using 50 principal components.", - "code": "adata_harmony=ov.single.batch_correction(adata,batch_key='batch',\n methods='harmony',n_pcs=50)\nadata" - }, - { - "action": "Compute MDE based on the Harmony-corrected data.", - "code": "adata.obsm[\"X_mde_harmony\"] = ov.utils.mde(adata.obsm[\"X_harmony\"])" - }, - { - "action": "Visualize the Harmony-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_harmony',frameon='small',\n 
color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using Combat, specifying 'batch' as the batch key and using 50 principal components.", - "code": "adata_combat=ov.single.batch_correction(adata,batch_key='batch',\n methods='combat',n_pcs=50)\nadata" - }, - { - "action": "Compute MDE based on the Combat-corrected data.", - "code": "adata.obsm[\"X_mde_combat\"] = ov.utils.mde(adata.obsm[\"X_combat\"])" - }, - { - "action": "Visualize the Combat-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_combat',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using Scanorama, specifying 'batch' as the batch key and using 50 principal components.", - "code": "adata_scanorama=ov.single.batch_correction(adata,batch_key='batch',\n methods='scanorama',n_pcs=50)\nadata" - }, - { - "action": "Compute MDE based on the Scanorama-corrected data.", - "code": "adata.obsm[\"X_mde_scanorama\"] = ov.utils.mde(adata.obsm[\"X_scanorama\"])" - }, - { - "action": "Visualize the Scanorama-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_scanorama',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using scVI, specifying 'batch' as the batch key, using 2 layers, 30 latent dimensions, and the negative binomial gene likelihood.", - "code": "adata_scvi=ov.single.batch_correction(adata,batch_key='batch',\n methods='scVI',n_layers=2, n_latent=30, gene_likelihood=\"nb\")\nadata" - }, - { - "action": "Compute MDE based on the scVI-corrected data.", - "code": "adata.obsm[\"X_mde_scVI\"] = ov.utils.mde(adata.obsm[\"X_scVI\"])" - }, - { - "action": "Visualize the scVI-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n 
basis='X_mde_scVI',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Initialize an LDA topic model object, specifying the feature type, highly variable key, layers, batch key, and learning rate.", - "code": "LDA_obj=ov.utils.LDA_topic(adata,feature_type='expression',\n highly_variable_key='highly_variable_features',\n layers='counts',batch_key='batch',learning_rate=1e-3)" - }, - { - "action": "Plot the topic contributions for topic 6.", - "code": "LDA_obj.plot_topic_contributions(6)" - }, - { - "action": "Predict topic compositions for 15 topics.", - "code": "LDA_obj.predicted(15)" - }, - { - "action": "Compute MDE based on the topic compositions and feature embeddings from the MIRA model.", - "code": "adata.obsm[\"X_mde_mira_topic\"] = ov.utils.mde(adata.obsm[\"X_topic_compositions\"])\nadata.obsm[\"X_mde_mira_feature\"] = ov.utils.mde(adata.obsm[\"X_umap_features\"])" - }, - { - "action": "Visualize the MIRA topic-based data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_mira_topic',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Visualize the MIRA feature-based data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_mira_feature',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Save the AnnData object with all batch correction results to an H5AD file with gzip compression.", - "code": "adata.write_h5ad('neurips2021_batch_all.h5ad',compression='gzip')" - }, - { - "action": "Reload the saved AnnData object from the H5AD file.", - "code": "adata=sc.read('neurips2021_batch_all.h5ad')" - }, - { - "action": "Copy specific embeddings to the `.obsm` attribute for benchmarking.", - "code": 
"adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'].copy()\nadata.obsm['X_mira_topic']=adata.obsm['X_topic_compositions'].copy()\nadata.obsm['X_mira_feature']=adata.obsm['X_umap_features'].copy()" - }, - { - "action": "Initialize and run a Benchmarker object from the `scib_metrics` package to evaluate the performance of different batch correction methods.", - "code": "from scib_metrics.benchmark import Benchmarker\nbm = Benchmarker(\n adata,\n batch_key=\"batch\",\n label_key=\"cell_type\",\n embedding_obsm_keys=[\"X_pca\", \"X_combat\", \"X_harmony\",\n 'X_scanorama','X_mira_topic','X_mira_feature','X_scVI'],\n n_jobs=8,\n)\nbm.benchmark()" - }, - { - "action": "Plot the benchmarking results as a table.", - "code": "bm.plot_results_table(min_max_scale=False)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_slat.json b/rag_engine/ovrawmjson/t_slat.json deleted file mode 100644 index a2dba742..00000000 --- a/rag_engine/ovrawmjson/t_slat.json +++ /dev/null @@ -1,130 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, os, scanpy, numpy, pandas, and torch. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport os\n\nimport scanpy as sc\nimport numpy as np\nimport pandas as pd\nimport torch\nov.plot_set()" - }, - { - "action": "Import specific functions and classes from the `omicverse.externel.scSLAT` module. 
These include functions for data loading, spatial network calculation, SLAT execution, scanpy workflow, spatial matching, visualization tools (e.g., 3D matching, histograms, Sankey diagrams), and region statistics.", - "code": "#import scSLAT\nfrom omicverse.externel.scSLAT.model import load_anndatas, Cal_Spatial_Net, run_SLAT, scanpy_workflow, spatial_match\nfrom omicverse.externel.scSLAT.viz import match_3D_multi, hist, Sankey, match_3D_celltype, Sankey,Sankey_multi,build_3D\nfrom omicverse.externel.scSLAT.metrics import region_statistics" - }, - { - "action": "Load two AnnData objects from H5AD files. `adata1` represents the E11.5 mouse embryo dataset, and `adata2` represents the E12.5 mouse embryo dataset.", - "code": "adata1 = sc.read_h5ad('data/E115_Stereo.h5ad')\nadata2 = sc.read_h5ad('data/E125_Stereo.h5ad')" - }, - { - "action": "Add a 'week' column to the observation metadata (`.obs`) of each AnnData object, indicating the developmental stage (E11.5 or E12.5).", - "code": "adata1.obs['week']='E11.5'\nadata2.obs['week']='E12.5'" - }, - { - "action": "Generate spatial plots for `adata1` and `adata2`, coloring the spots by the 'annotation' variable and setting the spot size to 3.", - "code": "sc.pl.spatial(adata1, color='annotation', spot_size=3)\nsc.pl.spatial(adata2, color='annotation', spot_size=3)" - }, - { - "action": "Calculate spatial networks for `adata1` and `adata2` using the KNN model with a k_cutoff of 20. Load the AnnData objects into a format suitable for SLAT, using 'DPCA' as the feature representation and ensuring the order of features is not checked.", - "code": "Cal_Spatial_Net(adata1, k_cutoff=20, model='KNN')\nCal_Spatial_Net(adata2, k_cutoff=20, model='KNN')\nedges, features = load_anndatas([adata1, adata2], feature='DPCA', check_order=False)" - }, - { - "action": "Run the SLAT algorithm with the specified features and edges. The `LGCN_layer` parameter is set to 5. 
The function returns embeddings for each dataset (`embd0`, `embd1`) and the computation time.", - "code": "embd0, embd1, time = run_SLAT(features, edges, LGCN_layer=5)" - }, - { - "action": "Perform spatial matching between the embeddings `embd0` and `embd1`. The `reorder` parameter is set to False, and the original AnnData objects are provided. The function returns the best match indices, the index array, and the distances between matched points.", - "code": "best, index, distance = spatial_match([embd0, embd1], reorder=False, adatas=[adata1,adata2])" - }, - { - "action": "Create a matching array from the best match indices. Calculate region statistics for the best matches, starting from 0.5 with 10 intervals.", - "code": "matching = np.array([range(index.shape[0]), best])\nbest_match = distance[:,0]\nregion_statistics(best_match, start=0.5, number_of_interval=10)" - }, - { - "action": "Import the `matplotlib.pyplot` module. Build a 3D model from `adata1` and `adata2` using the provided matching list. The model is subsampled to 300 points. The `draw_3D` function visualizes the model with specified parameters.", - "code": "import matplotlib.pyplot as plt\nmatching_list=[matching]\nmodel = build_3D([adata1,adata2], matching_list,subsample_size=300, )\nax=model.draw_3D(hide_axis=True, line_color='#c2c2c2', height=1, size=[6,6], line_width=1)" - }, - { - "action": "Add a 'low_quality_index' column to `adata2.obs`, representing the quality of the alignment. 
Convert the column to float type.", - "code": "adata2.obs['low_quality_index']= best_match\nadata2.obs['low_quality_index'] = adata2.obs['low_quality_index'].astype(float)" - }, - { - "action": "Display the spatial coordinates stored in `adata2.obsm['spatial']`.", - "code": "adata2.obsm['spatial']" - }, - { - "action": "Generate a spatial plot for `adata2`, coloring the spots by the 'low_quality_index' variable, setting the spot size to 3, and adding the title 'Quality'.", - "code": "sc.pl.spatial(adata2, color='low_quality_index', spot_size=3, title='Quality')" - }, - { - "action": "Generate a Sankey diagram to visualize the correspondence between cell types in `adata1` and `adata2`. The diagram is customized with various parameters, including node and link opacity, layout, font size, and color. The `return_fig` parameter is set to True to return the figure object.", - "code": "fig=Sankey_multi(adata_li=[adata1,adata2],\n prefix_li=['E11.5','E12.5'],\n matching_li=[matching],\n clusters='annotation',filter_num=10,\n node_opacity = 0.8,\n link_opacity = 0.2,\n layout=[800,500],\n font_size=12,\n font_color='Black',\n save_name=None,\n format='png',\n width=1200,\n height=1000,\n return_fig=True)\nfig.show()" - }, - { - "action": "Save the generated Sankey diagram as an HTML file named \"slat_sankey.html\".", - "code": "fig.write_html(\"slat_sankey.html\")" - }, - { - "action": "Create DataFrames (`adata1_df`, `adata2_df`) from the AnnData objects, including spatial coordinates, cell type annotations, and corresponding colors. 
The colors are mapped from the `.uns` attribute of each AnnData object.", - "code": "color_dict1=dict(zip(adata1.obs['annotation'].cat.categories,\n adata1.uns['annotation_colors'].tolist()))\nadata1_df = pd.DataFrame({'index':range(embd0.shape[0]),\n 'x': adata1.obsm['spatial'][:,0],\n 'y': adata1.obsm['spatial'][:,1],\n 'celltype':adata1.obs['annotation'],\n 'color':adata1.obs['annotation'].map(color_dict1)\n }\n )\ncolor_dict2=dict(zip(adata2.obs['annotation'].cat.categories,\n adata2.uns['annotation_colors'].tolist()))\nadata2_df = pd.DataFrame({'index':range(embd1.shape[0]),\n 'x': adata2.obsm['spatial'][:,0],\n 'y': adata2.obsm['spatial'][:,1],\n 'celltype':adata2.obs['annotation'],\n 'color':adata2.obs['annotation'].map(color_dict2)\n }\n )" - }, - { - "action": "Use the `match_3D_celltype` function to visualize the alignment of specific cell types ('Urogenital ridge', 'Kidney', and 'Ovary') between `adata1` and `adata2`. The visualization is customized with parameters for subsampling, highlighting, and coordinate scaling. The `draw_3D` function then displays the 3D alignment.", - "code": "kidney_align = match_3D_celltype(adata1_df, adata2_df, matching, meta='celltype', \n highlight_celltype = [['Urogenital ridge'],['Kidney','Ovary']],\n subsample_size=10000, highlight_line = ['blue'], scale_coordinate = True )\nkidney_align.draw_3D(size= [6, 6], line_width =0.8, point_size=[0.6,0.6], hide_axis=True)" - }, - { - "action": "Define a function `cal_matching_cell` to find the cells in `target_adata` that are matched to a specific `query_cell` type in `query_adata` based on the provided `matching` information. 
The function returns a subset of `target_adata` containing the matched cells.", - "code": "def cal_matching_cell(target_adata,query_adata,matching,query_cell,clusters='annotation',):\n adata1_df = pd.DataFrame({'index':range(target_adata.shape[0]),\n 'x': target_adata.obsm['spatial'][:,0],\n 'y': target_adata.obsm['spatial'][:,1],\n 'celltype':target_adata.obs[clusters]})\n adata2_df = pd.DataFrame({'index':range(query_adata.shape[0]),\n 'x': query_adata.obsm['spatial'][:,0],\n 'y': query_adata.obsm['spatial'][:,1],\n 'celltype':query_adata.obs[clusters]})\n query_adata = target_adata[matching[1,adata2_df.loc[adata2_df.celltype==query_cell,'index'].values],:]\n #adata2_df['target_celltype'] = adata1_df.iloc[matching[1,:],:]['celltype'].to_list()\n #adata2_df['target_obs_names'] = adata1_df.iloc[matching[1,:],:].index.to_list()\n \n #query_obs=adata2_df.loc[adata2_df['celltype']==query_cell,'target_obs_names'].tolist()\n return query_adata" - }, - { - "action": "Call the `cal_matching_cell` function to find the cells in `adata1` that match the 'Kidney' cells in `adata2`. The result is stored in `query_adata`.", - "code": "query_adata=cal_matching_cell(target_adata=adata1,\n query_adata=adata2,\n matching=matching,\n query_cell='Kidney',clusters='annotation')\nquery_adata" - }, - { - "action": "Add a 'kidney_anno' column to `adata1.obs` and assign the 'annotation' values from `query_adata` to the corresponding cells in `adata1`.", - "code": "adata1.obs['kidney_anno']=''\nadata1.obs.loc[query_adata.obs.index,'kidney_anno']=query_adata.obs['annotation']" - }, - { - "action": "Generate a spatial plot for `adata1`, coloring the spots by the 'kidney_anno' variable. A custom palette is used to highlight specific annotations.", - "code": "sc.pl.spatial(adata1, color='kidney_anno', spot_size=3,\n palette=['#F5F5F5','#ff7f0e', 'green',])" - }, - { - "action": "Concatenate `query_adata` and the 'Kidney' cells from `adata2` into a new AnnData object `kidney_lineage_ad`. 
Preprocess the combined data using `ov.pp.preprocess`, selecting the top 3000 highly variable genes and normalizing the data. Store the raw data in `.raw`, select highly variable genes, scale the data, perform PCA, compute a nearest neighbor graph, cluster the data using Leiden clustering, and compute UMAP embeddings.", - "code": "kidney_lineage_ad=sc.concat([query_adata,adata2[adata2.obs['annotation']=='Kidney']],merge='same')\nkidney_lineage_ad=ov.pp.preprocess(kidney_lineage_ad,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4)\nkidney_lineage_ad.raw = kidney_lineage_ad\nkidney_lineage_ad = kidney_lineage_ad[:, kidney_lineage_ad.var.highly_variable_features]\nov.pp.scale(kidney_lineage_ad)\nov.pp.pca(kidney_lineage_ad)\nov.pp.neighbors(kidney_lineage_ad,use_rep='scaled|original|X_pca',metric=\"cosine\")\nov.utils.cluster(kidney_lineage_ad,method='leiden',resolution=1)\nov.pp.umap(kidney_lineage_ad)" - }, - { - "action": "Generate UMAP plots for `kidney_lineage_ad`, coloring the cells by 'annotation', 'week', and 'leiden' clustering.", - "code": "ov.pl.embedding(kidney_lineage_ad,basis='X_umap',\n color=['annotation','week','leiden'],\n frameon='small')" - }, - { - "action": "Generate a dot plot showing the expression of specific genes associated with nephron progenitors and metanephric/kidney development in the 'leiden' clusters of `kidney_lineage_ad`. The dot plot is customized with a color bar title and without a dendrogram.", - "code": "sc.pl.dotplot(kidney_lineage_ad,{'nephron progenitors':['Wnt9b','Osr1','Nphs1','Lhx1','Pax2','Pax8'],\n 'metanephric':['Eya1','Shisa3','Foxc1'], \n 'kidney':['Wt1','Wnt4','Nr2f2','Dach1','Cd44']} ,\n 'leiden',dendrogram=False,colorbar_title='Expression')" - }, - { - "action": "Re-annotate the 'leiden' clusters in `kidney_lineage_ad.obs` based on their developmental stage and cluster identity. Clusters 4, 2, 3, 1, and 5 are labeled as 'Nephron progenitors (E11.5)' and 'Metanephron progenitors (E11.5)', respectively. 
Cluster 0 is labeled as 'Kidney (E12.5)'.", - "code": "kidney_lineage_ad.obs['re_anno'] = 'Unknown'\nkidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden.isin(['4']),'re_anno'] = 'Nephron progenitors (E11.5)'\nkidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden.isin(['2','3','1','5']),'re_anno'] = 'Metanephron progenitors (E11.5)'\nkidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden=='0','re_anno'] = 'Kidney (E12.5)'" - }, - { - "action": "Generate UMAP plots for `kidney_lineage_ad`, coloring the cells by 'annotation' and the newly assigned 're_anno' labels.", - "code": "kidney_lineage_ad.obs.leiden = list(kidney_lineage_ad.obs.leiden)\nov.pl.embedding(kidney_lineage_ad,basis='X_umap',\n color=['annotation','re_anno'],\n frameon='small')" - }, - { - "action": "Assign the 're_anno' labels from `kidney_lineage_ad` (specifically the E11.5 cells) to the corresponding cells in `adata1.obs['kidney_anno']`.", - "code": "adata1.obs['kidney_anno']=''\nadata1.obs.loc[kidney_lineage_ad[kidney_lineage_ad.obs['week']=='E11.5'].obs.index,'kidney_anno']=kidney_lineage_ad[kidney_lineage_ad.obs['week']=='E11.5'].obs['re_anno']" - }, - { - "action": "Generate a spatial plot for `adata1`, coloring the spots by the 'kidney_anno' variable. A custom palette is used to highlight specific annotations, and the plot is displayed with a specified figure size.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(1, 1, figsize=(8, 8))\nsc.pl.spatial(adata1, color='kidney_anno', spot_size=1.5,\n palette=['#F5F5F5','#ff7f0e', 'green',],show=False,ax=ax)" - }, - { - "action": "Perform differential expression analysis between E12.5 and E11.5 cells in `kidney_lineage_ad` using `ov.bulk.pyDEG`. The analysis uses the t-test method and sets thresholds for fold change, p-value, and -log10(p-value). 
A volcano plot is generated to visualize the results, highlighting the top 8 differentially expressed genes.", - "code": "test_adata=kidney_lineage_ad\ndds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T)\ndds.drop_duplicates_index()\nprint('... drop_duplicates_index success')\ntreatment_groups=test_adata.obs[test_adata.obs['week']=='E12.5'].index.tolist()\ncontrol_groups=test_adata.obs[test_adata.obs['week']=='E11.5'].index.tolist()\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')\n# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=10)\n\n\ndds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Extract the top 3 up-regulated and down-regulated genes from the differential expression analysis results based on q-value. Combine these genes into a single list `deg_gene`.", - "code": "up_gene=dds.result.loc[dds.result['sig']=='up'].sort_values('qvalue')[:3].index.tolist()\ndown_gene=dds.result.loc[dds.result['sig']=='down'].sort_values('qvalue')[:3].index.tolist()\ndeg_gene=up_gene+down_gene" - }, - { - "action": "Generate a dot plot showing the expression of the differentially expressed genes (`deg_gene`) in the 're_anno' groups of `kidney_lineage_ad`.", - "code": "sc.pl.dotplot(kidney_lineage_ad,deg_gene,\n groupby='re_anno')" - }, - { - "action": "Calculate a dendrogram for `kidney_lineage_ad` based on the 're_anno' groups and the specified representation. Perform a t-test to rank genes based on their differential expression between the 're_anno' groups. 
Generate a dot plot showing the top 3 ranked genes for each group, using a specified color map and scaling method.", - "code": "sc.tl.dendrogram(kidney_lineage_ad,'re_anno',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(kidney_lineage_ad, 're_anno', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='re_anno_ttest')\nsc.pl.rank_genes_groups_dotplot(kidney_lineage_ad,groupby='re_anno',\n cmap='RdBu_r',key='re_anno_ttest',\n standard_scale='var',n_genes=3)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_spaceflow.json b/rag_engine/ovrawmjson/t_spaceflow.json deleted file mode 100644 index ab57d05e..00000000 --- a/rag_engine/ovrawmjson/t_spaceflow.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from a specified path and file, and make variable names unique.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics, filter genes with total counts less than 100, identify spatially variable genes using the 'prost' method, and subset the AnnData object to include only spatially variable features.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)\nadata.raw = adata\nadata = adata[:, adata.var.space_variable_features]\nadata" - }, - { - "action": "Read ground truth annotations from a text file and assign them to the 'Ground Truth' column in the observation metadata of the AnnData object. 
Visualize the spatial distribution of the ground truth annotations.", - "code": "import pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0)\nAnn_df.columns = ['Ground Truth']\nadata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])" - }, - { - "action": "Initialize a SpaceFlow object using the AnnData object.", - "code": "sf_obj=ov.space.pySpaceFlow(adata)" - }, - { - "action": "Train the SpaceFlow model with specified parameters for spatial regularization, embedding dimension, learning rate, epochs, patience, stopping criteria, random seed, GPU usage, and regularization acceleration.", - "code": "sf_obj.train(spatial_regularization_strength=0.1, \n z_dim=50, lr=1e-3, epochs=1000, \n max_patience=50, min_stop=100, \n random_seed=42, gpu=0, \n regularization_acceleration=True, edge_subset_sz=1000000)" - }, - { - "action": "Calculate the Pseudo-Spatial Map (pSM) using the trained SpaceFlow model with specified parameters for the number of neighbors, resolution, maximum cells for subsampling, and the key to store the pSM results.", - "code": "sf_obj.cal_pSM(n_neighbors=20,resolution=1,\n max_cell_for_subsampling=5000,psm_key='pSM_spaceflow')" - }, - { - "action": "Visualize the spatial distribution of the calculated pSM and the ground truth annotations.", - "code": "sc.pl.spatial(adata, color=['pSM_spaceflow','Ground Truth'],cmap='RdBu_r')" - }, - { - "action": "Cluster the spatial data using Gaussian Mixture Model (GMM) with specified parameters for the number of components, covariance type, tolerance, maximum iterations, and random state, using the 'spaceflow' representation.", - "code": "ov.utils.cluster(adata,use_rep='spaceflow',method='GMM',n_components=7,covariance_type='full',\n tol=1e-9, max_iter=1000, random_state=3607)" - }, - { - "action": "Visualize the spatial distribution of the GMM clusters and the 
ground truth annotations.", - "code": "sc.pl.spatial(adata, color=['gmm_cluster',\"Ground Truth\"])" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_stagate.json b/rag_engine/ovrawmjson/t_stagate.json deleted file mode 100644 index c400f9a2..00000000 --- a/rag_engine/ovrawmjson/t_stagate.json +++ /dev/null @@ -1,90 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from a directory, specifying the path and count file. Ensure unique variable names.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics and filter genes with total counts less than 100. Identify spatially variable genes (SVGs) using the `prost` method, targeting 3000 SVGs and setting the target sum for normalization to 1e4.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)" - }, - { - "action": "Write the processed AnnData object to an H5AD file with gzip compression.", - "code": "adata.write('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "Read ground truth annotations from a tab-separated file, assign them to the AnnData object, and visualize the spatial distribution of the ground truth labels.", - "code": "import pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\\t', header=None, index_col=0)\nAnn_df.columns = ['Ground Truth']\nadata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])" - }, - { - "action": "Define a GraphST model with the AnnData 
object and specify the device for computation.", - "code": "model = ov.externel.GraphST.GraphST(adata, device='cuda:0')" - }, - { - "action": "Train the GraphST model, specifying the number of principal components (n_pcs) to use.", - "code": "adata = model.train(n_pcs=30)" - }, - { - "action": "Cluster the spatial data using the `mclust` method with specified parameters, including the number of components and model name. Refine the cluster labels using `ov.utils.refine_label`.", - "code": "ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust',n_components=10,\n modelNames='EEV', random_state=112,\n )\nadata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust')" - }, - { - "action": "Compute a neighborhood graph using the specified representation and cluster the data using `louvain` and `leiden` methods. Refine the cluster labels using `ov.utils.refine_label`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20,\n use_rep='graphst|original|X_pca')\nov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='louvain',resolution=0.7)\nov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='leiden',resolution=0.7)\nadata.obs['louvain_GraphST'] = ov.utils.refine_label(adata, radius=50, key='louvain') \nadata.obs['leiden_GraphST'] = ov.utils.refine_label(adata, radius=50, key='leiden')" - }, - { - "action": "Visualize the spatial distribution of the cluster labels obtained from `mclust`, `leiden`, and `louvain` methods, along with the ground truth.", - "code": "sc.pl.spatial(adata, color=['mclust_GraphST','leiden_GraphST',\n 'louvain_GraphST',\"Ground Truth\"])" - }, - { - "action": "Assign spatial coordinates from `adata.obsm['spatial']` to `adata.obs['X']` and `adata.obs['Y']`.", - "code": "adata.obs['X'] = adata.obsm['spatial'][:,0]\nadata.obs['Y'] = adata.obsm['spatial'][:,1]" - }, - { - "action": "Construct a STAGATE object with specified parameters, including the number of batches, spatial keys, radius 
cutoff, number of epochs, learning rate, weight decay, and hidden dimensions.", - "code": "STA_obj=ov.space.pySTAGATE(adata,num_batch_x=3,num_batch_y=2,\n spatial_key=['X','Y'],rad_cutoff=200,num_epoch = 1000,lr=0.001,\n weight_decay=1e-4,hidden_dims = [512, 30],\n device='cuda:0')" - }, - { - "action": "Train the STAGATE model.", - "code": "STA_obj.train()" - }, - { - "action": "Predict latent embeddings and denoised expressions using the trained STAGATE model.", - "code": "STA_obj.predicted()" - }, - { - "action": "Cluster the spatial data using the `mclust` method on the STAGATE embeddings and refine the cluster labels.", - "code": "ov.utils.cluster(adata,use_rep='STAGATE',method='mclust',n_components=8,\n modelNames='EEV', random_state=112,\n )\nadata.obs['mclust_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='mclust')" - }, - { - "action": "Compute a neighborhood graph using the STAGATE embeddings and cluster the data using `louvain` and `leiden` methods. Refine the cluster labels.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20,\n use_rep='STAGATE')\nov.utils.cluster(adata,use_rep='STAGATE',method='louvain',resolution=0.5)\nov.utils.cluster(adata,use_rep='STAGATE',method='leiden',resolution=0.5)\nadata.obs['louvain_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='louvain') \nadata.obs['leiden_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='leiden')" - }, - { - "action": "Visualize the spatial distribution of the cluster labels obtained from `mclust`, `leiden`, and `louvain` methods on the STAGATE embeddings, along with the ground truth.", - "code": "sc.pl.spatial(adata, color=['mclust_STAGATE','leiden_STAGATE',\n 'louvain_STAGATE',\"Ground Truth\"])" - }, - { - "action": "Sort genes by their spatial information score (PI) in descending order and display the top 10 genes.", - "code": "adata.var.sort_values('PI',ascending=False).head(10)" - }, - { - "action": "Plot the spatial expression of a specific gene (e.g., 'MBP') in 
both raw and denoised (STAGATE) forms.", - "code": "plot_gene = 'MBP'\nimport matplotlib.pyplot as plt\nfig, axs = plt.subplots(1, 2, figsize=(8, 4))\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[0], title='RAW_'+plot_gene, vmax='p99')\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[1], title='STAGATE_'+plot_gene, layer='STAGATE_ReX', vmax='p99')" - }, - { - "action": "Calculate the pseudo-spatial map (pSM) using the STAGATE model with specified parameters.", - "code": "STA_obj.cal_pSM(n_neighbors=20,resolution=1,\n max_cell_for_subsampling=5000)" - }, - { - "action": "Visualize the spatial distribution of the ground truth and the calculated pSM.", - "code": "sc.pl.spatial(adata, color=['Ground Truth','pSM_STAGATE'],\n cmap='RdBu_r')" - }, - { - "action": "Evaluate the clustering performance using the Adjusted Rand Index (ARI) for different clustering methods and models (GraphST and STAGATE) compared to the ground truth.", - "code": "from sklearn.metrics.cluster import adjusted_rand_score\n\nobs_df = adata.obs.dropna()\n#GraphST\nARI = adjusted_rand_score(obs_df['mclust_GraphST'], obs_df['Ground Truth'])\nprint('mclust_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['leiden_GraphST'], obs_df['Ground Truth'])\nprint('leiden_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['louvain_GraphST'], obs_df['Ground Truth'])\nprint('louvain_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_STAGATE'], obs_df['Ground Truth'])\nprint('mclust_STAGATE: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['leiden_STAGATE'], obs_df['Ground Truth'])\nprint('leiden_STAGATE: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['louvain_STAGATE'], obs_df['Ground Truth'])\nprint('louvain_STAGATE: Adjusted rand index = %.2f' %ARI)" - } -] \ No newline at end of file diff --git 
a/rag_engine/ovrawmjson/t_staligner.json b/rag_engine/ovrawmjson/t_staligner.json deleted file mode 100644 index 3136035c..00000000 --- a/rag_engine/ovrawmjson/t_staligner.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "action": "Import necessary libraries: scipy.sparse, omicverse, scanpy, anndata, pandas, and os. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "from scipy.sparse import csr_matrix\nimport omicverse as ov\nimport scanpy as sc\nimport anndata as ad\nimport pandas as pd\nimport os\n\nov.utils.ov_plot_set()" - }, - { - "action": "Initialize empty lists `Batch_list` and `adj_list` to store AnnData objects and adjacency matrices, respectively. Define `section_ids` list containing the names of the datasets to be processed. Set the `pathway` variable to the directory containing the data files.", - "code": "Batch_list = []\nadj_list = []\nsection_ids = ['Slide-seqV2_MoB', 'Stereo-seq_MoB']\nprint(section_ids)\npathway = '/storage/zengjianyangLab/hulei/scRNA-seq/scripts/STAligner'" - }, - { - "action": "Iterate through each `section_id` in `section_ids`. Load the corresponding h5ad file into an AnnData object `adata`. Check if `adata.X` is a pandas DataFrame and convert it to a sparse matrix if necessary. Make variable names unique. Prepend `section_id` to each observation name to ensure uniqueness across datasets.", - "code": "for section_id in section_ids:\n print(section_id)\n adata = sc.read_h5ad(os.path.join(pathway,section_id+\".h5ad\"))\n\n # check whether the adata.X is sparse matrix\n if isinstance(adata.X, pd.DataFrame):\n adata.X = csr_matrix(adata.X)\n else:\n pass\n\n adata.var_names_make_unique(join=\"++\")\n\n # make spot name unique\n adata.obs_names = [x+'_'+section_id for x in adata.obs_names]" - }, - { - "action": "Construct the spatial network using `ov.space.Cal_Spatial_Net` with a specified `rad_cutoff`. 
Perform normalization by selecting highly variable genes using `sc.pp.highly_variable_genes`, normalizing total counts with `sc.pp.normalize_total`, and applying log transformation with `sc.pp.log1p`. Subset `adata` to include only highly variable genes. Append the adjacency matrix and the processed `adata` to `adj_list` and `Batch_list`, respectively.", - "code": " # Constructing the spatial network\n ov.space.Cal_Spatial_Net(adata, rad_cutoff=50) # the spatial network are saved in adata.uns[‘adj’]\n\n # Normalization\n sc.pp.highly_variable_genes(adata, flavor=\"seurat_v3\", n_top_genes=10000)\n sc.pp.normalize_total(adata, target_sum=1e4)\n sc.pp.log1p(adata)\n\n adata = adata[:, adata.var['highly_variable']]\n adj_list.append(adata.uns['adj'])\n Batch_list.append(adata)" - }, - { - "action": "Print the `Batch_list` which now contains the processed AnnData objects for each dataset.", - "code": "Batch_list" - }, - { - "action": "Concatenate the AnnData objects in `Batch_list` into a single AnnData object `adata_concat`. Assign `slice_name` as the label for concatenation and use `section_ids` as keys. Add a new column `batch_name` to `adata_concat.obs` and set it as a categorical variable with the same values as `slice_name`. Print the shape of the concatenated AnnData object.", - "code": "adata_concat = ad.concat(Batch_list, label=\"slice_name\", keys=section_ids)\nadata_concat.obs[\"batch_name\"] = adata_concat.obs[\"slice_name\"].astype('category')\nprint('adata_concat.shape: ', adata_concat.shape)" - }, - { - "action": "Train the STAligner model using the `ov.space.pySTAligner` function. Set parameters for the model, including the number of nearest neighbors (`knn_neigh`), number of epochs (`n_epochs`), integration order (`iter_comb`), batch key (`batch_key`), and the key to add the results (`key_added`). 
Also, pass the list of AnnData objects (`Batch_list`) to the function.", - "code": "get_ipython().run_cell_magic('time', '', \"# iter_comb is used to specify the order of integration. For example, (0, 1) means slice 0 will be algined with slice 1 as reference.\\niter_comb = [(i, i + 1) for i in range(len(section_ids) - 1)]\\n\\n# Here, to reduce GPU memory usage, each slice is considered as a subgraph for training.\\nSTAligner_obj = ov.space.pySTAligner(adata_concat, verbose=True, knn_neigh = 100, n_epochs = 600, iter_comb = iter_comb,\\n batch_key = 'batch_name', key_added='STAligner', Batch_list = Batch_list)\\n\")" - }, - { - "action": "Train the STAligner model by calling the `train()` method on the `STAligner_obj`.", - "code": "STAligner_obj.train()" - }, - { - "action": "Retrieve the predicted AnnData object with the latent embedding stored in `adata.obsm['STAligner']` by calling the `predicted()` method on the `STAligner_obj`.", - "code": "adata = STAligner_obj.predicted()" - }, - { - "action": "Compute the neighbor graph using the 'STAligner' representation with `sc.pp.neighbors`. Perform clustering using the Leiden algorithm with `ov.utils.cluster` and a specified resolution. Calculate UMAP embeddings with `sc.tl.umap`. Visualize the UMAP embeddings colored by 'batch_name' and 'leiden' clusters using `sc.pl.umap`.", - "code": "sc.pp.neighbors(adata, use_rep='STAligner', random_state=666)\nov.utils.cluster(adata,use_rep='STAligner',method='leiden',resolution=0.4)\nsc.tl.umap(adata, random_state=666)\nsc.pl.umap(adata, color=['batch_name',\"leiden\"],wspace=0.5)" - }, - { - "action": "Create a spatial plot of the clustering results. Define `spot_size` and `title_size` for plot aesthetics. Generate a subplot with two axes. Plot the spatial distribution of 'leiden' clusters for 'Slide-seqV2_MoB' and 'Stereo-seq_MoB' datasets using `sc.pl.spatial`. 
Adjust the title size and invert the y-axis for the 'Stereo-seq' plot.", - "code": "import matplotlib.pyplot as plt\nspot_size = 50\ntitle_size = 15\nfig, ax = plt.subplots(1, 2, figsize=(6, 3), gridspec_kw={'wspace': 0.05, 'hspace': 0.2})\n_sc_0 = sc.pl.spatial(adata[adata.obs['batch_name'] == 'Slide-seqV2_MoB'], img_key=None, color=['leiden'], title=['Slide-seqV2'],\n legend_fontsize=10, show=False, ax=ax[0], frameon=False, spot_size=spot_size, legend_loc=None)\n_sc_0[0].set_title('Slide-seqV2', size=title_size)\n\n_sc_1 = sc.pl.spatial(adata[adata.obs['batch_name'] == 'Stereo-seq_MoB'], img_key=None, color=['leiden'], title=['Stereo-seq'],\n legend_fontsize=10, show=False, ax=ax[1], frameon=False, spot_size=spot_size)\n_sc_1[0].set_title('Stereo-seq',size=title_size)\n_sc_1[0].invert_yaxis()\nplt.show()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_starfysh.json b/rag_engine/ovrawmjson/t_starfysh.json deleted file mode 100644 index 0e9a3ce8..00000000 --- a/rag_engine/ovrawmjson/t_starfysh.json +++ /dev/null @@ -1,126 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy and omicverse. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Import specific modules from the omicverse package related to Starfysh, including Archetypal Analysis (AA), utility functions (utils), plotting utilities (plot_utils), and post-analysis tools (post_analysis). 
Also, import the Starfysh model itself (_starfysh).", - "code": "from omicverse.externel.starfysh import (AA, utils, plot_utils, post_analysis)\nfrom omicverse.externel.starfysh import _starfysh as sf_model" - }, - { - "action": "Define file paths for the data, sample ID, and signature gene set file name.", - "code": "# Specify data paths\ndata_path = 'data/star_data'\nsample_id = 'CID44971_TNBC'\nsig_name = 'bc_signatures_version_1013.csv'" - }, - { - "action": "Load spatial transcriptomics data and signature gene sets using utility functions. The `load_adata` function reads the data, and `filter_gene_sig` filters the signature gene sets to include only genes present in the spatial transcriptomics data.", - "code": "# Load expression counts and signature gene sets\nadata, adata_normed = utils.load_adata(data_folder=data_path,\n sample_id=sample_id, # sample id\n n_genes=2000 # number of highly variable genes to keep\n )" - }, - { - "action": "Import pandas and os libraries. Read the signature gene sets from a CSV file into a pandas DataFrame. Filter the gene signatures to keep only those genes that are also present in the spatial transcriptomics data.", - "code": "import pandas as pd\nimport os\ngene_sig = pd.read_csv(os.path.join(data_path, sig_name))\ngene_sig = utils.filter_gene_sig(gene_sig, adata.to_df())\ngene_sig.head()" - }, - { - "action": "Load spatial information and preprocess the histology image associated with the spatial transcriptomics data. This includes reading the image, extracting mapping information, and calculating scale factors.", - "code": "# Load spatial information\nimg_metadata = utils.preprocess_img(data_path,\n sample_id,\n adata_index=adata.obs.index,\n #hchannel=False\n )\nimg, map_info, scalefactor = img_metadata['img'], img_metadata['map_info'], img_metadata['scalefactor']\numap_df = utils.get_umap(adata, display=True)" - }, - { - "action": "Import the matplotlib.pyplot module for plotting. 
Create a new figure and display the histology image using `imshow`.", - "code": "import matplotlib.pyplot as plt\nplt.figure(figsize=(6, 6), dpi=80)\nplt.imshow(img)" - }, - { - "action": "Display the first few rows of the `map_info` DataFrame, which contains spatial mapping information for the spots in the spatial transcriptomics data.", - "code": "map_info.head()" - }, - { - "action": "Prepare arguments for the Visium data processing, including raw and normalized count data, filtered signature genes, image metadata, number of anchor spots, window size for spatial smoothing, and sample ID.", - "code": "# Parameters for training\nvisium_args = utils.VisiumArguments(adata,\n adata_normed,\n gene_sig,\n img_metadata,\n n_anchors=60,\n window_size=3,\n sample_id=sample_id\n )\n\nadata, adata_normed = visium_args.get_adata()\nanchors_df = visium_args.get_anchors()" - }, - { - "action": "Add new columns to the `adata.obs` DataFrame for log-transformed library size and windowed log-transformed library size, which are calculated during the Visium data processing.", - "code": "adata.obs['log library size']=visium_args.log_lib\nadata.obs['windowed log library size']=visium_args.win_loglib" - }, - { - "action": "Use scanpy's `sc.pl.spatial` function to visualize the log library size on the spatial map. The plot is colored using the 'magma' colormap, and the size of the spots is adjusted for better visualization.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the windowed log library size on the spatial map using `sc.pl.spatial`. 
This plot shows the spatially smoothed library size, which can help in understanding the spatial distribution of sequencing depth.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='windowed log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the raw gene expression of the 'IL7R' gene on the spatial map using `sc.pl.spatial`. This plot helps in understanding the spatial expression pattern of a specific gene.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='IL7R',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Plot the anchor spots identified during the preprocessing step. Anchor spots are locations in the tissue that are representative of specific cell types. The plot shows the UMAP representation of the data with anchor spots highlighted.", - "code": "plot_utils.plot_anchor_spots(umap_df,\n visium_args.pure_spots,\n visium_args.sig_mean,\n bbox_x=2\n )" - }, - { - "action": "Initialize the Archetypal Analysis (AA) model using the normalized spatial transcriptomics data. Compute archetypes, which represent the 'purest' cell types in the data. Find archetypal spots and marker genes associated with each archetype. Assign archetypes to the closest anchor spots and identify distant archetypes that may represent novel cell types or states.", - "code": "aa_model = AA.ArchetypalAnalysis(adata_orig=adata_normed)\narchetype, arche_dict, major_idx, evs = aa_model.compute_archetypes(cn=40)\n# (1). Find archetypal spots & archetypal clusters\narche_df = aa_model.find_archetypal_spots(major=True)\n\n# (2). Find marker genes associated with each archetypal cluster\nmarkers_df = aa_model.find_markers(n_markers=30, display=False)\n\n# (3). 
Map archetypes to closest anchors (1-1 per cell type)\nmap_df, map_dict = aa_model.assign_archetypes(anchors_df)\n\n# (4). Optional: Find the most distant archetypes that are not assigned to any annotated cell types\ndistant_arches = aa_model.find_distant_archetypes(anchors_df, n=3)" - }, - { - "action": "Plot the explained variances (evs) from the Archetypal Analysis to help determine the optimal number of archetypes. The plot shows the cumulative explained variance as a function of the number of archetypes.", - "code": "plot_utils.plot_evs(evs, kmin=aa_model.kmin)" - }, - { - "action": "Visualize the archetypes in a 2D UMAP representation. The plot shows the distribution of archetypes and their relationships to each other.", - "code": "aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False)" - }, - { - "action": "Visualize the mapping between archetypes and cell types. This plot helps in understanding how the identified archetypes correspond to known cell types based on the anchor spots.", - "code": "aa_model.plot_mapping(map_df)" - }, - { - "action": "Refine the anchor spots by appending marker genes from the best-aligned archetypes. This step updates the signature genes and anchor spots based on the Archetypal Analysis results.", - "code": "visium_args = utils.refine_anchors(\n visium_args,\n aa_model,\n #thld=0.7, # alignment threshold\n n_genes=5,\n #n_iters=1\n)\n\n# Get updated adata & signatures\nadata, adata_normed = visium_args.get_adata()\ngene_sig = visium_args.gene_sig\ncell_types = gene_sig.columns" - }, - { - "action": "Import the torch library. Set the number of random restarts for model training, the number of epochs, and the patience for early stopping. Define the device for model training (CPU or GPU).", - "code": "import torch\nn_repeats = 3\nepochs = 200\npatience = 50\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - }, - { - "action": "Train the Starfysh model without histology integration. 
The `run_starfysh` function performs the model training with the specified parameters and returns the trained model and the training loss.", - "code": "# Run models\nmodel, loss = utils.run_starfysh(visium_args,\n n_repeats=n_repeats,\n epochs=epochs,\n #patience=patience,\n device=device\n )" - }, - { - "action": "Evaluate the trained Starfysh model and obtain inference and generative outputs. The `model_eval` function processes the model outputs and returns the inferred cell type proportions and other relevant parameters.", - "code": "adata, adata_normed = visium_args.get_adata()\ninference_outputs, generative_outputs,adata_ = sf_model.model_eval(model,\n adata,\n visium_args,\n poe=False,\n device=device)" - }, - { - "action": "Import numpy. Select a random cell type index and use the `gene_mean_vs_inferred_prop` function to compare the signature gene mean expression with the inferred cell type proportion for the selected cell type.", - "code": "import numpy as np\nn_cell_types = gene_sig.shape[1]\nidx = np.random.randint(0, n_cell_types)\npost_analysis.gene_mean_vs_inferred_prop(inference_outputs,\n visium_args,\n idx=idx,\n figsize=(4,4)\n )" - }, - { - "action": "Use the `pl_spatial_inf_feature` function to visualize an inferred feature (e.g., 'ql_m') on the spatial map. 
The plot is colored using the 'Blues' colormap.", - "code": "plot_utils.pl_spatial_inf_feature(adata_, feature='ql_m', cmap='Blues')" - }, - { - "action": "Define a function `cell2proportion` to extract cell type proportion data from the `adata_` object and create a new AnnData object (`adata_plot`) for plotting.", - "code": "def cell2proportion(adata):\n adata_plot=sc.AnnData(adata.X)\n adata_plot.obs=utils.extract_feature(adata_, 'qc_m').obs.copy()\n adata_plot.var=adata.var.copy()\n adata_plot.obsm=adata.obsm.copy()\n adata_plot.obsp=adata.obsp.copy()\n adata_plot.uns=adata.uns.copy()\n return adata_plot\nadata_plot=cell2proportion(adata_)" - }, - { - "action": "Display a summary of the `adata_plot` object, which contains the cell type proportion data.", - "code": "adata_plot" - }, - { - "action": "Visualize the inferred cell type proportions for specific cell types ('Basal', 'LumA', 'LumB') on the spatial map using `sc.pl.spatial`. The plot is colored using the 'Spectral_r' colormap, and the color scale is limited to the 90th percentile.", - "code": "sc.pl.spatial(adata_plot, cmap='Spectral_r',\n # show first 8 cell types\n color=['Basal','LumA','LumB'],\n ncols=4, size=1.3,\n img_key='hires',\n vmin=0, vmax='p90'\n )" - }, - { - "action": "Use `ov.pl.embedding` to visualize the cell type proportions in a 2D UMAP representation. The plot shows the distribution of cell types ('Basal', 'LumA', 'MBC', 'Normal epithelial') and their relationships to each other.", - "code": "ov.pl.embedding(adata_plot,\n basis='z_umap',\n color=['Basal', 'LumA', 'MBC', 'Normal epithelial'],\n frameon='small',\n vmin=0, vmax='p90',\n cmap='Spectral_r',\n )" - }, - { - "action": "Calculate the predicted expression of specific genes in each cell type using the `model_ct_exp` function. 
This function processes the model outputs and returns the predicted gene expression values.", - "code": "pred_exprs = sf_model.model_ct_exp(model,\n adata,\n visium_args,\n device=device)" - }, - { - "action": "Select a specific gene ('IL7R') and cell type ('Tem'). Add a new layer to the `adata_` object with the predicted expression values for the selected gene in the selected cell type. Visualize the predicted expression on the spatial map using `sc.pl.spatial`.", - "code": "gene='IL7R'\ngene_celltype='Tem'\nadata_.layers[f'infer_{gene_celltype}']=pred_exprs[gene_celltype]\n\nsc.pl.spatial(adata_, cmap='Spectral_r',\n # show first 8 cell types\n color=gene,\n title=f'{gene} (Predicted expression)\\n{gene_celltype}',\n layer=f'infer_{gene_celltype}',\n ncols=4, size=1.3,\n img_key='hires',\n #vmin=0, vmax='p90'\n )" - }, - { - "action": "Specify the output directory for saving the model and inferred parameters. Create the directory if it doesn't exist. Save the trained Starfysh model's state dictionary to a .pt file. Save the `adata` object with inferred parameters to a .h5ad file.", - "code": "# Specify output directory\noutdir = './results/'\nif not os.path.exists(outdir):\n os.mkdir(outdir)\n\n# save the model\ntorch.save(model.state_dict(), os.path.join(outdir, 'starfysh_model.pt'))\n\n# save `adata` object with inferred parameters\nadata.write(os.path.join(outdir, 'st.h5ad'))" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_stt.json b/rag_engine/ovrawmjson/t_stt.json deleted file mode 100644 index 31b451c4..00000000 --- a/rag_engine/ovrawmjson/t_stt.json +++ /dev/null @@ -1,218 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy and omicverse. 
Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Import specific modules from the omicverse.external.starfysh subpackage: AA (Archetypal Analysis), utils (utility functions), plot_utils (plotting utilities), post_analysis (post-analysis functions), and _starfysh (Starfysh model).", - "code": "from omicverse.externel.starfysh import (AA, utils, plot_utils, post_analysis)\nfrom omicverse.externel.starfysh import _starfysh as sf_model" - }, - { - "action": "Define file paths for the data, sample ID, and signature gene set file name.", - "code": "# Specify data paths\ndata_path = 'data/star_data'\nsample_id = 'CID44971_TNBC'\nsig_name = 'bc_signatures_version_1013.csv'" - }, - { - "action": "Load expression count data and signature gene sets using custom utility functions. The `load_adata` function reads spatial transcriptomics data, and `filter_gene_sig` filters the signature gene sets based on the expression data.", - "code": "# Load expression counts and signature gene sets\nadata, adata_normed = utils.load_adata(data_folder=data_path,\n sample_id=sample_id, # sample id\n n_genes=2000 # number of highly variable genes to keep\n )" - }, - { - "action": "Import pandas and os libraries. Load signature gene sets from a CSV file into a pandas DataFrame. Filter the gene signatures to include only genes present in the expression data.", - "code": "import pandas as pd\nimport os\ngene_sig = pd.read_csv(os.path.join(data_path, sig_name))\ngene_sig = utils.filter_gene_sig(gene_sig, adata.to_df())\ngene_sig.head()" - }, - { - "action": "Load and preprocess spatial information associated with the expression data. This includes image data, mapping information, and scaling factors. 
Calculate a UMAP representation of the data for visualization.", - "code": "# Load spatial information\nimg_metadata = utils.preprocess_img(data_path,\n sample_id,\n adata_index=adata.obs.index,\n #hchannel=False\n )\nimg, map_info, scalefactor = img_metadata['img'], img_metadata['map_info'], img_metadata['scalefactor']\numap_df = utils.get_umap(adata, display=True)" - }, - { - "action": "Import the matplotlib.pyplot module for plotting. Create a new figure and display the image data loaded in the previous step.", - "code": "import matplotlib.pyplot as plt\nplt.figure(figsize=(6, 6), dpi=80)\nplt.imshow(img)" - }, - { - "action": "Display the first few rows of the `map_info` DataFrame, which contains spatial mapping information.", - "code": "map_info.head()" - }, - { - "action": "Define parameters for Starfysh model training using the `VisiumArguments` class. This includes raw and normalized expression data, filtered signature genes, image metadata, number of anchor spots, window size for spatial smoothing, and sample ID. Prepare the AnnData objects and calculate anchor spots.", - "code": "# Parameters for training\nvisium_args = utils.VisiumArguments(adata,\n adata_normed,\n gene_sig,\n img_metadata,\n n_anchors=60,\n window_size=3,\n sample_id=sample_id\n )\n\nadata, adata_normed = visium_args.get_adata()\nanchors_df = visium_args.get_anchors()" - }, - { - "action": "Add log-transformed library size and windowed log-transformed library size to the observation metadata of the `adata` object.", - "code": "adata.obs['log library size']=visium_args.log_lib\nadata.obs['windowed log library size']=visium_args.win_loglib" - }, - { - "action": "Use scanpy's `sc.pl.spatial` function to visualize the log library size on a spatial map. 
The plot is colored using the 'magma' colormap.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the windowed log library size on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'magma' colormap.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='windowed log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the raw gene expression of the gene 'IL7R' on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'magma' colormap.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='IL7R',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Plot anchor spots on a UMAP representation of the data using the `plot_anchor_spots` function from `plot_utils`. This visualization helps to identify the locations of anchor spots for each cell type.", - "code": "plot_utils.plot_anchor_spots(umap_df,\n visium_args.pure_spots,\n visium_args.sig_mean,\n bbox_x=2\n )" - }, - { - "action": "Initialize an Archetypal Analysis (AA) model using the normalized AnnData object. Compute archetypes, find archetypal spots and clusters, define marker genes for each archetypal cluster, map archetypes to the closest anchor spots, and optionally find distant archetypes not assigned to any annotated cell types.", - "code": "aa_model = AA.ArchetypalAnalysis(adata_orig=adata_normed)\narchetype, arche_dict, major_idx, evs = aa_model.compute_archetypes(cn=40)\n# (1). 
Find archetypal spots & archetypal clusters\narche_df = aa_model.find_archetypal_spots(major=True)\n\n# (2). Find marker genes associated with each archetypal cluster\nmarkers_df = aa_model.find_markers(n_markers=30, display=False)\n\n# (3). Map archetypes to closest anchors (1-1 per cell type)\nmap_df, map_dict = aa_model.assign_archetypes(anchors_df)\n\n# (4). Optional: Find the most distant archetypes that are not assigned to any annotated cell types\ndistant_arches = aa_model.find_distant_archetypes(anchors_df, n=3)" - }, - { - "action": "Plot the explained variances (evs) from the Archetypal Analysis model using the `plot_evs` function. This helps to determine the optimal number of archetypes.", - "code": "plot_utils.plot_evs(evs, kmin=aa_model.kmin)" - }, - { - "action": "Visualize the archetypes in a 2D or 3D plot using the `plot_archetypes` function from the `aa_model`. This helps to understand the geometric structure of the data and the identified archetypes.", - "code": "aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False)" - }, - { - "action": "Visualize the mapping between archetypes and cell types using the `plot_mapping` function from the `aa_model`. This shows how archetypes correspond to known cell types.", - "code": "aa_model.plot_mapping(map_df)" - }, - { - "action": "Refine the anchor spots and update the signature genes by appending archetypal marker genes with the best-aligned anchors. This step uses the `refine_anchors` function from `utils` and updates the `visium_args` object.", - "code": "visium_args = utils.refine_anchors(\n visium_args,\n aa_model,\n #thld=0.7, # alignment threshold\n n_genes=5,\n #n_iters=1\n)\n\n# Get updated adata & signatures\nadata, adata_normed = visium_args.get_adata()\ngene_sig = visium_args.gene_sig\ncell_types = gene_sig.columns" - }, - { - "action": "Import the torch library. 
Define parameters for model training, including the number of random restarts (`n_repeats`), number of epochs, patience for early stopping, and the device to use for training (GPU if available, otherwise CPU).", - "code": "import torch\nn_repeats = 3\nepochs = 200\npatience = 50\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - }, - { - "action": "Train the Starfysh model using the `run_starfysh` function from `utils`. This function runs the model training with specified parameters and returns the trained model and the training loss.", - "code": "# Run models\nmodel, loss = utils.run_starfysh(visium_args,\n n_repeats=n_repeats,\n epochs=epochs,\n #patience=patience,\n device=device\n )" - }, - { - "action": "Evaluate the trained Starfysh model using the `model_eval` function from `sf_model`. This function parses the model inference outputs and generative outputs, and updates the `adata` object with these results.", - "code": "adata, adata_normed = visium_args.get_adata()\ninference_outputs, generative_outputs,adata_ = sf_model.model_eval(model,\n adata,\n visium_args,\n poe=False,\n device=device)" - }, - { - "action": "Import the numpy library. Select a random cell type index and use the `gene_mean_vs_inferred_prop` function from `post_analysis` to compare the signature gene means with the inferred proportions for that cell type.", - "code": "import numpy as np\nn_cell_types = gene_sig.shape[1]\nidx = np.random.randint(0, n_cell_types)\npost_analysis.gene_mean_vs_inferred_prop(inference_outputs,\n visium_args,\n idx=idx,\n figsize=(4,4)\n )" - }, - { - "action": "Visualize the inferred feature 'ql_m' on a spatial map using the `pl_spatial_inf_feature` function from `plot_utils`. 
The plot is colored using the 'Blues' colormap.", - "code": "plot_utils.pl_spatial_inf_feature(adata_, feature='ql_m', cmap='Blues')" - }, - { - "action": "Define a function `cell2proportion` to extract cell type proportions from the `adata_` object and create a new AnnData object (`adata_plot`) for visualization.", - "code": "def cell2proportion(adata):\n adata_plot=sc.AnnData(adata.X)\n adata_plot.obs=utils.extract_feature(adata_, 'qc_m').obs.copy()\n adata_plot.var=adata.var.copy()\n adata_plot.obsm=adata.obsm.copy()\n adata_plot.obsp=adata.obsp.copy()\n adata_plot.uns=adata.uns.copy()\n return adata_plot\nadata_plot=cell2proportion(adata_)" - }, - { - "action": "Display a summary of the `adata_plot` object, which contains the cell type proportions extracted from `adata_`.", - "code": "adata_plot" - }, - { - "action": "Visualize the inferred cell type proportions for specific cell types ('Basal', 'LumA', 'LumB') on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'Spectral_r' colormap and displays values up to the 90th percentile.", - "code": "sc.pl.spatial(adata_plot, cmap='Spectral_r',\n # show first 8 cell types\n color=['Basal','LumA','LumB'],\n ncols=4, size=1.3,\n img_key='hires',\n vmin=0, vmax='p90'\n )" - }, - { - "action": "Visualize the inferred cell type proportions for specific cell types ('Basal', 'LumA', 'MBC', 'Normal epithelial') on a UMAP representation using `ov.pl.embedding`. The plot is colored using the 'Spectral_r' colormap and displays values up to the 90th percentile.", - "code": "ov.pl.embedding(adata_plot,\n basis='z_umap',\n color=['Basal', 'LumA', 'MBC', 'Normal epithelial'],\n frameon='small',\n vmin=0, vmax='p90',\n cmap='Spectral_r',\n )" - }, - { - "action": "Predict cell type-specific gene expression using the `model_ct_exp` function from `sf_model`. 
This function calculates the predicted expression levels for each cell type based on the trained model.", - "code": "pred_exprs = sf_model.model_ct_exp(model,\n adata,\n visium_args,\n device=device)" - }, - { - "action": "Visualize the predicted expression of the gene 'IL7R' for the cell type 'Tem' on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'Spectral_r' colormap and displays the predicted expression values.", - "code": "gene='IL7R'\ngene_celltype='Tem'\nadata_.layers[f'infer_{gene_celltype}']=pred_exprs[gene_celltype]\n\nsc.pl.spatial(adata_, cmap='Spectral_r',\n # show first 8 cell types\n color=gene,\n title=f'{gene} (Predicted expression)\\n{gene_celltype}',\n layer=f'infer_{gene_celltype}',\n ncols=4, size=1.3,\n img_key='hires',\n #vmin=0, vmax='p90'\n )" - }, - { - "action": "Specify an output directory to save the model and inferred parameters. Create the directory if it does not exist. Save the trained model's state dictionary to a .pt file and write the `adata` object with inferred parameters to a .h5ad file.", - "code": "# Specify output directory\noutdir = './results/'\nif not os.path.exists(outdir):\n os.mkdir(outdir)\n\n# save the model\ntorch.save(model.state_dict(), os.path.join(outdir, 'starfysh_model.pt'))\n\n# save `adata` object with inferred parameters\nadata.write(os.path.join(outdir, 'st.h5ad'))" - }, - { - "action": "Import the omicverse, scvelo, and scanpy libraries. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\n#import omicverse.STT as st\nimport scvelo as scv\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Read the 'mouse_brain.h5ad' file into an AnnData object using `sc.read_h5ad`.", - "code": "adata = sc.read_h5ad('mouse_brain.h5ad')\nadata" - }, - { - "action": "Construct an STT object using the `ov.space.STT` class. 
Initialize it with the AnnData object, specifying the spatial location key as 'xy_loc' and the region key as 'Region'.", - "code": "STT_obj=ov.space.STT(adata,spatial_loc='xy_loc',region='Region')" - }, - { - "action": "Estimate the stages for the STT model using the `stage_estimate` method.", - "code": "STT_obj.stage_estimate()" - }, - { - "action": "Train the STT model with specified parameters: 9 states, 15 iterations, connectivity weight of 0.5, 50 neighbors, threshold for MS gene of 0.2, and spatial weight of 0.3.", - "code": "STT_obj.train(n_states = 9, n_iter = 15, weight_connectivities = 0.5, \n n_neighbors = 50,thresh_ms_gene = 0.2, spa_weight =0.3)" - }, - { - "action": "Visualize the 'attractor' attribute on a 2D embedding using the 'xy_loc' basis with `ov.pl.embedding`. The plot is colored by the 'attractor' values and uses a specific color palette.", - "code": "ov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"attractor\"],frameon='small',\n palette=ov.pl.sc_color[11:])" - }, - { - "action": "Visualize the 'Region' attribute on a 2D embedding using the 'xy_loc' basis with `ov.pl.embedding`. The plot is colored by the 'Region' values.", - "code": "ov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Region\"],frameon='small',\n )" - }, - { - "action": "Prepare a pathway dictionary using the `ov.utils.geneset_prepare` function. The pathway data is loaded from the 'KEGG_2019_Mouse.txt' file, and the organism is specified as 'Mouse'.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/KEGG_2019_Mouse.txt',organism='Mouse')" - }, - { - "action": "Compute pathway enrichment for the STT model using the `compute_pathway` method and the prepared pathway dictionary.", - "code": "STT_obj.compute_pathway(pathway_dict)" - }, - { - "action": "Plot the pathway enrichment results using the `plot_pathway` method. The plot is displayed with a specified figure size, marker size, and font size. 
Axis labels are adjusted for better readability.", - "code": "fig = STT_obj.plot_pathway(figsize = (10,8),size = 100,fontsize = 12)\nfor ax in fig.axes:\n ax.set_xlabel('Embedding 1', fontsize=20) # Adjust font size as needed\n ax.set_ylabel('Embedding 2', fontsize=20) # Adjust font size as needed\nfig.show()" - }, - { - "action": "Create a subplot and visualize the streamlines for the 'Wnt signaling pathway' using the `plot_tensor_pathway` method. The plot is based on the 'xy_loc' coordinates.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(1, 1, figsize=(4, 4))\nSTT_obj.plot_tensor_pathway(pathway_name = 'Wnt signaling pathway',basis = 'xy_loc',\n ax=ax)" - }, - { - "action": "Create a subplot and visualize the streamlines for the 'TGF-beta signaling pathway' using the `plot_tensor_pathway` method. The plot is based on the 'xy_loc' coordinates.", - "code": "fig, ax = plt.subplots(1, 1, figsize=(4, 4))\nSTT_obj.plot_tensor_pathway( 'TGF-beta signaling pathway',basis = 'xy_loc',\n ax=ax)" - }, - { - "action": "Plot the tensor for specific attractors [1, 3, 5, 6] using the `plot_tensor` method. The plot filters cells based on a membership threshold and adjusts the density of the visualization.", - "code": "STT_obj.plot_tensor(list_attractor = [1,3,5,6],\n filter_cells = True, member_thresh = 0.1, density = 1)" - }, - { - "action": "Construct a landscape representation of the STT model using the `construct_landscape` method. The landscape is based on the 'X_xy_loc' coordinate key.", - "code": "STT_obj.construct_landscape(coord_key = 'X_xy_loc')" - }, - { - "action": "Visualize the 'attractor' and 'Region' attributes on a 2D embedding using the 'trans_coord' basis with `sc.pl.embedding`.", - "code": "sc.pl.embedding(adata, color = ['attractor', 'Region'],basis= 'trans_coord')" - }, - { - "action": "Infer the lineage of the STT model using the `infer_lineage` method. 
The method used is 'MPPT' (most probable path tree), with specified start and end indices, flux fraction, color palette, point size, and text size.", - "code": "STT_obj.infer_lineage(si=3,sf=4, method = 'MPPT',flux_fraction=0.8,color_palette_name = 'tab10',size_point = 8,\n size_text=12)" - }, - { - "action": "Plot a Sankey diagram showing the relationship between STT attractors and spatial region annotations using the `plot_sankey` method.", - "code": "fig = STT_obj.plot_sankey(adata.obs['attractor'].tolist(),adata.obs['Region'].tolist())" - }, - { - "action": "Write the `adata` and `adata_aggr` objects to H5AD files. The `adata` object is saved as 'mouse_brain_adata.h5ad', and the `adata_aggr` object is saved as 'mouse_brain_adata_aggr.h5ad'.", - "code": "STT_obj.adata.write('data/mouse_brain_adata.h5ad')\nSTT_obj.adata_aggr.write('data/mouse_brain_adata_aggr.h5ad')" - }, - { - "action": "Read the `adata` and `adata_aggr` objects from the H5AD files 'mouse_brain_adata.h5ad' and 'mouse_brain_adata_aggr.h5ad', respectively.", - "code": "adata=ov.read('data/mouse_brain_adata.h5ad')\nadata_aggr=ov.read('data/mouse_brain_adata_aggr.h5ad')" - }, - { - "action": "Construct an STT object using the `ov.space.STT` class and load the previously saved `adata` and `adata_aggr` objects into it.", - "code": "STT_obj=ov.space.STT(adata,spatial_loc='xy_loc',region='Region')\nSTT_obj.load(adata,adata_aggr)" - }, - { - "action": "Display the 'r2_test' values from the `adata.var` DataFrame, sorted in descending order. These values represent genes with high multistability scores.", - "code": "adata.var['r2_test'].sort_values(ascending=False)" - }, - { - "action": "Plot the top 6 genes with the highest multistability scores using the `plot_top_genes` method. 
The plot is displayed with 2 columns and a figure size of 8x8.", - "code": "STT_obj.plot_top_genes(top_genes = 6, ncols = 2, figsize = (8,8),)" - }, - { - "action": "Create a 1x4 subplot and visualize the expression of the 'Sim1' gene in different layers ('Ms', 'Mu', 'velo') and the raw expression. Each subplot displays the 'Sim1' expression on the 'xy_loc' basis using the 'RdBu_r' colormap.", - "code": "import matplotlib.pyplot as plt\nfig, axes = plt.subplots(1, 4, figsize=(12, 3))\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:Ms',show=False,\n layer='Ms',cmap='RdBu_r',ax=axes[0]\n )\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:Mu',show=False,\n layer='Mu',cmap='RdBu_r',ax=axes[1]\n )\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:Velo',show=False,\n layer='velo',cmap='RdBu_r',ax=axes[2]\n )\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:exp',show=False,\n #layer='Mu',\n cmap='RdBu_r',ax=axes[3]\n )\nplt.tight_layout()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_tcga.json b/rag_engine/ovrawmjson/t_tcga.json deleted file mode 100644 index c192158a..00000000 --- a/rag_engine/ovrawmjson/t_tcga.json +++ /dev/null @@ -1,38 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Initialize a TCGA object using the provided sample sheet, download files, and clinical cart paths. 
Then, initialize the AnnData object to store the raw count, FPKM, and TPM matrices.", - "code": "get_ipython().run_cell_magic('time', '', \"gdc_sample_sheep='data/TCGA_OV/gdc_sample_sheet.2024-07-05.tsv'\\ngdc_download_files='data/TCGA_OV/gdc_download_20240705_180129.081531'\\nclinical_cart='data/TCGA_OV/clinical.cart.2024-07-05'\\naml_tcga=ov.bulk.pyTCGA(gdc_sample_sheep,gdc_download_files,clinical_cart)\\naml_tcga.adata_init()\\n\")" - }, - { - "action": "Save the AnnData object to an H5AD file for later use.", - "code": "aml_tcga.adata.write_h5ad('data/TCGA_OV/ov_tcga_raw.h5ad',compression='gzip')" - }, - { - "action": "Initialize a TCGA object and read the previously saved AnnData file. This step is necessary to ensure that subsequent TCGA functions, such as survival analysis, can be used properly.", - "code": "gdc_sample_sheep='data/TCGA_OV/gdc_sample_sheet.2024-07-05.tsv'\ngdc_download_files='data/TCGA_OV/gdc_download_20240705_180129.081531'\nclinical_cart='data/TCGA_OV/clinical.cart.2024-07-05'\naml_tcga=ov.bulk.pyTCGA(gdc_sample_sheep,gdc_download_files,clinical_cart)\naml_tcga.adata_read('data/TCGA_OV/ov_tcga_raw.h5ad')" - }, - { - "action": "Initialize the metadata for the AnnData object. This involves converting gene IDs to gene names and adding basic patient information.", - "code": "aml_tcga.adata_meta_init()" - }, - { - "action": "Initialize the survival data for the TCGA object. This step imports the clinical information from the previously set clinical cart path.", - "code": "aml_tcga.survial_init()\naml_tcga.adata" - }, - { - "action": "Perform survival analysis for the gene 'MYC' using the 'deseq_normalize' layer and generate a survival plot.", - "code": "aml_tcga.survival_analysis('MYC',layer='deseq_normalize',plot=True)" - }, - { - "action": "Perform survival analysis for all genes in the dataset. 
This process may take a significant amount of time.", - "code": "aml_tcga.survial_analysis_all()\naml_tcga.adata" - }, - { - "action": "Save the updated AnnData object, which now includes the results of the survival analysis, to an H5AD file.", - "code": "aml_tcga.adata.write_h5ad('data/TCGA_OV/ov_tcga_survial_all.h5ad',compression='gzip')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_tosica.json b/rag_engine/ovrawmjson/t_tosica.json deleted file mode 100644 index c8fea34b..00000000 --- a/rag_engine/ovrawmjson/t_tosica.json +++ /dev/null @@ -1,86 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Load the reference AnnData object from 'demo_train.h5ad' and print its shape and cell type counts.", - "code": "ref_adata = sc.read('demo_train.h5ad')\nref_adata = ref_adata[:,ref_adata.var_names]\nprint(ref_adata)\nprint(ref_adata.obs.Celltype.value_counts())" - }, - { - "action": "Load the query AnnData object from 'demo_test.h5ad', subset it to the same genes as the reference data, and print its shape and cell type counts.", - "code": "query_adata = sc.read('demo_test.h5ad')\nquery_adata = query_adata[:,ref_adata.var_names]\nprint(query_adata)\nprint(query_adata.obs.Celltype.value_counts())" - }, - { - "action": "Make variable names unique and select the common genes between the reference and query datasets.", - "code": "ref_adata.var_names_make_unique()\nquery_adata.var_names_make_unique()\nret_gene=list(set(query_adata.var_names) & set(ref_adata.var_names))\nlen(ret_gene)" - }, - { - "action": "Subset both reference and query datasets to the common genes.", - "code": "query_adata=query_adata[:,ret_gene]\nref_adata=ref_adata[:,ret_gene]" - }, - { - "action": "Print the maximum values of the expression matrices in the reference and query datasets to confirm 
that they have been normalized and log-transformed.", - "code": "print(f\"The max of ref_adata is {ref_adata.X.max()}, query_data is {query_adata.X.max()}\",)" - }, - { - "action": "Download the TOSICA gene sets (GMT files) using `ov.utils.download_tosica_gmt()`. These gene sets will be used as pathways for the TOSICA model.", - "code": "ov.utils.download_tosica_gmt()" - }, - { - "action": "Initialize the TOSICA model using the `ov.single.pyTOSICA` class. Set the reference AnnData object, the path to the GMT file, the depth of the transformer model, the key for cell type labels, the project path for saving the model, and the batch size.", - "code": "tosica_obj=ov.single.pyTOSICA(adata=ref_adata,\n gmt_path='genesets/GO_bp.gmt', depth=1,\n label_name='Celltype',\n project_path='hGOBP_demo',\n batch_size=8)" - }, - { - "action": "Train the TOSICA model using the `train` method. Set the number of epochs.", - "code": "tosica_obj.train(epochs=5)" - }, - { - "action": "Save the trained TOSICA model to the specified project path.", - "code": "tosica_obj.save()" - }, - { - "action": "Load the saved TOSICA model from the project path.", - "code": "tosica_obj.load()" - }, - { - "action": "Predict cell types in the query dataset using the trained TOSICA model and the `predicted` method. 
The predicted cell types and associated information are stored in a new AnnData object.", - "code": "new_adata=tosica_obj.predicted(pre_adata=query_adata)" - }, - { - "action": "Preprocess the query dataset by scaling the data, performing PCA, computing a neighborhood graph, and reducing the dimensionality using MDE.", - "code": "ov.pp.scale(query_adata)\nov.pp.pca(query_adata,layer='scaled',n_pcs=50)\nsc.pp.neighbors(query_adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')\nquery_adata.obsm[\"X_mde\"] = ov.utils.mde(query_adata.obsm[\"scaled|original|X_pca\"])\nquery_adata" - }, - { - "action": "Copy the low-dimensional embeddings and neighborhood graph from the query dataset to the new AnnData object containing the predicted cell types.", - "code": "new_adata.obsm=query_adata[new_adata.obs.index].obsm.copy()\nnew_adata.obsp=query_adata[new_adata.obs.index].obsp.copy()\nnew_adata" - }, - { - "action": "Set the colors for the predicted and original cell types in the new AnnData object for visualization purposes.", - "code": "import numpy as np\ncol = np.array([\n\"#98DF8A\",\"#E41A1C\" ,\"#377EB8\", \"#4DAF4A\" ,\"#984EA3\" ,\"#FF7F00\" ,\"#FFFF33\" ,\"#A65628\" ,\"#F781BF\" ,\"#999999\",\"#1F77B4\",\"#FF7F0E\",\"#279E68\",\"#FF9896\"\n]).astype('', color='gray'),size=12)\n\n#Set the title\nplt.title('Venn4',fontsize=13)\n\n#save figure\nfig.savefig(\"figures/bulk_venn4.png\",dpi=300,bbox_inches = 'tight')" - }, - { - "action": "Create another Venn diagram with three sets and a different color palette.", - "code": "fig,ax=plt.subplots(figsize = (4,4))\n#dict of sets\nsets = {\n 'Set1:name': {1,2,3},\n 'Set2': {1,2,3,4},\n 'Set3': {3,4},\n}\n \nov.pl.venn(sets=sets,ax=ax,fontsize=5.5,\n palette=ov.pl.red_color)\n\nplt.title('Venn3',fontsize=13)" - }, - { - "action": "Read differentially expressed genes (DEGs) result from a CSV file.", - "code": "result=ov.read('data/dds_result.csv',index_col=0)\nresult.head()" - }, - { - "action": "Create a volcano 
plot to visualize DEGs.", - "code": "ov.pl.volcano(result,pval_name='qvalue',fc_name='log2FoldChange',\n pval_threshold=0.05,fc_max=1.5,fc_min=-1.5,\n pval_max=10,FC_max=10,\n figsize=(4,4),title='DEGs in Bulk',titlefont={'weight':'normal','size':14,},\n up_color='#e25d5d',down_color='#7388c1',normal_color='#d7d7d7',\n up_fontcolor='#e25d5d',down_fontcolor='#7388c1',normal_fontcolor='#d7d7d7',\n legend_bbox=(0.8, -0.2),legend_ncol=2,legend_fontsize=12,\n plot_genes=None,plot_genes_num=10,plot_genes_fontsize=11,\n ticks_fontsize=12,)" - }, - { - "action": "Load the 'tips' dataset from seaborn for box plot visualization.", - "code": "import seaborn as sns\ndata = sns.load_dataset(\"tips\")\ndata.head()" - }, - { - "action": "Create a box plot to compare total bill amounts across different days, separated by sex, and add a p-value annotation.", - "code": "fig,ax=ov.pl.boxplot(data,hue='sex',x_value='day',y_value='total_bill',\n palette=ov.pl.red_color,\n figsize=(4,2),fontsize=12,title='Tips',)\n\nov.pl.add_palue(ax,line_x1=-0.5,line_x2=0.5,line_y=40,\n text_y=0.2,\n text='$p={}$'.format(round(0.001,3)),\n fontsize=11,fontcolor='#000000',\n horizontalalignment='center',)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_visualize_colorsystem.json b/rag_engine/ovrawmjson/t_visualize_colorsystem.json deleted file mode 100644 index e5c6188b..00000000 --- a/rag_engine/ovrawmjson/t_visualize_colorsystem.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. 
Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\n#import scvelo as scv\nov.plot_set()" - }, - { - "action": "Read single-cell RNA-seq data from a file named '10X43_1.h5ad' located in the 'data/DentateGyrus' directory and store it in the 'adata' variable.", - "code": "adata = ov.read('data/DentateGyrus/10X43_1.h5ad')\nadata" - }, - { - "action": "Create an instance of the ForbiddenCity class from the omicverse plotting module to visualize the color system.", - "code": "fb=ov.pl.ForbiddenCity()" - }, - { - "action": "Generate an HTML visualization of the Forbidden City color palette, displaying colors in a grid with 24 colors per row, covering the entire range of 384 colors.", - "code": "from IPython.display import HTML\nHTML(fb.visual_color(loc_range=(0,384),\n num_per_row=24))" - }, - { - "action": "Retrieve the color named '凝夜紫' from the Forbidden City color palette.", - "code": "fb.get_color(name='凝夜紫')" - }, - { - "action": "Create a subplot with 1 row and 3 columns, each with a figure size of 9x3 inches. Then, generate three UMAP embeddings of the 'adata' object with different color palettes: 'fb.red[:]', 'fb.pink1[:]', and a combination of 'fb.red1[:4]' and 'fb.blue1'. 
The embeddings are displayed without legends and with small frames.", - "code": "import matplotlib.pyplot as plt\nfig, axes = plt.subplots(1,3,figsize=(9,3)) \nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=fb.red[:],\n ncols=3,\n show=False,\n legend_loc=None,\n ax=axes[0])\n\nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=fb.pink1[:],\n ncols=3,show=False,\n legend_loc=None,\n ax=axes[1])\n\nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=fb.red1[:4]+fb.blue1,\n ncols=3,show=False,\n ax=axes[2])" - }, - { - "action": "Define a dictionary 'color_dict' that maps cell type names to specific hexadecimal color codes. Then, generate a UMAP embedding of the 'adata' object, coloring the cells based on their cluster assignment according to the 'color_dict'. The embedding is displayed without a legend and with a small frame.", - "code": "color_dict={'Astrocytes': '#e40414',\n 'Cajal Retzius': '#ec5414',\n 'Cck-Tox': '#ec4c2c',\n 'Endothelial': '#d42c24',\n 'GABA': '#2c5ca4',\n 'Granule immature': '#acd4ec',\n 'Granule mature': '#a4bcdc',\n 'Microglia': '#8caccc',\n 'Mossy': '#8cacdc',\n 'Neuroblast': '#6c9cc4',\n 'OL': '#6c94cc',\n 'OPC': '#5c74bc',\n 'Radial Glia-like': '#4c94c4',\n 'nIPC': '#3474ac'}\n\nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=color_dict,\n ncols=3,show=False,\n )" - }, - { - "action": "Define a list of colors using RGB values obtained from the 'get_color_rgb' method of the 'fb' object for the colors '群青', '半见', and '丹罽'. Then, create a segmented colormap using these colors.", - "code": "colors=[\n fb.get_color_rgb('群青'),\n fb.get_color_rgb('半见'),\n fb.get_color_rgb('丹罽'),\n]\nfb.get_cmap_seg(colors)" - }, - { - "action": "Define a list of colors using RGB values obtained from the 'get_color_rgb' method of the 'fb' object for the colors '群青', '山矾', and '丹罽'. 
Then, create a segmented colormap using these colors.", - "code": "colors=[\n fb.get_color_rgb('群青'),\n fb.get_color_rgb('山矾'),\n fb.get_color_rgb('丹罽'),\n]\nfb.get_cmap_seg(colors)" - }, - { - "action": "Define a list of colors using RGB values obtained from the 'get_color_rgb' method of the 'fb' object for the colors '山矾' and '丹罽'. Then, create a segmented colormap using these colors.", - "code": "colors=[\n fb.get_color_rgb('山矾'),\n fb.get_color_rgb('丹罽'),\n]\nfb.get_cmap_seg(colors)" - }, - { - "action": "Generate a UMAP embedding of the 'adata' object, coloring the cells based on the expression levels of the gene 'Sox7'. The colormap used is a segmented colormap created from the 'colors' list. The embedding is displayed with a small frame and without a legend.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"Sox7\"],\n cmap=fb.get_cmap_seg(colors),\n ncols=3,show=False,\n #vmin=-1,vmax=1\n )" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_visualize_single.json b/rag_engine/ovrawmjson/t_visualize_single.json deleted file mode 100644 index 9cdeb180..00000000 --- a/rag_engine/ovrawmjson/t_visualize_single.json +++ /dev/null @@ -1,90 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. 
Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\n#import scvelo as scv\nov.plot_set()" - }, - { - "action": "Read single-cell RNA-seq data from a file named '10X43_1.h5ad' located in the 'data/DentateGyrus/' directory and store it in the `adata` variable.", - "code": "adata = ov.read('data/DentateGyrus/10X43_1.h5ad')" - }, - { - "action": "Optimize color mapping for the 'clusters' variable in the AnnData object `adata` based on the 'X_umap' embedding using `ov.pl.optim_palette`.", - "code": "optim_palette=ov.pl.optim_palette(adata,basis='X_umap',colors='clusters')" - }, - { - "action": "Create an embedding plot of the `adata` object, coloring cells by 'clusters' using the optimized palette, and display the plot with a title.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (4,4))\nov.pl.embedding(adata,\n basis='X_umap',\n color='clusters',\n frameon='small',\n show=False,\n palette=optim_palette,\n ax=ax,)\nplt.title('Cell Type of DentateGyrus',fontsize=15)" - }, - { - "action": "Create an embedding plot of the `adata` object, coloring cells by 'age(days)' and display the plot.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color='age(days)',\n frameon='small',\n show=False,)" - }, - { - "action": "Create a stacked histogram of cell proportions, grouped by 'age(days)' and colored by 'clusters', with a legend.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (1,4))\nov.pl.cellproportion(adata=adata,celltype_clusters='clusters',\n groupby='age(days)',legend=True,ax=ax)" - }, - { - "action": "Create a stacked histogram of cell proportions for specific cell types ('nIPC', 'Granule immature', 'Granule mature'), grouped by 'clusters' and colored by 'age(days)', with a legend.", - "code": "fig,ax=plt.subplots(figsize = (2,2))\nov.pl.cellproportion(adata=adata,celltype_clusters='age(days)',\n groupby='clusters',groupby_li=['nIPC','Granule 
immature','Granule mature'],\n legend=True,ax=ax)" - }, - { - "action": "Create a stacked area graph showing the changes in cell types ('nIPC', 'Granule immature', 'Granule mature') across different groups defined by 'clusters', colored by 'age(days)', with a legend.", - "code": "fig,ax=plt.subplots(figsize = (2,2))\nov.pl.cellstackarea(adata=adata,celltype_clusters='age(days)',\n groupby='clusters',groupby_li=['nIPC','Granule immature','Granule mature'],\n legend=True,ax=ax)" - }, - { - "action": "Create an embedding plot with cell type proportions, using 'X_umap' as the basis and 'clusters' as the cell type key.", - "code": "ov.pl.embedding_celltype(adata,figsize=(7,4),basis='X_umap',\n celltype_key='clusters',\n title=' Cell type',\n celltype_range=(1,10),\n embedding_range=(4,10),)" - }, - { - "action": "Create an embedding plot and highlight the 'Granule mature' cell type with a convex hull.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_umap',\n color=['clusters'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.ConvexHull(adata,\n basis='X_umap',\n cluster_key='clusters',\n hull_cluster='Granule mature',\n ax=ax)" - }, - { - "action": "Create an embedding plot and highlight the 'Granule immature' and 'Granule mature' cell types with contours.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_umap',\n color=['clusters'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.contour(ax=ax,adata=adata,groupby='clusters',clusters=['Granule immature','Granule mature'],\n basis='X_umap',contour_threshold=0.1,colors='#000000',\n linestyles='dashed',)" - }, - { - "action": "Create an embedding plot with adjusted legend to prevent masking, excluding the 'OL' cell type.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.pl.embedding(adata,\n 
basis='X_umap',\n color=['clusters'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.pl.embedding_adjust(\n adata,\n groupby='clusters',\n exclude=(\"OL\",), \n basis='X_umap',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize=12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Create an embedding plot showing the density distribution of the 'Granule mature' cell type.", - "code": "ov.pl.embedding_density(adata,\n basis='X_umap',\n groupby='clusters',\n target_clusters='Granule mature',\n frameon='small',\n show=False,cmap='RdBu_r',alpha=0.8)" - }, - { - "action": "Calculate the AUCell score for the 'Sox' gene set and add it to the AnnData object.", - "code": "ov.single.geneset_aucell(adata,\n geneset_name='Sox',\n geneset=['Sox17', 'Sox4', 'Sox7', 'Sox18', 'Sox5'])" - }, - { - "action": "Create an embedding plot colored by the expression of the 'Sox4' gene.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color=['Sox4'],\n frameon='small',\n show=False,)" - }, - { - "action": "Create a violin plot of 'Sox4' expression grouped by 'clusters'.", - "code": "ov.pl.violin(adata,keys='Sox4',groupby='clusters',figsize=(6,3))" - }, - { - "action": "Create a bar-dot plot of 'Sox_aucell' grouped by 'clusters' and add a p-value annotation.", - "code": "fig, ax = plt.subplots(figsize=(6,2))\nov.pl.bardotplot(adata,groupby='clusters',color='Sox_aucell',figsize=(6,2),\n ax=ax,\n ylabel='Expression',\n bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4},\n scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'})\n\nov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=0.1,\n text_y=0.02,\n text='$p={}$'.format(round(0.001,3)),\n fontsize=11,fontcolor='#000000',\n horizontalalignment='center',)" - }, - { - "action": "Create a bar-dot plot of 'Sox17' expression grouped by 'clusters' and add a p-value 
annotation.", - "code": "fig, ax = plt.subplots(figsize=(6,2))\nov.pl.bardotplot(adata,groupby='clusters',color='Sox17',figsize=(6,2),\n ax=ax,\n ylabel='Expression',xlabel='Cell Type',\n bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4},\n scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'})\n\nov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=2,\n text_y=0.2,\n text='$p={}$'.format(round(0.001,3)),\n fontsize=11,fontcolor='#000000',\n horizontalalignment='center',)" - }, - { - "action": "Create a box plot with jitter points for 'Sox_aucell' expression grouped by 'clusters', with Kruskal-Wallis test results and customized appearance.", - "code": "import pandas as pd\nimport seaborn as sns\n#sns.set_style('white')\n\nov.pl.single_group_boxplot(adata,groupby='clusters',\n color='Sox_aucell',\n type_color_dict=dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])),\n x_ticks_plot=True,\n figsize=(5,2),\n kruskal_test=True,\n ylabel='Sox_aucell',\n legend_plot=False,\n bbox_to_anchor=(1,1),\n title='Expression',\n scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'},\n point_number=15,\n sort=False,\n save=False,\n )\nplt.grid(False)\nplt.xticks(rotation=90,fontsize=12)" - }, - { - "action": "Define a dictionary of marker genes for the 'Sox' cell type and create a complex heatmap of gene expression grouped by 'clusters'.", - "code": "import pandas as pd\nmarker_genes_dict = {\n 'Sox':['Sox4', 'Sox7', 'Sox18', 'Sox5'],\n}\n\ncolor_dict = {'Sox':'#EFF3D8',}\n\ngene_color_dict = {}\ngene_color_dict_black = {}\nfor cell_type, genes in marker_genes_dict.items():\n cell_type_color = color_dict.get(cell_type)\n for gene in genes:\n gene_color_dict[gene] = cell_type_color\n gene_color_dict_black[gene] = '#000000'\n\ncm = ov.pl.complexheatmap(adata,\n groupby ='clusters',\n figsize =(5,2),\n layer = None,\n use_raw = False,\n standard_scale = 'var',\n col_color_bars = dict(zip(pd.Categorical(adata.obs['clusters']).categories, 
adata.uns['clusters_colors'])),\n col_color_labels = dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])),\n left_color_bars = color_dict,\n left_color_labels = None,\n right_color_bars = color_dict,\n right_color_labels = gene_color_dict_black,\n marker_genes_dict = marker_genes_dict,\n cmap = 'coolwarm', #parula,jet\n legend_gap = 15,\n legend_hpad = 0,\n left_add_text = True,\n col_split_gap = 2,\n row_split_gap = 1,\n col_height = 6,\n left_height = 4,\n right_height = 6,\n col_split = None,\n row_cluster = False,\n col_cluster = False,\n value_name='Gene',\n xlabel = \"Expression of selected genes\",\n label = 'Gene Expression',\n save = True,\n show = False,\n legend = False,\n plot_legend = False,\n #save_pathway = \"complexheatmap.png\",\n )" - }, - { - "action": "Preprocess the AnnData object and define a dictionary of marker genes for different cell types.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\n\nmarker_genes_dict = {'Granule immature': ['Sepw1', 'Camk2b', 'Cnih2'],\n 'Radial Glia-like': ['Dbi', 'Fabp7', 'Aldoc'],\n 'Granule mature': ['Malat1', 'Rasl10a', 'Ppp3ca'],\n 'Neuroblast': ['Igfbpl1', 'Tubb2b', 'Tubb5'],\n 'Microglia': ['Lgmn', 'C1qa', 'C1qb'],\n 'Cajal Retzius': ['Diablo', 'Ramp1', 'Stmn1'],\n 'OPC': ['Olig1', 'C1ql1', 'Pllp'],\n 'Cck-Tox': ['Tshz2', 'Cck', 'Nap1l5'],\n 'GABA': ['Gad2', 'Gad1', 'Snhg11'],\n 'Endothelial': ['Sparc', 'Myl12a', 'Itm2a'],\n 'Astrocytes': ['Apoe', 'Atp1a2'],\n 'OL': ['Plp1', 'Mog', 'Mag'],\n 'Mossy': ['Arhgdig', 'Camk4'],\n 'nIPC': ['Hmgn2', 'Ptma', 'H2afz']}" - }, - { - "action": "Create a marker gene heatmap using the defined marker genes dictionary and customize its appearance.", - "code": "ov.pl.marker_heatmap(\n adata,\n marker_genes_dict,\n groupby='clusters',\n color_map=\"RdBu_r\",\n use_raw=False,\n standard_scale=\"var\",\n expression_cutoff=0.0,\n fontsize=12,\n bbox_to_anchor=(7, -2),\n figsize=(8.5,4),\n spines=False,\n 
show_rownames=False,\n show_colnames=True,\n)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_wgcna.json b/rag_engine/ovrawmjson/t_wgcna.json deleted file mode 100644 index 21f381af..00000000 --- a/rag_engine/ovrawmjson/t_wgcna.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy, omicverse, and matplotlib. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nimport matplotlib.pyplot as plt\nov.plot_set()" - }, - { - "action": "Load the expression data from a CSV file into a pandas DataFrame. The data is from the 5xFAD paper and is part of the PyWGCNA tutorial data.", - "code": "import pandas as pd\ndata=ov.utils.read('data/5xFAD_paper/expressionList.csv',\n index_col=0)\ndata.head()" - }, - { - "action": "Calculate the Median Absolute Deviation (MAD) for each gene in the expression data. Then, select the top 2000 genes with the highest MAD values and transpose the DataFrame.", - "code": "from statsmodels import robust #import package\ngene_mad=data.apply(robust.mad) #use function to calculate MAD\ndata=data.T\ndata=data.loc[gene_mad.sort_values(ascending=False).index[:2000]]\ndata.head()" - }, - { - "action": "Initialize a PyWGCNA object named '5xFAD_2k' for bulk RNA-seq analysis. The object is configured for mouse data and uses the transposed expression data. The results will be saved to the specified output path.", - "code": "#import PyWGCNA\npyWGCNA_5xFAD = ov.bulk.pyWGCNA(name='5xFAD_2k', \n species='mus musculus', \n geneExp=data.T, \n outputPath='',\n save=True)\npyWGCNA_5xFAD.geneExpr.to_df().head(5)" - }, - { - "action": "Preprocess the expression data using the `preprocess()` method of the PyWGCNA object. 
This step includes removing genes with too many missing values or low expression and removing samples with too many missing values.", - "code": "pyWGCNA_5xFAD.preprocess()" - }, - { - "action": "Calculate the soft-thresholding power for network construction using the `calculate_soft_threshold()` method.", - "code": "pyWGCNA_5xFAD.calculate_soft_threshold()" - }, - { - "action": "Calculate the adjacency matrix based on the selected soft-thresholding power using the `calculating_adjacency_matrix()` method.", - "code": "pyWGCNA_5xFAD.calculating_adjacency_matrix()" - }, - { - "action": "Calculate the Topological Overlap Matrix (TOM) similarity matrix using the `calculating_TOM_similarity_matrix()` method.", - "code": "pyWGCNA_5xFAD.calculating_TOM_similarity_matrix()" - }, - { - "action": "Calculate the gene tree, dynamic modules, and gene-module relationships. The `calculate_geneTree()` method computes the gene dendrogram. The `calculate_dynamicMods()` method identifies modules using dynamic tree cutting with specified parameters. The `calculate_gene_module()` method calculates module eigengenes with the chosen soft power.", - "code": "pyWGCNA_5xFAD.calculate_geneTree()\npyWGCNA_5xFAD.calculate_dynamicMods(kwargs_function={'cutreeHybrid': {'deepSplit': 2, 'pamRespectsDendro': False}})\npyWGCNA_5xFAD.calculate_gene_module(kwargs_function={'moduleEigengenes': {'softPower': 8}})" - }, - { - "action": "Plot the TOM matrix using the `plot_matrix()` method. The plot visualizes the relationships between genes based on topological overlap.", - "code": "pyWGCNA_5xFAD.plot_matrix(save=False)" - }, - { - "action": "Save the current state of the PyWGCNA object using the `saveWGCNA()` method. 
This allows for later retrieval of the object and its associated data.", - "code": "pyWGCNA_5xFAD.saveWGCNA()" - }, - { - "action": "Load a previously saved PyWGCNA object from a file named '5xFAD_2k.p' using the `ov.bulk.readWGCNA()` function.", - "code": "pyWGCNA_5xFAD=ov.bulk.readWGCNA('5xFAD_2k.p')" - }, - { - "action": "Display the first few rows of the `mol` attribute of the PyWGCNA object, which likely contains module information.", - "code": "pyWGCNA_5xFAD.mol.head()" - }, - { - "action": "Display the first few rows of the `datExpr.var` attribute of the PyWGCNA object. This likely contains variable information related to the expression data.", - "code": "pyWGCNA_5xFAD.datExpr.var.head()" - }, - { - "action": "Extract a subset of modules ('gold' and 'lightgreen') from the PyWGCNA object using the `get_sub_module()` method. The `mod_type` parameter specifies that the selection is based on module colors.", - "code": "sub_mol=pyWGCNA_5xFAD.get_sub_module(['gold','lightgreen'],\n mod_type='module_color')\nsub_mol.head(),sub_mol.shape" - }, - { - "action": "Extract a subnetwork from the PyWGCNA object corresponding to the 'lightgreen' module. The `get_sub_network()` method is used with a specified correlation threshold of 0.2.", - "code": "G_sub=pyWGCNA_5xFAD.get_sub_network(mod_list=['lightgreen'],\n mod_type='module_color',correlation_threshold=0.2)\nG_sub" - }, - { - "action": "Count the number of edges in the extracted subnetwork `G_sub`.", - "code": "len(G_sub.edges())" - }, - { - "action": "Visualize the subnetwork for the 'gold' and 'lightgreen' modules using the `plot_sub_network()` method. 
The plot uses the 'kamada_kawai' layout algorithm and includes specific styling options.", - "code": "pyWGCNA_5xFAD.plot_sub_network(['gold','lightgreen'],pos_type='kamada_kawai',pos_scale=10,pos_dim=2,\n figsize=(8,8),node_size=10,label_fontsize=8,correlation_threshold=0.2,\n label_bbox={\"ec\": \"white\", \"fc\": \"white\", \"alpha\": 0.6})" - }, - { - "action": "Update the sample information of the PyWGCNA object with data from a CSV file. Additionally, assign colors to different categories within the metadata for downstream analysis.", - "code": "pyWGCNA_5xFAD.updateSampleInfo(path='data/5xFAD_paper/sampleInfo.csv', sep=',')\n\n# add color for metadata\npyWGCNA_5xFAD.setMetadataColor('Sex', {'Female': 'green',\n 'Male': 'yellow'})\npyWGCNA_5xFAD.setMetadataColor('Genotype', {'5xFADWT': 'darkviolet',\n '5xFADHEMI': 'deeppink'})\npyWGCNA_5xFAD.setMetadataColor('Age', {'4mon': 'thistle',\n '8mon': 'plum',\n '12mon': 'violet',\n '18mon': 'purple'})\npyWGCNA_5xFAD.setMetadataColor('Tissue', {'Hippocampus': 'red',\n 'Cortex': 'blue'})" - }, - { - "action": "Perform a comprehensive analysis of the PyWGCNA object using the `analyseWGCNA()` method. 
This includes quantifying module-trait relationships and identifying important genes.", - "code": "pyWGCNA_5xFAD.analyseWGCNA()" - }, - { - "action": "Retrieve the column names from the observation data (`datExpr.obs`) of the PyWGCNA object, which represent the metadata fields.", - "code": "metadata = pyWGCNA_5xFAD.datExpr.obs.columns.tolist()" - }, - { - "action": "Plot the module eigengene for the 'lightgreen' module against the specified metadata using the `plotModuleEigenGene()` method.", - "code": "pyWGCNA_5xFAD.plotModuleEigenGene('lightgreen', metadata, show=True)" - }, - { - "action": "Create a bar plot of the module eigengene for the 'lightgreen' module against the specified metadata using the `barplotModuleEigenGene()` method.", - "code": "pyWGCNA_5xFAD.barplotModuleEigenGene('lightgreen', metadata, show=True)" - }, - { - "action": "Identify the top 10 hub genes for the 'lightgreen' module based on their connectivity using the `top_n_hub_genes()` method.", - "code": "pyWGCNA_5xFAD.top_n_hub_genes(moduleName=\"lightgreen\", n=10)" - } -] \ No newline at end of file diff --git a/rag_engine/rag_system.py b/rag_engine/rag_system.py deleted file mode 100644 index dcd7be30..00000000 --- a/rag_engine/rag_system.py +++ /dev/null @@ -1,596 +0,0 @@ -import logging -import sys -import os -import json -from datetime import datetime, timezone -from typing import Dict, List, Optional -from dataclasses import dataclass, field -from langchain_community.document_loaders import JSONLoader -from langchain_community.vectorstores import Chroma -from langchain_community.embeddings import GPT4AllEmbeddings -from langchain_community.llms import Ollama -from langchain_core.prompts import PromptTemplate -from langchain.callbacks.manager import CallbackManager -from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from langchain.chains import RetrievalQA -from langchain.text_splitter import RecursiveCharacterTextSplitter -import requests -from functools 
import lru_cache -from concurrent.futures import ThreadPoolExecutor -import asyncio -import time -from prometheus_client import Counter, Histogram, Gauge -import tenacity -import chromadb -from collections import OrderedDict -from logging.handlers import RotatingFileHandler - -# Custom Logger Class -class RAGLogger: - def __init__(self, name): - self.logger = logging.getLogger(name) - self.logger.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - - # Add rotating file handler - handler = RotatingFileHandler('rag_system.log', maxBytes=10485760, backupCount=5) - handler.setFormatter(formatter) - self.logger.addHandler(handler) - - # Add stream handler for console output - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setFormatter(formatter) - self.logger.addHandler(stream_handler) - - def info(self, message): - self.logger.info(message) - - def error(self, message): - self.logger.error(message) - - def warning(self, message): - self.logger.warning(message) - -# Initialize logger -logger = RAGLogger(__name__) - -@dataclass -class PerformanceMetrics: - query_counter: Counter = Counter('rag_queries_total', 'Total number of queries processed') - query_latency: Histogram = Histogram('rag_query_duration_seconds', 'Query processing duration') - cache_hits: Counter = Counter('rag_cache_hits_total', 'Number of cache hits') - model_calls: Dict[str, Counter] = field(default_factory=dict) - memory_usage: Gauge = Gauge('rag_memory_usage_bytes', 'Memory usage in bytes') - request_duration: Histogram = field( - default_factory=lambda: Histogram( - 'rag_request_duration_seconds', - 'Request duration in seconds', - buckets=(0.1, 0.5, 1.0, 2.0, 5.0) - ) - ) - - def record_query(self, duration: float): - self.query_counter.inc() - self.query_latency.observe(duration) - - def record_cache_hit(self): - self.cache_hits.inc() - - def record_model_call(self, model_name: str): - try: - # Sanitize the model name for 
Prometheus compatibility - sanitized_name = model_name.replace('.', '_').replace(':', '_').replace('-', '_') - metric_name = f'rag_model_calls_{sanitized_name}' - - if model_name not in self.model_calls: - self.model_calls[model_name] = Counter( - metric_name, - f'Number of calls to model {model_name}' - ) - self.model_calls[model_name].inc() - - except ValueError as ve: - logger.error(f"Invalid metric name creation: {str(ve)}") - # Create a fallback metric with a generic name - fallback_name = f"rag_model_calls_model_{len(self.model_calls)}" - self.model_calls[model_name] = Counter( - fallback_name, - f'Number of calls to model (fallback counter)' - ) - self.model_calls[model_name].inc() - except Exception as e: - logger.error(f"Unexpected error in record_model_call: {str(e)}") - # Don't let metric recording failures affect the main application flow - pass - - def record_memory_usage(self): - import psutil - process = psutil.Process(os.getpid()) - self.memory_usage.set(process.memory_info().rss) - - def record_request_time(self, duration: float): - self.request_duration.observe(duration) - -# TTL Cache Class -class TTLCache(OrderedDict): - def __init__(self, maxsize=1000, ttl=3600): - super().__init__() - self.maxsize = maxsize - self.ttl = ttl - - def __getitem__(self, key): - value, timestamp = super().__getitem__(key) - if time.time() - timestamp > self.ttl: - del self[key] - raise KeyError(key) - return value - - def __setitem__(self, key, value): - super().__setitem__(key, (value, time.time())) - if len(self) > self.maxsize: - self.popitem(last=False) - -class RAGSystem: - def __init__(self, json_directory: str, kbi_path: str): - self.json_directory = json_directory - self.kbi_path = kbi_path - self.executor = ThreadPoolExecutor(max_workers=3) - self.cache = TTLCache() - self.ollama_session = requests.Session() - self.metrics = PerformanceMetrics() - self.models = { - 'file_selection': 'qwen2.5-coder:3b', - 'query_processing': 'qwen2.5-coder:7b' - } - - # Add 
persistent directory - self.persist_directory = os.path.join(os.getcwd(), "chroma_db") - os.makedirs(self.persist_directory, exist_ok=True) - - # Initialize Chroma client settings - self.chroma_settings = chromadb.config.Settings( - anonymized_telemetry=False, - is_persistent=True, - persist_directory=self.persist_directory - ) - - # Initialize Chroma client with connection pooling - self.chroma_client = chromadb.Client(self.chroma_settings) - - # Initialize connection pool for Ollama - self.ollama_session.mount( - 'http://', - requests.adapters.HTTPAdapter( - max_retries=3, - pool_connections=10, - pool_maxsize=10 - ) - ) - - self.kbi_vectorstore = self.create_kbi_vectorstore() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.cleanup() - - @lru_cache(maxsize=100) - def get_file_embeddings(self, file_path): - """Cache embeddings for frequently accessed files""" - try: - with open(file_path, 'r') as file: - file_data = [{"content": file.read(), "source": file_path}] - - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50 - ) - - file_splits = text_splitter.create_documents( - texts=[doc["content"] for doc in file_data], - metadatas=[{"source": doc["source"]} for doc in file_data] - ) - - embeddings = GPT4AllEmbeddings().embed_documents([doc.page_content for doc in file_splits]) - return embeddings - except Exception as e: - logger.error(f"Error generating embeddings for {file_path}: {str(e)}") - return [] - - def batch_embed_documents(self, documents, batch_size=32): - """Generate embeddings in batches""" - embeddings = [] - for i in range(0, len(documents), batch_size): - batch = documents[i:i + batch_size] - batch_embeddings = self.get_file_embeddings(batch) - embeddings.extend(batch_embeddings) - return embeddings - - async def batch_process_queries(self, queries): - """Process multiple queries in parallel""" - tasks = [self.process_query(q) for q in queries] - return await 
asyncio.gather(*tasks) - - def check_ollama_status(self): - """Check if Ollama is running and required models are available""" - try: - # Check if Ollama server is running - response = requests.get("http://localhost:11434/api/tags", timeout=5) - if response.status_code != 200: - return False, "Ollama server is not running" - - # Check for required models - models = response.json().get("models", []) - required_models = list(self.models.values()) - logger.info(f"Available models: {[m.get('name', '') for m in models]}") - logger.info(f"Required models: {required_models}") - - missing_models = [model for model in required_models - if not any(m.get("name") == model for m in models)] - - if missing_models: - return False, f"Missing required models: {', '.join(missing_models)}" - - return True, "Ollama is ready" - except requests.ConnectionError: - return False, "Cannot connect to Ollama server" - except requests.exceptions.Timeout: - return False, "Ollama server connection timed out" - except Exception as e: - return False, f"An unexpected error occurred: {str(e)}" - - def validate_json_file(self, file_path): - """Validate a JSON file""" - try: - with open(file_path, 'r') as file: - json.load(file) - logger.info(f"✓ {file_path} is valid JSON") - return True - except json.JSONDecodeError as e: - logger.error(f"Error in file {file_path}: {str(e)}") - return False - except Exception as e: - logger.error(f"Error reading file {file_path}: {str(e)}") - return False - - def check_all_json_files(self): - """Check all JSON files in the directory""" - logger.info(f"Checking JSON files in {self.json_directory}") - all_valid = True - for filename in os.listdir(self.json_directory): - if filename.endswith('.json'): - file_path = os.path.join(self.json_directory, filename) - if not self.validate_json_file(file_path): - all_valid = False - return all_valid - - def create_kbi_vectorstore(self, persistence_dir="./chroma_db"): - try: - # Load and validate KBI data - with 
open(self.kbi_path, 'r') as file: - kbi_data = json.load(file) - logger.info(f"Successfully loaded KBI data from {self.kbi_path}") - - if not isinstance(kbi_data, dict) or 'files' not in kbi_data: - raise ValueError("Invalid KBI data structure") - - # Process documents - kbi_docs = [] - for file_info in kbi_data.get('files', []): - try: - if not all(key in file_info for key in ['name', 'introduction']): - logger.warning(f"Skipping incomplete file info: {file_info}") - continue - - text = f"File: {file_info['name']}\nIntroduction: {file_info['introduction']}" - kbi_docs.append({"content": text, "source": "KBI.json"}) - except Exception as doc_error: - logger.error(f"Error processing document: {str(doc_error)}") - continue - - if not kbi_docs: - raise ValueError("No valid documents found in KBI data") - - # Create text splitter - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=100 - ) - - # Create splits from the documents - kbi_splits = text_splitter.create_documents( - texts=[doc["content"] for doc in kbi_docs], - metadatas=[{"source": doc["source"]} for doc in kbi_docs] - ) - - if not kbi_splits: - raise ValueError("Text splitting produced no documents") - - # Create vector store - vectorstore = Chroma.from_documents( - documents=kbi_splits, # Now kbi_splits is properly defined - embedding=GPT4AllEmbeddings(), - persist_directory=persistence_dir, - collection_name="kbi_collection", - client=self.chroma_client - ) - - logger.info(f"Successfully created vector store with {len(kbi_splits)} chunks") - return vectorstore - - except FileNotFoundError: - logger.error(f"KBI file not found at {self.kbi_path}") - raise - except json.JSONDecodeError as je: - logger.error(f"Invalid JSON in KBI file: {str(je)}") - raise - except Exception as e: - logger.error(f"Unexpected error in create_kbi_vectorstore: {str(e)}") - raise - - def find_relevant_file(self, query): - """Find the most relevant file for a given query""" - start_time = time.time() 
- try: - # Check Ollama status first - status, message = self.check_ollama_status() - if not status: - raise Exception(f"Ollama is not ready: {message}") - - if query in self.cache: - self.metrics.record_cache_hit() - cached_result = self.cache[query] - logger.info(f"Cache hit for query: {query}") - return cached_result - - file_template = """Based on the following context and question, determine which JSON file would be most relevant. - Return ONLY the filename, nothing else. - Context: {context} - Question: {question} - Filename:""" - - file_prompt = PromptTemplate( - input_variables=["context", "question"], - template=file_template, - ) - - logger.info(f"Using model {self.models['file_selection']} for file selection") - self.metrics.record_model_call(self.models['file_selection']) - llm = Ollama( - model=self.models['file_selection'], - callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]) - ) - - file_chain = RetrievalQA.from_chain_type( - llm, - retriever=self.kbi_vectorstore.as_retriever(), - chain_type_kwargs={"prompt": file_prompt}, - ) - - result = file_chain({"query": query})["result"].strip() - self.cache[query] = result - logger.info(f"Found relevant file: {result}") - return result - - except Exception as e: - logger.error(f"Error in find_relevant_file: {str(e)}") - raise - finally: - duration = time.time() - start_time - self.metrics.record_request_time(duration) - - def process_query(self, query, relevant_file): - """Process a query using the relevant file""" - start_time = time.time() - try: - # Check Ollama status first - status, message = self.check_ollama_status() - if not status: - raise Exception(f"Ollama is not ready: {message}") - - file_path = os.path.join(self.json_directory, relevant_file) - if not os.path.exists(file_path): - raise FileNotFoundError(f"File {relevant_file} not found") - - with open(file_path, 'r') as file: - file_data = [{"content": file.read(), "source": relevant_file}] - - text_splitter = 
RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50 - ) - - file_splits = text_splitter.create_documents( - texts=[doc["content"] for doc in file_data], - metadatas=[{"source": doc["source"]} for doc in file_data] - ) - - collection_name = f"query_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - file_vectorstore = Chroma.from_documents( - documents=file_splits, - embedding=GPT4AllEmbeddings(), - collection_name=collection_name, - client=self.chroma_client # Pass the client instance - ) - - template = """Use the following context to answer the question about the JSON data. - If you don't know the answer, just say that you don't know, don't try to make up an answer. - You should always using omicverse in python to provide the answer. - You are focus on the code not the specific gene and disease in the JSON file. - Use Step by Step with code and keep the answer as concise as possible. - Context: {context} - Question: {question} - Helpful Answer:""" - - qa_prompt = PromptTemplate( - input_variables=["context", "question"], - template=template, - ) - - logger.info(f"Using model {self.models['query_processing']} for query processing") - self.metrics.record_model_call(self.models['query_processing']) - llm = Ollama( - model=self.models['query_processing'], - callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]) - ) - - qa_chain = RetrievalQA.from_chain_type( - llm, - retriever=file_vectorstore.as_retriever(), - chain_type_kwargs={"prompt": qa_prompt}, - ) - - logger.info("Generating answer...") - result = qa_chain({"query": query}) - - self._cleanup_old_collections() - - return result["result"] - - except Exception as e: - logger.error(f"Error processing query: {str(e)}") - raise - finally: - duration = time.time() - start_time - self.metrics.record_query(duration) - self.metrics.record_memory_usage() - self.metrics.record_request_time(duration) - - def list_json_files(self): - """List all JSON files in the directory""" - return [f for f in 
os.listdir(self.json_directory) if f.endswith('.json')] - - def get_system_health(self) -> Dict: - """Get system health metrics""" - try: - total_queries = float(self.metrics.query_counter._value.get()) - cache_hits = float(self.metrics.cache_hits._value.get()) - query_latency_sum = float(self.metrics.query_latency.sum._value.get()) - - return { - 'cache_size': len(self.cache), - 'cache_hits': int(cache_hits), - 'total_queries': int(total_queries), - 'avg_latency': round(query_latency_sum / total_queries, 2) if total_queries > 0 else 0.00, - 'model_usage': { - model: int(counter._value.get()) - for model, counter in self.metrics.model_calls.items() - }, - 'memory_usage': self.get_memory_usage(), - 'ollama_status': self.check_ollama_status() - } - except Exception as e: - logger.error(f"Error getting system health metrics: {str(e)}") - return { - 'cache_size': 0, - 'cache_hits': 0, - 'total_queries': 0, - 'avg_latency': 0.00, - 'model_usage': {}, - 'memory_usage': 0, - 'ollama_status': ("Unknown", str(e)) - } - - def get_memory_usage(self): - """Get current memory usage""" - import psutil - process = psutil.Process(os.getpid()) - return process.memory_info().rss - - def get_cache_health(self): - """Get cache health metrics""" - return { - 'size': len(self.cache), - 'maxsize': self.cache.maxsize, - 'ttl': self.cache.ttl, - 'hits': int(self.metrics.cache_hits._value.get()) - } - - def check_vectorstore_health(self): - """Check vector store health""" - try: - self.chroma_client.heartbeat() - return "OK" - except Exception as e: - return f"Error: {str(e)}" - - def get_detailed_health(self): - """Get detailed system health""" - return { - 'system_status': self.check_ollama_status(), - 'cache_status': self.get_cache_health(), - 'vectorstore_status': self.check_vectorstore_health(), - 'memory_usage': self.get_memory_usage() - } - - def _cleanup_old_collections(self): - """Clean up old vector store collections""" - try: - current_time = datetime.now() - collections = 
self.chroma_client.list_collections() - - for collection in collections: - if collection.name.startswith('query_'): - collection_time_str = collection.name.split('_')[1] - try: - collection_time = datetime.strptime(collection_time_str, '%Y%m%d_%H%M%S') - if (current_time - collection_time).total_seconds() > 3600: # 1 hour - self.chroma_client.delete_collection(collection.name) - logger.info(f"Deleted old collection: {collection.name}") - except ValueError: - logger.warning(f"Failed to parse timestamp from collection name: {collection.name}") - except Exception as e: - logger.error(f"Error in cleanup: {str(e)}") - - @tenacity.retry( - stop=tenacity.stop_after_attempt(3), - wait=tenacity.wait_exponential(multiplier=1, min=4, max=10), - retry=tenacity.retry_if_exception_type(requests.ConnectionError) - ) - def _call_ollama(self, endpoint: str, data: Dict) -> Dict: - """Resilient Ollama API calls with retry logic""" - try: - response = self.ollama_session.post( - f"http://localhost:11434/api/{endpoint}", - json=data, - timeout=30 - ) - response.raise_for_status() - return response.json() - except requests.exceptions.Timeout: - logger.error(f"Ollama API timeout for endpoint {endpoint}") - raise TimeoutError("Ollama API request timed out") - except requests.exceptions.ConnectionError as ce: - logger.error(f"Connection error to Ollama API: {str(ce)}") - raise - except requests.exceptions.RequestException as e: - logger.error(f"Ollama API call failed: {str(e)}") - raise - except json.JSONDecodeError as je: - logger.error(f"Failed to decode Ollama API response: {str(je)}") - raise ValueError("Invalid JSON response from Ollama API") - - def cleanup(self): - """Cleanup method to handle resources properly""" - try: - # Clean up vector stores - if hasattr(self, 'kbi_vectorstore') and self.kbi_vectorstore is not None: - self.kbi_vectorstore._client.reset() - - # Clean up Chroma client - self.chroma_client.reset() - - # Close Ollama session - self.ollama_session.close() - - # 
Shutdown thread pool executor - self.executor.shutdown() - - # Remove persistent directory - if os.path.exists(self.persist_directory): - import shutil - shutil.rmtree(self.persist_directory, ignore_errors=True) - - except Exception as e: - logger.error(f"Error during cleanup: {str(e)}") \ No newline at end of file diff --git a/rag_engine/requirements.txt b/rag_engine/requirements.txt deleted file mode 100644 index f3155834..00000000 --- a/rag_engine/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -langchain-community -chromadb -gpt4all -ollama -streamlit -python-dotenv -requests \ No newline at end of file From 335fbdc849a0595836c847b73801cde9003a085b Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sun, 15 Dec 2024 13:20:01 +0800 Subject: [PATCH 05/40] ByPass the Pytest --- OvStudent/Converted_Scripts_Annotated/conftest.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 OvStudent/Converted_Scripts_Annotated/conftest.py diff --git a/OvStudent/Converted_Scripts_Annotated/conftest.py b/OvStudent/Converted_Scripts_Annotated/conftest.py new file mode 100644 index 00000000..faa7a0d1 --- /dev/null +++ b/OvStudent/Converted_Scripts_Annotated/conftest.py @@ -0,0 +1,2 @@ +# old_tests/conftest.py +collect_ignore = ["."] \ No newline at end of file From 8c8827db528f35dbb28e5b1a8dbda4a4d01409b9 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:13:48 -0800 Subject: [PATCH 06/40] Update t_aucell_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py index ddd4a44f..c856d836 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_aucell_annotated.py @@ -1,4 +1,4 @@ -```python 
+``` # Line 1: Imports the omicverse library as ov. -- import omicverse as ov # Line 2: Imports the scanpy library as sc. -- import scanpy as sc # Line 3: Imports the scvelo library as scv. -- import scvelo as scv @@ -39,4 +39,4 @@ # Line 69: Performs differential gene expression analysis with t-test for adata. -- sc.tl.rank_genes_groups(adata, 'clusters', method='t-test',n_genes=100) # Line 71: Performs pathway enrichment analysis on adata. -- res=ov.single.pathway_enrichment(adata,pathways_dict=pathway_dict,organism='Mouse', # Line 73: Plots the pathway enrichment analysis results. -- ax=ov.single.pathway_enrichment_plot(res,plot_title='Enrichment',cmap='Reds', -``` \ No newline at end of file +``` From 64be2a9cb3c2de6ec888cebe6248df08e988af36 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:14:03 -0800 Subject: [PATCH 07/40] Update t_bulk2single_annotated.py --- .../Converted_Scripts_Annotated/t_bulk2single_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py index 059c3954..cf9be945 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_bulk2single_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the scanpy library for single-cell analysis -- import scanpy as sc # Line 2: Import the omicverse library for omics data analysis -- import omicverse as ov # Line 3: Import the matplotlib plotting library -- import matplotlib.pyplot as plt @@ -49,4 +49,4 @@ # Line 67: Compute the MDE embedding using the PCA coordinates of the generated data -- generate_adata.obsm["X_mde"] = ov.utils.mde(generate_adata.obsm["X_pca"]) # Line 68: Generate and display an embedding plot with specified color, palette and settings -- 
ov.utils.embedding(generate_adata,basis='X_mde',color=['clusters'],wspace=0.4, # Line 69: Use a Pyomic color palette, and 'small' frame -- palette=ov.utils.pyomic_palette(),frameon='small') -``` \ No newline at end of file +``` From 8b2a1f9b039bfb5cf135851081a0e801c727c9b1 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:14:16 -0800 Subject: [PATCH 08/40] Update t_bulk_combat_annotated.py --- .../Converted_Scripts_Annotated/t_bulk_combat_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py index acbc4e87..1765d3a3 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_bulk_combat_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the anndata library for working with annotated data objects. -- import anndata # Line 2: Imports the pandas library for data manipulation and analysis. -- import pandas as pd # Line 3: Imports the omicverse library, likely for omics data analysis. -- import omicverse as ov @@ -54,4 +54,4 @@ # Line 71: Specifies embedding basis as 'batch_correction|original|X_pca' and labels color by 'batch', with no frame. -- basis='batch_correction|original|X_pca', # Line 72: Specifies embedding color is 'batch'. -- color='batch', # Line 73: Specifies smaller frame around the plot. 
-- frameon='small') -``` \ No newline at end of file +``` From ae7c103b8c0f4a3d4dbf6e40a807d5c40aff18b0 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:14:32 -0800 Subject: [PATCH 09/40] Update t_bulktrajblend_annotated.py --- .../Converted_Scripts_Annotated/t_bulktrajblend_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py index 03f41109..4678c80b 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_bulktrajblend_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov -- import omicverse as ov # Line 2: Import the mde function from omicverse.utils -- from omicverse.utils import mde # Line 3: Import the scanpy library as sc -- import scanpy as sc @@ -102,4 +102,4 @@ # Line 139: Specify group -- groups='clusters') # Line 141: Generate and display a PAGA graph for the interpolated AnnData -- ov.utils.plot_paga(adata1,basis='mde', size=50, alpha=.1,title='PAGA LTNN-graph', # Line 142: Set plotting parameters -- min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False) -``` \ No newline at end of file +``` From a439d2c688477d2a7540795170b11ec35acaa675 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:14:48 -0800 Subject: [PATCH 10/40] Update t_cellanno_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py index b52d8c00..d21bfa84 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py +++ 
b/OvStudent/Converted_Scripts_Annotated/t_cellanno_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov -- import omicverse as ov # Line 2: Print the version of the omicverse library. -- print(f'omicverse version:{ov.__version__}') # Line 3: Import the scanpy library as sc -- import scanpy as sc @@ -93,4 +93,4 @@ # Line 161: Print the keys of the marker dictionary. -- marker_dict.keys() # Line 163: Print the marker genes for the 'B cell' cell type. -- marker_dict['B cell'] # Line 165: Get a list of tissues in the pySCSA database. -- scsa.get_model_tissue() -``` \ No newline at end of file +``` From 8965d57ee4f43d47812083c70d90cfcec49f765b Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:15:02 -0800 Subject: [PATCH 11/40] Update t_cellfate_gene_annotated.py --- .../Converted_Scripts_Annotated/t_cellfate_gene_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py index 6e330f0c..156a8c67 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_cellfate_gene_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov. -- import omicverse as ov # Line 2: Import the scvelo library as scv. -- import scvelo as scv # Line 3: Import the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt @@ -162,4 +162,4 @@ # Line 161: Set the size of the heatmap figure. -- g.fig.set_size_inches(2, 4) # Line 162: Set the font size of the y-axis labels in the heatmap. -- g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(),fontsize=12) # Line 163: Display the heatmap. 
-- plt.show() -``` \ No newline at end of file +``` From 2396aa5c79ddfe415298a6a1623c195401703df7 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:15:15 -0800 Subject: [PATCH 12/40] Update t_cellfate_genesets_annotated.py --- .../t_cellfate_genesets_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_cellfate_genesets_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cellfate_genesets_annotated.py index 60b02493..1aae2d40 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_cellfate_genesets_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_cellfate_genesets_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov -- import omicverse as ov # Line 2: Import the scvelo library as scv -- import scvelo as scv # Line 3: Import the matplotlib.pyplot module as plt -- import matplotlib.pyplot as plt @@ -40,4 +40,4 @@ # Line 60: Generate the word cloud -- gw_obj1.get() # Line 62: Plot a heatmap for the wordcloud with a specific figure width and color map -- g=gw_obj1.plot_heatmap(figwidth=6,cmap='RdBu_r') # Line 63: Set the main title of the plot. 
-- plt.suptitle('CellFateGenie',x=0.18,y=0.95, -``` \ No newline at end of file +``` From 70c911b7dd5d2c5714190cdef800cf7baa263726 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:15:31 -0800 Subject: [PATCH 13/40] Update t_cluster_space_annotated.py --- .../Converted_Scripts_Annotated/t_cluster_space_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py index b3c4e884..07f51181 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_cluster_space_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov. -- import omicverse as ov # Line 3: Import the scanpy library as sc. -- import scanpy as sc # Line 5: Set plotting parameters using omicverse. -- ov.plot_set() @@ -80,4 +80,4 @@ # Line 172: Print the ARI for mclustpy_BINARY. -- print('mclustpy_BINARY: Adjusted rand index = %.2f' %ARI) # Line 174: Calculate the adjusted rand index for mclust_CAST compared to the Ground Truth and print it. -- ARI = adjusted_rand_score(obs_df['mclust_CAST'], obs_df['Ground Truth']) # Line 175: Print the ARI for mclust_CAST. 
-- print('mclust_CAST: Adjusted rand index = %.2f' %ARI) -``` \ No newline at end of file +``` From b52a6acc78b48f2b70644dd6fe1bab5a04f1ccd6 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:15:44 -0800 Subject: [PATCH 14/40] Update t_cnmf_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py index d9c2daef..37737b81 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_cnmf_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the scanpy library for single-cell analysis -- import scanpy as sc # Line 2: Import the omicverse library, likely for multi-omics analysis and visualization -- import omicverse as ov # Line 3: Set plotting style using omicverse's plot_set function -- ov.plot_set() @@ -184,4 +184,4 @@ # Line 183: -- # Line 184: Create a dotplot of top genes for each cNMF cluster using scanpy -- sc.pl.dotplot(adata,plot_genes, # Line 185: -- "cNMF_cluster", dendrogram=False,standard_scale='var',) -``` \ No newline at end of file +``` From 66ebbae8b60a81dd47c469314b32c22fe09b06f5 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:15:56 -0800 Subject: [PATCH 15/40] Update t_cytotrace_annotated.py --- .../Converted_Scripts_Annotated/t_cytotrace_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py index b494b4cf..b1848238 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_cytotrace_annotated.py @@ -1,4 +1,4 @@ -```python +``` 
# Line 1: Import the omicverse library as ov. -- import omicverse as ov # Line 2: Apply default plotting settings from omicverse. -- ov.plot_set() # Line 4: Import the scvelo library as scv. -- import scvelo as scv @@ -22,4 +22,4 @@ # Line 27: Generate another UMAP embedding plot of adata, colored by CytoTRACE2_Potency and CytoTRACE2_Relative. -- ov.utils.embedding(adata,basis='X_umap', # Line 28: Set plot frame to 'small', colormap to 'Reds' and horizontal spacing. -- color=['CytoTRACE2_Potency','CytoTRACE2_Relative'], # Line 29: Set plot frame to 'small', colormap to 'Reds' and horizontal spacing. -- frameon='small',cmap='Reds',wspace=0.55) -``` \ No newline at end of file +``` From c68b6d01377f4249f4404be796d2bed2c9fa029d Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:16:07 -0800 Subject: [PATCH 16/40] Update t_deg_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py index f35fe220..ffc26145 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_deg_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the omicverse library as ov -- import omicverse as ov # Line 2: Imports the scanpy library as sc -- import scanpy as sc # Line 3: Imports the matplotlib.pyplot library as plt -- import matplotlib.pyplot as plt @@ -126,4 +126,4 @@ # Line 160: Sets the y label of the colorbar -- cbar.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2) # Line 161: Disables grid in the colorbar -- cbar.grid(False) # Line 162: Returns the axis object -- return ax -``` \ No newline at end of file +``` From 516edcb4d89d9f657b00963c89267314fb2372d5 Mon Sep 17 00:00:00 2001 From: HendricksJudy 
<61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:16:18 -0800 Subject: [PATCH 17/40] Update t_deseq2_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py index 9fdccd5e..a21dccbc 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_deseq2_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the omicverse library as ov. -- import omicverse as ov # Line 2: Sets the plotting style for omicverse. -- ov.utils.ov_plot_set() # Line 3: Reads data from a URL into a pandas DataFrame, using the first column as the index and the second row as the header. -- data=ov.utils.read('https://raw.githubusercontent.com/Starlitnightly/Pyomic/master/sample/counts.txt',index_col=0,header=1) @@ -29,4 +29,4 @@ # Line 44: Plots the gene set enrichment results. -- gsea_obj.plot_enrichment(num=10,node_size=[10,20,30], # Line 49: Displays the first 5 indices of the enrichment results. -- gsea_obj.enrich_res.index[:5] # Line 51: Plots a Gene Set Enrichment Analysis plot for a specified gene set. 
-- fig=gsea_obj.plot_gsea(term_num=1, -``` \ No newline at end of file +``` From 47344e0459cd7311203a23ea6489ba61fdb059b5 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:16:31 -0800 Subject: [PATCH 18/40] Update t_gptanno_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py index f99afd6a..907c49aa 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_gptanno_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: -- import omicverse as ov # Line 1: Imports the omicverse library and assigns it the alias 'ov'. # Line 2: -- print(f'omicverse version:{ov.__version__}') @@ -173,4 +173,4 @@ # Line 123: Specifies the local model and top number of genes. # Line 124: -- result # Line 124: Displays the result. -``` \ No newline at end of file +``` From f41c4dfa9734b977258589acc3a42101c39688c6 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:16:46 -0800 Subject: [PATCH 19/40] Update t_mapping_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py index c3a4bd91..a34c18e7 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_mapping_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the omicverse library and assigns it the alias ov. -- import omicverse as ov # Line 3: Imports the scanpy library and assigns it the alias sc. 
-- import scanpy as sc # Line 5: Sets the plotting parameters for omicverse. -- ov.utils.ov_plot_set() @@ -33,4 +33,4 @@ # Line 62: Converts cell type labels to strings for column name compatibility. -- clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labels # Line 64: Creates a context for matplotlib rc parameters for specific plot configurations. -- with mpl.rc_context({'figure.figsize': (8, 8),'axes.grid': False}): # Line 65: Generates and displays a spatial plot using omicverse, visualizing cell types. -- fig = ov.pl.plot_spatial( -``` \ No newline at end of file +``` From 7e2019899c7bbe31a3775cde14b4a46465c2d5ed Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:17:03 -0800 Subject: [PATCH 20/40] Update t_metatime_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py index a8fd8782..80e0fe4a 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_metatime_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the omicverse library as ov. -- import omicverse as ov # Line 2: Sets up the plotting configurations using the ov_plot_set function. -- ov.utils.ov_plot_set() # Line 9: Imports the scanpy library as sc. -- import scanpy as sc @@ -22,4 +22,4 @@ # Line 40: Turns the frame off for the plot. -- color=["Major_MetaTiME"], # Line 41: Sets the number of columns for subplots to 1. -- frameon=False, # Line 42: Closes the function call. 
-- ncols=1, -``` \ No newline at end of file +``` From 1b9fde164b9064b3ad70bf8b463bfc188745f7f1 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:17:17 -0800 Subject: [PATCH 21/40] Update t_mofa_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py index adce6d70..a4c15545 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_mofa_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: import omicverse as ov -- import omicverse as ov # Line 2: rna=ov.utils.read('data/sample/rna_p_n_raw.h5ad') -- rna=ov.utils.read('data/sample/rna_p_n_raw.h5ad') # Line 3: atac=ov.utils.read('data/sample/atac_p_n_raw.h5ad') -- atac=ov.utils.read('data/sample/atac_p_n_raw.h5ad') @@ -38,4 +38,4 @@ # Line 56: pymofa_obj.plot_weights(view='RNA',factor=6,color='#5de25d', -- pymofa_obj.plot_weights(view='RNA',factor=6,color='#5de25d', # Line 57: ascending=True) -- ascending=True) # Line 59: pymofa_obj.plot_top_feature_heatmap(view='RNA') -- pymofa_obj.plot_top_feature_heatmap(view='RNA') -``` \ No newline at end of file +``` From 35eeaad1235ce3d42320fef0156812a9e2769ba7 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:17:29 -0800 Subject: [PATCH 22/40] Update t_mofa_glue_annotated.py --- .../Converted_Scripts_Annotated/t_mofa_glue_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py index 43fc2f73..ec20abb7 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py +++ 
b/OvStudent/Converted_Scripts_Annotated/t_mofa_glue_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library. -- import omicverse as ov # Line 2: Set the plotting parameters for omicverse. -- ov.utils.ov_plot_set() # Line 4: Read RNA data from an h5ad file. -- rna=ov.utils.read("chen_rna-emb.h5ad") @@ -47,4 +47,4 @@ # Line 76: Plot the weights of genes for specific factors for the RNA view. -- pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=1,factor2=3,) # Line 78: Plot the weights for the specified factor in RNA data. -- pymofa_obj.plot_weights(view='RNA',factor=1, ascending=False) # Line 81: Plot a heatmap of the top features for the RNA view. -- pymofa_obj.plot_top_feature_heatmap(view='RNA') -``` \ No newline at end of file +``` From a185d54e2869d74a2051da2f2e8938ac66eaea6a Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:17:40 -0800 Subject: [PATCH 23/40] Update t_network_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_network_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py index b2b0210f..e70ea3f5 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_network_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov. -- import omicverse as ov # Line 2: Set plot settings for omicverse. -- ov.utils.ov_plot_set() # Line 4: Create a list of gene names. -- gene_list=['FAA4','POX1','FAT1','FAS2','FAS1','FAA1','OLE1','YJU3','TGL3','INA1','TGL5'] @@ -12,4 +12,4 @@ # Line 15: Set the species as 4932. -- species=4932) # Line 18: Perform interaction analysis on the pyPPI object. -- ppi.interaction_analysis() # Line 20: Plot the network of the pyPPI object. 
-- ppi.plot_network() -``` \ No newline at end of file +``` From cc348a1f8ff321f078b87494e6d63e9b162b71aa Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:19:53 -0800 Subject: [PATCH 24/40] Update t_nocd_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py index d6d7e7fd..5d68baa7 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_nocd_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the omicverse library as ov -- import omicverse as ov # Line 2: Imports the anndata library -- import anndata # Line 3: Imports the scanpy library as sc -- import scanpy as sc @@ -26,4 +26,4 @@ # Line 34: Calculates the nocd scores using the scNOCD object -- scbrca.calculate_nocd() # Line 36: Generates a UMAP plot colored by 'leiden' and 'nocd', setting spacing and palette -- sc.pl.umap(scbrca.adata, color=['leiden','nocd'],wspace=0.4,palette=sc_color) # Line 38: Generates a UMAP plot colored by 'leiden' and 'nocd_n', setting spacing and palette -- sc.pl.umap(scbrca.adata, color=['leiden','nocd_n'],wspace=0.4,palette=sc_color) -``` \ No newline at end of file +``` From 4f0221d01b5398577e4f17649667c1b61d8cf4f4 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:20:32 -0800 Subject: [PATCH 25/40] Update t_preprocess_gpu_annotated.py --- .../Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py index fe6e2b61..754649fe 100644 --- 
a/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_preprocess_gpu_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: import omicverse as ov -- import omicverse as ov # Line 2: import scanpy as sc -- import scanpy as sc # Line 3: ov.plot_set() -- ov.plot_set() @@ -119,4 +119,4 @@ # Line 158: for i in adata.obs['leiden'].cat.categories: -- for i in adata.obs['leiden'].cat.categories: # Line 159: axes[i].set_ylim(y_min,y_max) -- axes[i].set_ylim(y_min,y_max) # Line 160: plt.suptitle('Stacking_vol',fontsize=12) -- plt.suptitle('Stacking_vol',fontsize=12) -``` \ No newline at end of file +``` From 976c47588145a86f6f88e77a45b5b6a13cb8e5ec Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:20:47 -0800 Subject: [PATCH 26/40] Update t_scmulan_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py index c1cdd303..1431fb86 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_scmulan_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: import os -- import os # Line 3: import scanpy as sc -- import scanpy as sc # Line 4: import omicverse as ov -- import omicverse as ov @@ -36,4 +36,4 @@ # Line 71: top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20] -- top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20] # Line 74: selected_cell_types = top_celltypes -- selected_cell_types = top_celltypes # Line 75: ov.externel.scMulan.visualize_selected_cell_types(adata_mulan,selected_cell_types,smoothing=True) -- ov.externel.scMulan.visualize_selected_cell_types(adata_mulan,selected_cell_types,smoothing=True) -``` \ No 
newline at end of file +``` From 0ada47b716b323a7ea4c2c3590da62d96ad9a307 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:20:58 -0800 Subject: [PATCH 27/40] Update t_simba_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py index 550644d1..fb006ca9 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_simba_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov. -- import omicverse as ov # Line 2: Import the mde function from the omicverse.utils module. -- from omicverse.utils import mde # Line 3: Set the working directory to 'result_human_pancreas'. -- workdir = 'result_human_pancreas' @@ -18,4 +18,4 @@ # Line 26: Compute the neighbor graph using the X_simba representation. -- sc.pp.neighbors(adata, use_rep="X_simba") # Line 27: Compute the UMAP embedding. -- sc.tl.umap(adata) # Line 28: Plot the UMAP embedding colored by cell_type1 and batch. 
-- sc.pl.umap(adata,color=['cell_type1','batch']) -``` \ No newline at end of file +``` From 747e2b3101336445478fceec22e362f881ab2499 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:21:08 -0800 Subject: [PATCH 28/40] Update t_single2spatial_annotated.py --- .../Converted_Scripts_Annotated/t_single2spatial_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py index 0b486d85..f6f93964 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_single2spatial_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the scanpy library for single-cell analysis. -- import scanpy as sc # Line 2: Import the pandas library for data manipulation. -- import pandas as pd # Line 3: Import the numpy library for numerical operations. -- import numpy as np @@ -53,4 +53,4 @@ # Line 75: Turn off displaying plot. -- ncols=4, # Line 76: Use a specific color palette from omicverse for the plot. -- show=False, # Line 77: Use a specific color palette from omicverse for the plot. 
-- palette=ov.utils.ov_palette()[11:] -``` \ No newline at end of file +``` From f8999bd21f8fd6facbc5661135928b986d1baca6 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:21:21 -0800 Subject: [PATCH 29/40] Update t_spaceflow_annotated.py --- .../Converted_Scripts_Annotated/t_spaceflow_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py index a7f7f0d8..e56c33fc 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_spaceflow_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library and alias it as ov. -- import omicverse as ov # Line 3: Import the scanpy library and alias it as sc. -- import scanpy as sc # Line 5: Set plotting parameters for omicverse. -- ov.utils.ov_plot_set() @@ -28,4 +28,4 @@ # Line 38: Cluster the AnnData object using a Gaussian Mixture Model on the spaceflow representation. -- ov.utils.cluster(adata,use_rep='spaceflow',method='GMM',n_components=7,covariance_type='full', # Line 39: Continuation of GMM parameters -- tol=1e-9, max_iter=1000, random_state=3607) # Line 41: Generate a spatial plot colored by GMM cluster assignment and the ground truth annotations. 
-- sc.pl.spatial(adata, color=['gmm_cluster',"Ground Truth"]) -``` \ No newline at end of file +``` From e74fd99190b7685853e0782bacda65f3572fa387 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:21:39 -0800 Subject: [PATCH 30/40] Update t_stagate_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py index 6b174081..8ca1f9c4 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_stagate_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library, aliased as ov. -- import omicverse as ov # Line 3: Import the scanpy library, aliased as sc. -- import scanpy as sc # Line 5: Set the plotting parameters for omicverse. -- ov.plot_set() @@ -63,4 +63,4 @@ # Line 103: Prints the adjusted rand index of leiden_STAGATE vs ground truth. -- print('leiden_STAGATE: Adjusted rand index = %.2f' %ARI) # Line 105: Calculates and prints Adjusted Rand Index between louvain_STAGATE labels and ground truth. -- ARI = adjusted_rand_score(obs_df['louvain_STAGATE'], obs_df['Ground Truth']) # Line 106: Prints the adjusted rand index of louvain_STAGATE vs ground truth. 
-- print('louvain_STAGATE: Adjusted rand index = %.2f' %ARI) -``` \ No newline at end of file +``` From 8699f7676e7fc7f026e96dde3db9f3a39efe3ceb Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:22:05 -0800 Subject: [PATCH 31/40] Update t_staligner_annotated.py --- .../Converted_Scripts_Annotated/t_staligner_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py index 5023d6de..91483b51 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_staligner_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the csr_matrix class from the scipy.sparse module for creating sparse matrices. -- from scipy.sparse import csr_matrix # Line 2: Imports the omicverse library as ov. -- import omicverse as ov # Line 3: Imports the scanpy library as sc. -- import scanpy as sc @@ -53,4 +53,4 @@ # Line 76: Sets the title of the second spatial plot and sets the size. -- _sc_1[0].set_title('Stereo-seq',size=title_size) # Line 77: Inverts the y-axis of the second spatial plot. -- _sc_1[0].invert_yaxis() # Line 78: Displays the generated plots. 
-- plt.show() -``` \ No newline at end of file +``` From 30cd0a160c03158f35af1da277ec2eac26e7e4a8 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:22:18 -0800 Subject: [PATCH 32/40] Update t_stt_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py index a2845c8d..40367e49 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_stt_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library -- import omicverse as ov # Line 3: Import the scvelo library as scv -- import scvelo as scv # Line 4: Import the scanpy library as sc -- import scanpy as sc @@ -67,4 +67,4 @@ # Line 94: Continue plotting embedding for 'Sim1' expression using expression layer -- cmap='RdBu_r',ax=axes[3] # Line 95: Continue plotting embedding for 'Sim1' expression using expression layer -- ) # Line 96: Adjust the layout to fit the subplots -- plt.tight_layout() -``` \ No newline at end of file +``` From d8e2108296a921d416c68a3901a76ac94257abcb Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:22:34 -0800 Subject: [PATCH 33/40] Update t_traj_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_traj_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_traj_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_traj_annotated.py index 978c3093..edd40c4c 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_traj_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_traj_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the scanpy library for single-cell data analysis. 
-- import scanpy as sc # Line 2: Import the scvelo library for RNA velocity analysis. -- import scvelo as scv # Line 3: Import matplotlib's pyplot for plotting. -- import matplotlib.pyplot as plt @@ -61,4 +61,4 @@ # Line 81: Group the data by clusters. -- groups='clusters') # Line 83: Plot the PAGA graph using omicverse's plotting function with LTNN graph title. -- ov.utils.plot_paga(adata,basis='umap', size=50, alpha=.1,title='PAGA LTNN-graph', # Line 84: specify the PAGA graph plotting parameters, legend locations and whether to show plot -- min_edge_width=2, node_size_scale=1.5,show=False,legend_loc=False) -``` \ No newline at end of file +``` From 4cf3feea5c40c8fa29c12a510f6e0327e442e7f0 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:22:46 -0800 Subject: [PATCH 34/40] Update t_via_annotated.py --- OvStudent/Converted_Scripts_Annotated/t_via_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_via_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_via_annotated.py index 1e233336..e74f630c 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_via_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_via_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Import the omicverse library as ov. -- import omicverse as ov # Line 2: Import the scanpy library as sc. -- import scanpy as sc # Line 3: Import the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt @@ -46,4 +46,4 @@ # Line 60: Plot the gene trend heatmap, highlighting lineage 2 and save as PNG. -- fig,ax=v0.plot_gene_trend_heatmap(gene_list=gene_list_magic,figsize=(4,4), # Line 61: marker_lineages=[2]) # Line 62: Save the gene trend heatmap as a PNG with specified DPI and tight bounding box. 
-- fig.savefig('figures/via_fig9.png',dpi=300,bbox_inches = 'tight') -``` \ No newline at end of file +``` From b7c25f8d4830ef12ec997ac2ba80a17830ee7474 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:22:58 -0800 Subject: [PATCH 35/40] Update t_visualize_bulk_annotated.py --- .../Converted_Scripts_Annotated/t_visualize_bulk_annotated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OvStudent/Converted_Scripts_Annotated/t_visualize_bulk_annotated.py b/OvStudent/Converted_Scripts_Annotated/t_visualize_bulk_annotated.py index 1b37ef45..ad4e4442 100644 --- a/OvStudent/Converted_Scripts_Annotated/t_visualize_bulk_annotated.py +++ b/OvStudent/Converted_Scripts_Annotated/t_visualize_bulk_annotated.py @@ -1,4 +1,4 @@ -```python +``` # Line 1: Imports the omicverse library as ov. -- import omicverse as ov # Line 2: Imports the scanpy library as sc. -- import scanpy as sc # Line 3: Imports the matplotlib.pyplot library as plt. -- import matplotlib.pyplot as plt @@ -47,4 +47,4 @@ # Line 63: Adds text for p value annotation. -- text='$p={}$'.format(round(0.001,3)), # Line 64: Sets the font size, color, and alignment for p-value annotation. -- fontsize=11,fontcolor='#000000', # Line 65: Sets the horizontal alignment for the p-value annotation. 
-- horizontalalignment='center',) -``` \ No newline at end of file +``` From 769d4ddad666a92dc9487896dfe1aa81640a7417 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:28:01 -0800 Subject: [PATCH 36/40] Delete OvStudent/Converted_Scripts_Annotated/conftest.py --- OvStudent/Converted_Scripts_Annotated/conftest.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 OvStudent/Converted_Scripts_Annotated/conftest.py diff --git a/OvStudent/Converted_Scripts_Annotated/conftest.py b/OvStudent/Converted_Scripts_Annotated/conftest.py deleted file mode 100644 index faa7a0d1..00000000 --- a/OvStudent/Converted_Scripts_Annotated/conftest.py +++ /dev/null @@ -1,2 +0,0 @@ -# old_tests/conftest.py -collect_ignore = ["."] \ No newline at end of file From e1817b46d0713991f4f586ac0a288ecc2937efff Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sun, 15 Dec 2024 14:35:47 +0800 Subject: [PATCH 37/40] ByPASS the CSA folder --- pytest.ini | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..0fdd97cd --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +norecursedirs = Converted_Scripts_Annotated \ No newline at end of file From b8b5fc152534367c75fd90f54b724b5558690d7e Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:41:09 -0800 Subject: [PATCH 38/40] Update pytest.ini --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 0fdd97cd..4be80396 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -norecursedirs = Converted_Scripts_Annotated \ No newline at end of file +norecursedirs = /OvStudent/Converted_Scripts_Annotated From 5a440661a2dcd0c77344339c0ae83ab4ecfd7e79 Mon Sep 17 00:00:00 2001 From: HendricksJudy 
<61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 22:51:07 -0800 Subject: [PATCH 39/40] Update and rename pytest.ini to setup.cfg --- pytest.ini | 2 -- setup.cfg | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 pytest.ini create mode 100644 setup.cfg diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 4be80396..00000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -norecursedirs = /OvStudent/Converted_Scripts_Annotated diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..dc5a51b3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[tool:pytest] +norecursedirs = OvStudent/Converted_Scripts_Annotated From 2cd77e56901e9d76997a8b51f7f0884a79f28382 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:03:14 -0800 Subject: [PATCH 40/40] Update setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index dc5a51b3..ad8efd0a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,2 @@ [tool:pytest] -norecursedirs = OvStudent/Converted_Scripts_Annotated +norecursedirs = omicverse/OvStudent/Converted_Scripts_Annotated