From aaa341472e7874c8a2e9757aa9330335536acf57 Mon Sep 17 00:00:00 2001 From: HendricksJudy <61645034+HendricksJudy@users.noreply.github.com> Date: Sat, 14 Dec 2024 21:19:10 -0800 Subject: [PATCH] Delete rag_engine directory --- rag_engine/PreViewRead101.txt | 80 --- rag_engine/__init__.py | 3 - rag_engine/app.py | 388 ------------ rag_engine/config.json | 5 - rag_engine/ovrawmjson/KBI.json | 176 ------ rag_engine/ovrawmjson/t_anno_trans.json | 58 -- rag_engine/ovrawmjson/t_aucell.json | 70 -- rag_engine/ovrawmjson/t_bulk_combat.json | 74 --- rag_engine/ovrawmjson/t_cellanno.json | 126 ---- rag_engine/ovrawmjson/t_cellfate.json | 98 --- rag_engine/ovrawmjson/t_cellfate_gene.json | 186 ------ .../ovrawmjson/t_cellfate_genesets.json | 74 --- rag_engine/ovrawmjson/t_cellphonedb.json | 214 ------- rag_engine/ovrawmjson/t_cluster.json | 98 --- rag_engine/ovrawmjson/t_cluster_space.json | 122 ---- rag_engine/ovrawmjson/t_cnmf.json | 110 ---- rag_engine/ovrawmjson/t_commot_flowsig.json | 110 ---- rag_engine/ovrawmjson/t_cytotrace.json | 26 - rag_engine/ovrawmjson/t_deg.json | 82 --- rag_engine/ovrawmjson/t_deseq2.json | 82 --- rag_engine/ovrawmjson/t_gptanno.json | 98 --- rag_engine/ovrawmjson/t_mapping.json | 46 -- rag_engine/ovrawmjson/t_metacells.json | 94 --- rag_engine/ovrawmjson/t_metatime.json | 42 -- rag_engine/ovrawmjson/t_mofa.json | 78 --- rag_engine/ovrawmjson/t_mofa_glue.json | 98 --- rag_engine/ovrawmjson/t_network.json | 30 - rag_engine/ovrawmjson/t_nocd.json | 38 -- rag_engine/ovrawmjson/t_preprocess.json | 130 ---- rag_engine/ovrawmjson/t_preprocess_cpu.json | 122 ---- rag_engine/ovrawmjson/t_preprocess_gpu.json | 122 ---- rag_engine/ovrawmjson/t_scdeg.json | 122 ---- rag_engine/ovrawmjson/t_scdrug.json | 74 --- rag_engine/ovrawmjson/t_scmulan.json | 82 --- rag_engine/ovrawmjson/t_simba.json | 50 -- rag_engine/ovrawmjson/t_single_batch.json | 138 ---- rag_engine/ovrawmjson/t_slat.json | 130 ---- rag_engine/ovrawmjson/t_spaceflow.json | 42 -- rag_engine/ovrawmjson/t_stagate.json | 90 --- rag_engine/ovrawmjson/t_staligner.json | 46 -- rag_engine/ovrawmjson/t_starfysh.json | 126 ---- rag_engine/ovrawmjson/t_stt.json | 218 ------- rag_engine/ovrawmjson/t_tcga.json | 38 -- rag_engine/ovrawmjson/t_tosica.json | 86 --- rag_engine/ovrawmjson/t_traj.json | 98 --- rag_engine/ovrawmjson/t_via.json | 58 -- rag_engine/ovrawmjson/t_via_velo.json | 38 -- rag_engine/ovrawmjson/t_visualize_bulk.json | 34 - .../ovrawmjson/t_visualize_colorsystem.json | 46 -- rag_engine/ovrawmjson/t_visualize_single.json | 90 --- rag_engine/ovrawmjson/t_wgcna.json | 98 --- rag_engine/rag_system.py | 596 ------------------ rag_engine/requirements.txt | 7 - 53 files changed, 5387 deletions(-) delete mode 100644 rag_engine/PreViewRead101.txt delete mode 100644 rag_engine/__init__.py delete mode 100644 rag_engine/app.py delete mode 100644 rag_engine/config.json delete mode 100644 rag_engine/ovrawmjson/KBI.json delete mode 100644 rag_engine/ovrawmjson/t_anno_trans.json delete mode 100644 rag_engine/ovrawmjson/t_aucell.json delete mode 100644 rag_engine/ovrawmjson/t_bulk_combat.json delete mode 100644 rag_engine/ovrawmjson/t_cellanno.json delete mode 100644 rag_engine/ovrawmjson/t_cellfate.json delete mode 100644 rag_engine/ovrawmjson/t_cellfate_gene.json delete mode 100644 rag_engine/ovrawmjson/t_cellfate_genesets.json delete mode 100644 rag_engine/ovrawmjson/t_cellphonedb.json delete mode 100644 rag_engine/ovrawmjson/t_cluster.json delete mode 100644 rag_engine/ovrawmjson/t_cluster_space.json delete mode 100644 rag_engine/ovrawmjson/t_cnmf.json delete mode 100644 rag_engine/ovrawmjson/t_commot_flowsig.json delete mode 100644 rag_engine/ovrawmjson/t_cytotrace.json delete mode 100644 rag_engine/ovrawmjson/t_deg.json delete mode 100644 rag_engine/ovrawmjson/t_deseq2.json delete mode 100644 rag_engine/ovrawmjson/t_gptanno.json delete mode 100644 rag_engine/ovrawmjson/t_mapping.json delete mode 100644 rag_engine/ovrawmjson/t_metacells.json delete mode 100644 rag_engine/ovrawmjson/t_metatime.json delete mode 100644 rag_engine/ovrawmjson/t_mofa.json delete mode 100644 rag_engine/ovrawmjson/t_mofa_glue.json delete mode 100644 rag_engine/ovrawmjson/t_network.json delete mode 100644 rag_engine/ovrawmjson/t_nocd.json delete mode 100644 rag_engine/ovrawmjson/t_preprocess.json delete mode 100644 rag_engine/ovrawmjson/t_preprocess_cpu.json delete mode 100644 rag_engine/ovrawmjson/t_preprocess_gpu.json delete mode 100644 rag_engine/ovrawmjson/t_scdeg.json delete mode 100644 rag_engine/ovrawmjson/t_scdrug.json delete mode 100644 rag_engine/ovrawmjson/t_scmulan.json delete mode 100644 rag_engine/ovrawmjson/t_simba.json delete mode 100644 rag_engine/ovrawmjson/t_single_batch.json delete mode 100644 rag_engine/ovrawmjson/t_slat.json delete mode 100644 rag_engine/ovrawmjson/t_spaceflow.json delete mode 100644 rag_engine/ovrawmjson/t_stagate.json delete mode 100644 rag_engine/ovrawmjson/t_staligner.json delete mode 100644 rag_engine/ovrawmjson/t_starfysh.json delete mode 100644 rag_engine/ovrawmjson/t_stt.json delete mode 100644 rag_engine/ovrawmjson/t_tcga.json delete mode 100644 rag_engine/ovrawmjson/t_tosica.json delete mode 100644 rag_engine/ovrawmjson/t_traj.json delete mode 100644 rag_engine/ovrawmjson/t_via.json delete mode 100644 rag_engine/ovrawmjson/t_via_velo.json delete mode 100644 rag_engine/ovrawmjson/t_visualize_bulk.json delete mode 100644 rag_engine/ovrawmjson/t_visualize_colorsystem.json delete mode 100644 rag_engine/ovrawmjson/t_visualize_single.json delete mode 100644 rag_engine/ovrawmjson/t_wgcna.json delete mode 100644 rag_engine/rag_system.py delete mode 100644 rag_engine/requirements.txt diff --git a/rag_engine/PreViewRead101.txt b/rag_engine/PreViewRead101.txt deleted file mode 100644 index 9567797f..00000000 --- a/rag_engine/PreViewRead101.txt +++ /dev/null @@ -1,80 +0,0 @@ -This rag_engine is for preview only. It is not for production use. - -First time users should read the following instructions carefully. - -### Tutorial for RAG Engine Preview - -This tutorial will guide you through the setup and usage of the RAG Engine Preview. Follow the steps below to get started. - -#### Prerequisites - -Ensure you have the following installed: -- Python 3.12 -- Required Python packages: `langchain`, `langchain-community`, `sentence-transformers`, `numpy`, `faiss-cpu`, `chromadb`, `requests`, `psutil`, `prometheus_client`, `tenacity`, `streamlit` - -#### Step 1: Install Dependencies - -Install the necessary dependencies using pip: - -```bash -pip install langchain langchain-community sentence-transformers numpy faiss-cpu chromadb requests psutil prometheus_client tenacity streamlit -``` - -#### Step 2: Dive into the rag_engine_preview - -```bash -cd rag_engine -``` - -#### Step 3: Double-Check the Required Files - -Create the following files in your project directory: - -1. `rag_system.py` -2. `app.py` -3. `PreViewRead101.txt` -4. `ovrawjson folder` -5. `__init__.py` -6. `config.json` -7. `requirements.txt` - -Ensure the files are correctly named and placed in the project directory. - -#### Step 4: test the RAG System - -Run the app.py and rag_system.py to test the RAG System. - -#### Step 5: Run the Streamlit Application - -Navigate to the project directory and run the Streamlit application: - -```bash -streamlit run app.py -``` - -#### Step 6: Interact with the Application - -Attention: Your first time to initialize the RAG Engine Preview, you need to wait for a while to load the model. (Up to 30 minutes) - -Open the provided URL in your browser to interact with the RAG Engine Preview. You can enter queries and view the results processed by the RAG system. - -#### Step 7: Review the Logs - -Logs are generated in the `logs` directory. Review these logs to monitor the system's performance and debug any issues. - -#### Additional Information - -- **Configuration**: Modify the configuration settings in the Streamlit sidebar to customize the models and rate limits. -- **Health Checks**: Use the system health and status indicators in the sidebar to ensure the system is running correctly. -- **Query History**: View the history of queries processed by the system in the sidebar. - -This completes the setup and usage tutorial for the RAG Engine Preview. -For any issues or feedback, please refer to the provided documentation or contact the support team. - -In the Next Update, we will provide more features and improvements to enhance the user experience. -1. The Local Reasoning Engine has 90% ability to understand the context of the query compared to the OpenAI o1 model in the Bioinformatics domain. -2. The online model API supports the user to query the data from the SOTA model. -3. The Local elastic search engine is able to search the data from the local database. -4. The Local elastic knowledge base is able to update and delete the data from the local database. - -Love from 3910❤️ \ No newline at end of file diff --git a/rag_engine/__init__.py b/rag_engine/__init__.py deleted file mode 100644 index 114a8f88..00000000 --- a/rag_engine/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .rag_system import RAGSystem - -__version__ = "1.0.0" \ No newline at end of file diff --git a/rag_engine/app.py b/rag_engine/app.py deleted file mode 100644 index a13bfd84..00000000 --- a/rag_engine/app.py +++ /dev/null @@ -1,388 +0,0 @@ -import streamlit as st -import json -from datetime import datetime, timezone, timedelta -import os -import subprocess -import time -import requests -import getpass -import psutil -from pathlib import Path -import logging -from logging.handlers import RotatingFileHandler -from collections import OrderedDict -# Import the RAGSystem -from rag_system import RAGSystem, RAGLogger - -# Set up logging with rotating file handler -def setup_logging(): - log_dir = Path("logs") - log_dir.mkdir(exist_ok=True) - - handler = RotatingFileHandler( - log_dir / 'streamlit_app.log', - maxBytes=10*1024*1024, # 10 MB - backupCount=5 - ) - - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler(), - handler - ] - ) - -setup_logging() - -# Initialize session state -def initialize_session_state(): - default_state = { - 'ollama_ready': False, - 'models_installed': False, - 'query_history': [], - 'rate_limiter': None, - 'query_cache': None, - 'config': { - 'file_selection_model': 'qwen2.5-coder:3b', - 'query_processing_model': 'qwen2.5-coder:7b', - 'rate_limit': 5, # seconds between queries - }, - 'current_time': datetime(2024, 12, 8, 13, 19, 36, tzinfo=timezone.utc), - 'current_user': 'HendricksJudy' - } - - for key, value in default_state.items(): - if key not in st.session_state: - st.session_state[key] = value - -initialize_session_state() - -# Cache for RAGSystem -@st.cache_resource -def get_rag_system(): - try: - json_directory = os.path.join(os.path.dirname(__file__), "ovrawmjson") - kbi_path = os.path.join(json_directory, "KBI.json") - return RAGSystem(json_directory, kbi_path) - except Exception as e: - logging.error(f"Failed to initialize RAG system: {str(e)}") - return None - -# System Monitor class with enhanced metrics -class SystemMonitor: - @staticmethod - def get_system_stats(): - process = psutil.Process() - memory = psutil.virtual_memory() - return { - 'memory_usage': process.memory_info().rss / 1024 / 1024, # MB - 'cpu_percent': psutil.cpu_percent(interval=1), - 'uptime': time.time() - process.create_time(), - 'system_memory': { - 'total': memory.total / (1024 ** 3), # GB - 'available': memory.available / (1024 ** 3), # GB - 'percent': memory.percent - } - } - - @staticmethod - def format_uptime(seconds): - return str(timedelta(seconds=int(seconds))) - -# RateLimiter class for query rate limiting -class RateLimiter: - def __init__(self, limit_seconds): - self.limit_seconds = limit_seconds - self.last_request_time = None - - def can_make_request(self): - if not self.last_request_time: - return True - time_since_last = time.time() - self.last_request_time - return time_since_last >= self.limit_seconds - - def time_until_next_request(self): - if not self.last_request_time: - return 0 - time_since_last = time.time() - self.last_request_time - return max(0, self.limit_seconds - time_since_last) - - def record_request(self): - self.last_request_time = time.time() - -# Initialize RateLimiter -if st.session_state['rate_limiter'] is None: - st.session_state['rate_limiter'] = RateLimiter(st.session_state['config']['rate_limit']) - -# QueryCache class for cache management -class QueryCache: - def __init__(self, max_size=1000): - self.cache = OrderedDict() - self.max_size = max_size - - def get(self, key): - return self.cache.get(key) - - def set(self, key, value): - self.cache[key] = value - self.cache.move_to_end(key) - if len(self.cache) > self.max_size: - self.cache.popitem(last=False) - -# Initialize QueryCache -if st.session_state['query_cache'] is None: - st.session_state['query_cache'] = QueryCache() - -# ConfigManager class for configuration management -class ConfigManager: - CONFIG_PATH = Path('config.json') - - @staticmethod - def load_config(): - if ConfigManager.CONFIG_PATH.exists(): - with open(ConfigManager.CONFIG_PATH, 'r') as f: - return json.load(f) - else: - return st.session_state['config'] - - @staticmethod - def save_config(config): - with open(ConfigManager.CONFIG_PATH, 'w') as f: - json.dump(config, f, indent=2) - -# Load configuration -st.session_state['config'] = ConfigManager.load_config() - -# Function to display the header -def show_header(): - col1, col2, col3 = st.columns([2, 1, 1]) - with col1: - st.title("Agentic OmicVerse 🧬") - with col2: - # Using the specified datetime - st.info(f"📅 UTC: {datetime(2024, 12, 8, 13, 20, 42, tzinfo=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}") - with col3: - # Using the specified username - st.info(f"👤 User: HendricksJudy") - -# Function to display system status -def show_system_status(): - stats = SystemMonitor.get_system_stats() - with st.sidebar: - st.header("System Status 📊") - col1, col2 = st.columns(2) - with col1: - st.metric("Memory (MB)", f"{stats['memory_usage']:.1f}") - st.metric("CPU %", f"{stats['cpu_percent']:.1f}") - with col2: - st.metric("Uptime", SystemMonitor.format_uptime(stats['uptime'])) - st.metric("Memory Usage %", f"{stats['system_memory']['percent']:.1f}") - st.progress(stats['system_memory']['percent'] / 100) - -# Function to check if Ollama server is running -def check_ollama_server() -> bool: - try: - response = requests.get("http://localhost:11434/api/version", timeout=5) - return response.status_code == 200 - except requests.RequestException: - return False - -# Function to display health status -def display_health_status(): - healthy, checks = check_system_health() - with st.sidebar: - st.header("System Health ✅" if healthy else "System Health ⚠️") - for component, status in checks.items(): - if status: - st.success(f"{component} is running") - else: - st.error(f"{component} is not running") - -# Function to perform health checks -def check_system_health(): - health_checks = { - 'Ollama Server': check_ollama_server(), - } - all_healthy = all(health_checks.values()) - return all_healthy, health_checks - -# Function to display configuration settings -def show_configuration(): - with st.sidebar: - st.header("Configuration ⚙️") - with st.expander("Model Settings"): - file_selection_model = st.selectbox( - "File Selection Model", - ["qwen2.5-coder:3b", "qwen2.5-coder:7b"], - index=["qwen2.5-coder:3b", "qwen2.5-coder:7b"].index( - st.session_state['config']['file_selection_model'] - ) - ) - query_processing_model = st.selectbox( - "Query Processing Model", - ["qwen2.5-coder:7b", "qwen2.5-coder:3b"], - index=["qwen2.5-coder:7b", "qwen2.5-coder:3b"].index( - st.session_state['config']['query_processing_model'] - ) - ) - rate_limit = st.slider( - "Rate Limit (seconds)", - min_value=1, - max_value=30, - value=st.session_state['config']['rate_limit'] - ) - - if st.button("Save Configuration"): - st.session_state['config'].update({ - 'file_selection_model': file_selection_model, - 'query_processing_model': query_processing_model, - 'rate_limit': rate_limit - }) - ConfigManager.save_config(st.session_state['config']) - st.session_state['rate_limiter'] = RateLimiter(rate_limit) - st.success("Configuration saved successfully.") - - -# Function to process query with progress tracking -def process_query_with_progress(query, rag_system): - progress_bar = st.progress(0) - status_text = st.empty() - try: - status_text.text("Finding relevant document...") - progress_bar.progress(25) - relevant_file = rag_system.find_relevant_file(query) - status_text.text("Processing query...") - progress_bar.progress(50) - answer = rag_system.process_query(query, relevant_file) - status_text.text("Updating history...") - progress_bar.progress(75) - - # Using the specified datetime for query history - query_time = datetime(2024, 12, 8, 13, 21, 29, tzinfo=timezone.utc) - st.session_state.query_history.append({ - 'query': query, - 'file': relevant_file, - 'answer': answer, - 'timestamp': query_time, - 'user': 'HendricksJudy' - }) - - st.session_state['rate_limiter'].record_request() - progress_bar.progress(100) - status_text.text("Complete!") - time.sleep(1) - progress_bar.empty() - status_text.empty() - return relevant_file, answer - except Exception as e: - logging.error(f"Query processing error: {str(e)}") - progress_bar.empty() - status_text.text(f"Error: {e}") - raise e - - -# QueryManager class -class QueryManager: - @staticmethod - def validate_query(query): - if not query or len(query.strip()) < 3: - return False, "Query must be at least 3 characters long" - if len(query) > 1000: - return False, "Query must be less than 1000 characters" - return True, "" - - -# Function to display query history -def show_query_history(): - with st.sidebar: - st.header("Query History 📜") - for idx, item in enumerate(reversed(st.session_state.query_history[-10:])): - with st.expander(f"Query {len(st.session_state.query_history) - idx}: {item['query'][:30]}..."): - st.markdown(f"**Time:** {item['timestamp'].strftime('%Y-%m-%d %H:%M:%S')} UTC") - st.markdown(f"**User:** {item['user']}") - st.markdown(f"**Document:** {item['file']}") - st.markdown(f"**Answer:** {item['answer']}") - st.markdown("---") - - -# Main function -def main(): - show_header() - show_system_status() - display_health_status() - show_configuration() - - if st.button("Reset System"): - st.session_state.query_history = [] - st.session_state['rate_limiter'] = RateLimiter(st.session_state['config']['rate_limit']) - st.rerun() - - if not st.session_state['ollama_ready']: - if not check_ollama_server(): - st.error("❌ Ollama server is not running") - if st.button("🚀 Start Ollama Server"): - try: - subprocess.Popen(['ollama', 'serve']) - time.sleep(5) - if check_ollama_server(): - st.session_state['ollama_ready'] = True - st.success("✅ Ollama server started successfully") - st.rerun() - except FileNotFoundError: - st.error("❌ Ollama is not installed") - return - else: - st.session_state['ollama_ready'] = True - - rag_system = get_rag_system() - if rag_system is None: - st.error("Failed to initialize RAG system.") - return - - st.markdown("### Query Interface 🔍") - query = st.text_area( - "Enter your query:", - height=100, - placeholder="Enter your question about the documents..." - ) - - col1, col2 = st.columns([1, 5]) - with col1: - submit = st.button("🚀 Submit") - with col2: - if st.button("🗑️ Clear History"): - st.session_state.query_history = [] - st.rerun() - - if submit and query: - is_valid, error_message = QueryManager.validate_query(query) - if not is_valid: - st.error(error_message) - return - - if not st.session_state['rate_limiter'].can_make_request(): - wait_time = st.session_state['rate_limiter'].time_until_next_request() - st.warning(f"Please wait {wait_time:.1f} seconds before making another query.") - return - - try: - with st.spinner("Processing query..."): - relevant_file, answer = process_query_with_progress(query, rag_system) - st.success(f"📄 Selected document: {relevant_file}") - st.markdown("### Answer 💡") - st.markdown(answer) - except Exception as e: - logging.error(f"Query processing error: {str(e)}") - st.error(f"Error processing query: {str(e)}") - - show_query_history() - - -if __name__ == "__main__": - try: - main() - except Exception as e: - logging.error(f"Application error: {str(e)}") - st.error(f"An unexpected error occurred: {str(e)}") \ No newline at end of file diff --git a/rag_engine/config.json b/rag_engine/config.json deleted file mode 100644 index 28a9e3d5..00000000 --- a/rag_engine/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "file_selection_model": "qwen2.5-coder:3b", - "query_processing_model": "qwen2.5-coder:7b", - "rate_limit": 5 -} \ No newline at end of file diff --git a/rag_engine/ovrawmjson/KBI.json b/rag_engine/ovrawmjson/KBI.json deleted file mode 100644 index 00f38be4..00000000 --- a/rag_engine/ovrawmjson/KBI.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "files": [ - { - "name": "t_anno_trans.json", - "introduction": "This script demonstrates how to transfer cell type annotations between two modalities (RNA and ATAC) using a weighted KNN classifier. It loads preprocessed RNA and ATAC data, combines them, performs dimensionality reduction, visualizes the alignment, trains a KNN classifier on the annotated RNA data, transfers labels to the ATAC data, and visualizes the transferred labels and their uncertainty. Finally, it merges the data again and visualizes the combined data with transferred annotations." - }, - { - "name": "t_cluster.json", - "introduction": "This script demonstrates various clustering methods available in omicverse, including Leiden, Louvain, Gaussian Mixture Model (GMM), and Latent Dirichlet Allocation (LDA). It uses the dentategyrus dataset from scvelo and performs preprocessing steps such as normalization, scaling, and PCA. The script then applies each clustering method and visualizes the results using UMAP embeddings. It also includes LDA refinement using a random forest classifier and cNMF analysis for clustering and visualization, along with ARI calculation for evaluating clustering performance." - }, - { - "name": "t_bulk_combat.json", - "introduction": "This script demonstrates batch effect correction on bulk RNA-seq data using Combat. It loads three datasets, combines them, applies Combat, and saves the corrected data. It then visualizes the batch effect before and after correction using boxplots and PCA/UMAP." - }, - { - "name": "t_aucell.json", - "introduction": "This script demonstrates the usage of AUCell for pathway enrichment analysis in omicverse. It loads single cell data, performs necessary preprocessing, prepares pathway database, and then perform AUCell enrichment analysis on one geneset, more than one genesets and test genesets. Also, including differential gene expression analysis, gene expression visualization and pathway enrichment analysis." - }, - { - "name": "t_cellanno.json", - "introduction": "This script demonstrates cell annotation using pySCSA in omicverse. It preprocesses single-cell RNA-seq data from 10x Genomics, performs clustering and dimensionality reduction, and then annotates cells using two different databases ('cellmarker' and 'panglaodb'). The script visualizes the annotation results, calculates cell type proportions, visualizes embeddings with cell type annotations, computes and visualizes the ratio of observed to expected cell numbers (Ro/e), and identifies marker genes." - }, - { - "name": "t_cellfate_genesets.json", - "introduction": "This notebook demonstrates the use of CellFateGenie with gene sets for cell fate analysis. It loads spatial transcriptomics data, prepares gene sets, calculates pathway enrichment scores, initializes and trains a CellFateGenie model, performs adaptive threshold regression, plots filtering results, fits the model, visualizes color fitting, filters genes using Kendall's tau, calculates gene trends, plots gene trends and heatmap, and generates a gene set word cloud." - }, - { - "name": "t_cellfate_gene.json", - "introduction": "This script performs cell fate analysis using CellFateGenie at single-gene resolution. It starts by loading and preprocessing single-cell data, followed by dimensionality reduction and visualization. SEACells is employed to identify metacells. Then, pseudotime is computed using pyVIA, and CellFateGenie is applied to analyze gene trends, filter genes based on significance, visualize gene expression dynamics, and identify border and kernel genes associated with specific cell types." - }, - { - "name": "t_cellphonedb.json", - "introduction": "This script demonstrates the use of CellPhoneDB for cell-cell interaction analysis within the Omicverse framework. It includes data loading, preprocessing, cell-cell interaction inference, network visualization (heatmap, chord diagram, network graph), subnetwork analysis, identification of significant interactions, and downstream pathway enrichment analysis." - }, - { - "name": "t_cluster_space.json", - "introduction": "This script demonstrates several spatial clustering methods, including GraphST, STAGATE, BINARY, and CAST, using 10x Visium spatial transcriptomics data. It preprocesses the data, calculates spatially variable genes, applies each clustering method, performs cluster refinement using mclust, visualizes spatial distribution of clusters, and evaluates the performance using the Adjusted Rand Index (ARI)." - }, - { - "name": "t_cnmf.json", - "introduction": "This script demonstrates the usage of cNMF (consensus Non-negative Matrix Factorization) for identifying gene expression programs in single-cell data. It loads data, preprocesses it, performs cNMF, visualizes results (including K selection, consensus matrix, usage matrix), refines clusters with a Random Forest Classifier, and identifies marker genes." - }, - { - "name": "t_cytotrace.json", - "introduction": "This script uses the omicverse library to analyze single-cell RNA sequencing data. It loads a dataset, preprocesses it, and then applies CytoTRACE2 to predict cellular potency scores. The script visualizes the results on a UMAP embedding, coloring cells by cluster, CytoTRACE2 score, potency, and relative order." - }, - { - "name": "t_deg.json", - "introduction": "This script demonstrates differential gene expression analysis using omicverse. It loads count data, maps gene IDs, performs DEG analysis with t-test, filters genes, visualizes results with volcano and box plots, performs pathway enrichment analysis, and visualizes enrichment results with multi-geneset plots." - }, - { - "name": "t_deseq2.json", - "introduction": "This script demonstrates differential expression analysis using DESeq2 within the omicverse framework. It covers data loading, preprocessing, DEG analysis, filtering, visualization (volcano plot, boxplot), and pathway enrichment analysis." - }, - { - "name": "t_mapping.json", - "introduction": "This script demonstrates cell type mapping from single-cell RNA-seq data to spatial transcriptomics data using Tangram. It loads and preprocesses both datasets, trains the Tangram model, maps cell types to spatial locations, and visualizes the results." - }, - { - "name": "t_metacells.json", - "introduction": "This script demonstrates the use of MetaCell (SEACells) for identifying metacells from single-cell RNA-seq data using the omicverse package. It covers data loading, preprocessing, model training, cell type purity and benchmark evaluation, metacell prediction, visualization (UMAP plots with cell type labels and S_score), highly variable gene identification for the metacells, and visualization of metacell clusters on the UMAP embedding." - }, - { - "name": "t_mofa_glue.json", - "introduction": "This script demonstrates integration of single-cell RNA and ATAC data using MOFA (Multi-Omics Factor Analysis) within Omicverse. It includes pairing cells between RNA and ATAC using GLUE correlation analysis, constructing a MuData object, and selecting a sub group for further analysis. Also includes running MOFA, visualizing results (variance explained, factor correlation, feature weights, UMAP embedding), and analyzing gene weights." - }, - { - "name": "t_metatime.json", - "introduction": "This script demonstrates the usage of MetaTiME for inferring cell types in the tumor microenvironment (TME) from single-cell RNA-seq data. It loads the data, performs dimensionality reduction, initializes and trains the MetaTiME model, and visualizes the predicted cell types." - }, - { - "name": "t_mofa.json", - "introduction": "This script demonstrates Multi-Omics Factor Analysis (MOFA) using the omicverse library. It performs MOFA on scRNA-seq and scATAC-seq data, visualizes variance explained by factors, calculates factor correlation with cell types, retrieves gene weights, and performs visualization such as scatter plots of factors, UMAP embeddings colored by factors, and heatmaps of top features." - }, - { - "name": "t_network.json", - "introduction": "This script demonstrates STRING interaction analysis using omicverse for a set of genes in *Saccharomyces cerevisiae*. It retrieves interaction data, creates a pyPPI object, performs interaction analysis, and plots the interaction network." - }, - { - "name": "t_nocd.json", - "introduction": "This script demonstrates the use of scNOCD for non-overlapping community detection in single-cell RNA-seq data using the omicverse package. It reads and preprocesses scRNA-seq data, applies the scNOCD model, and then visualizes the results on a UMAP embedding colored by leiden clustering and nocd groups." - }, - { - "name": "t_preprocess_cpu.json", - "introduction": "This script demonstrates preprocessing of single-cell RNA-seq data using omicverse on a CPU. It includes data loading, quality control, normalization, highly variable gene selection, scaling, PCA, neighborhood graph computation, UMAP and MDE embedding, cell cycle scoring, clustering, visualization, and differential expression analysis." - }, - { - "name": "t_preprocess_gpu.json", - "introduction": "This script demonstrates how to preprocess single-cell RNA-seq data using Omicverse with GPU acceleration. It covers data loading, quality control, normalization, HVG selection, scaling, PCA, neighborhood graph construction, UMAP/MDE embedding, Leiden clustering, and visualizations." - }, - { - "name": "t_preprocess.json", - "introduction": "This script demonstrates the standard preprocessing workflow for single-cell RNA-seq data using Omicverse. It includes data loading, quality control, normalization, highly variable gene selection, scaling, PCA, UMAP/MDE embedding, clustering, and visualizations." - }, - { - "name": "t_scdeg.json", - "introduction": "This script demonstrates differential gene expression analysis for single-cell RNA-seq data using omicverse. It loads data, preprocesses, performs DEG analysis between 'Alpha' and 'Beta' cells using t-test, visualizes results (volcano plot, boxplot, UMAP), performs metacell analysis with SEACells, repeats the DEG analysis on metacells, and visualizes metacell DEG results." - }, - { - "name": "t_scdrug.json", - "introduction": "This script demonstrates drug response prediction using CaDRReS-Sc within Omicverse. It loads data, infers CNVs to identify tumor cells, preprocesses tumor cell data, performs clustering and then uses a downloaded CaDRReS model for drug response prediction." - }, - { - "name": "t_scmulan.json", - "introduction": "This script demonstrates cell type annotation using scMulan within Omicverse. It loads data, performs gene symbol transformation, normalizes, predicts cell types using a pretrained scMulan model, visualizes the predictions on a 2D embedding, applies smoothing to refine predictions, and provides functions for visualizing selected cell types." - }, - { - "name": "t_simba.json", - "introduction": "This script demonstrates single-cell integration using SIMBA within the omicverse framework. It includes data loading, preprocessing, graph generation and training for SIMBA model, batch correction, and visualization (MDE/UMAP) of the corrected data." - }, - { - "name": "t_single_batch.json", - "introduction": "This script demonstrates batch correction methods for single-cell RNA-seq data using omicverse. It loads multiple datasets, performs QC and preprocessing, applies batch correction using Harmony, Combat, Scanorama, scVI, MIRA (with LDA topic modeling), and benchmarks the methods using scib metrics." - }, - { - "name": "t_slat.json", - "introduction": "This script demonstrates the use of scSLAT for spatially resolved lineage tracing analysis using omicverse. It loads two spatial transcriptomics datasets, calculates spatial networks, runs SLAT to learn cell state embeddings, performs spatial matching between the two datasets, visualizes matching results (3D model, quality index, Sankey diagram), identifies matching cells based on cell type, performs trajectory analysis on a selected cell lineage, and analyzes differentially expressed genes between stages." - }, - { - "name": "t_spaceflow.json", - "introduction": "This script demonstrates the use of SpaceFlow for spatial transcriptomics analysis in omicverse. It loads spatial data, preprocesses it, trains the SpaceFlow model, calculates a pseudo-spatial map (pSM), visualizes the pSM, clusters cells using GMM, and compares predicted clusters with a ground truth." - }, - { - "name": "t_stagate.json", - "introduction": "This script demonstrates spatial transcriptomics analysis using STAGATE within Omicverse. It includes data loading, preprocessing, GraphST training (optional), STAGATE model training, STAGATE prediction, clustering, visualization of spatial clusters and gene expression, pseudo-spatial map calculation, and clustering performance evaluation using ARI." - }, - { - "name": "t_staligner.json", - "introduction": "This script demonstrates spatial transcriptomics alignment using STAligner within Omicverse. It loads multiple spatial datasets, preprocesses them, constructs spatial networks, concatenates datasets, trains the STAligner model, retrieves aligned embeddings, performs clustering and UMAP embedding on the aligned data, and visualizes spatial clustering results." - }, - { - "name": "t_starfysh.json", - "introduction": "This script demonstrates spatial transcriptomics cell type deconvolution using Starfysh. It loads spatial data and signature gene sets, preprocesses data and image, visualizes raw data, identifies anchor spots, performs Archetypal Analysis, refines anchor spots, trains the Starfysh model, and visualizes cell type proportions, gene expression, and other inferred features." - }, - { - "name": "t_stt.json", - "introduction": "This notebook demonstrates the use of Spatially resolved Transcript Time (STT) to infer cell lineages from spatial transcriptomic data. It covers data loading, preprocessing, model initialization, stage estimation, model training, visualizations of spatial and cluster patterns, pathway enrichment analysis and visualization, streamline visualization, Sankey diagram generation, and identification of genes with high multistability." - }, - { - "name": "t_tcga.json", - "introduction": "This script demonstrates how to use the pyTCGA class in omicverse for analyzing TCGA (The Cancer Genome Atlas) data. It initializes a pyTCGA object, imports raw count, FPKM, and TPM matrices, performs gene ID conversion, initializes patient metadata, imports survival data, and performs survival analysis for single genes (e.g., 'MYC') and for all genes, saving the updated data." - }, - { - "name": "t_tosica.json", - "introduction": "This script demonstrates the use of TOSICA for cell type prediction using the omicverse package. It loads reference and query datasets, preprocesses the data, trains a TOSICA model on the reference data, predicts cell types in the query data, performs dimensionality reduction and visualization on the query data with predicted labels, and analyzes differentially expressed pathways between predicted cell types." - }, - { - "name": "t_traj.json", - "introduction": "This script demonstrates trajectory inference using various methods available in Omicverse. It loads single-cell RNA-seq data, preprocesses it, performs trajectory inference using Diffusion Map, Slingshot, and Palantir, and visualizes pseudotime, PAGA graphs, and gene expression trends along trajectories." - }, - { - "name": "t_via_velo.json", - "introduction": "This notebook showcases trajectory inference using VIA with velocity information in omicverse. It loads data, performs preprocessing and velocity calculation, runs VIA, and visualizes the trajectory using various plots including pie chart graph, trajectory GAMs, stream plot, and lineage probabilities." - }, - { - "name": "t_via.json", - "introduction": "This script demonstrates trajectory inference using VIA (Visualization of RNA velocity in single cells) within Omicverse. It loads single-cell RNA-seq data, performs PCA, runs VIA, extracts pseudotime, and visualizes results including cluster graphs, trajectory GAMs, stream plots, lineage probabilities, and gene trends." - }, - { - "name": "t_visualize_bulk.json", - "introduction": "This script demonstrates visualization techniques for bulk RNA-seq data analysis using omicverse. It includes creating Venn diagrams, volcano plots for visualizing differentially expressed genes, and box plots with p-value annotations." - }, - { - "name": "t_visualize_colorsystem.json", - "introduction": "This script demonstrates the use of Omicverse's color system, particularly the Forbidden City color palette. It visualizes the color palette, retrieves specific colors by name or index, and uses these colors to customize plots, including UMAP embeddings, segmented colormaps, and color gradients." - }, - { - "name": "t_visualize_single.json", - "introduction": "This script demonstrates various visualization techniques for single-cell RNA-seq data analysis using Omicverse. It includes embedding plots, cell proportion histograms, stacked area graphs, convex hulls, contour plots, density plots, AUCell visualization, violin plots, bar-dot plots, box plots with statistical tests, complex heatmaps, and marker gene heatmaps." - }, - { - "name": "t_wgcna.json", - "introduction": "This script demonstrates Weighted Gene Co-expression Network Analysis (WGCNA) using the Omicverse library with bulk RNA-seq data. It covers data loading, preprocessing, network construction (including soft-thresholding power calculation, adjacency matrix, and TOM similarity matrix), module detection, visualization of the TOM matrix and sub-networks, module-trait relationship analysis, and identification of hub genes." - } - ] -} \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_anno_trans.json b/rag_engine/ovrawmjson/t_anno_trans.json deleted file mode 100644 index e17ae75e..00000000 --- a/rag_engine/ovrawmjson/t_anno_trans.json +++ /dev/null @@ -1,58 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, matplotlib, and scanpy. Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nimport matplotlib.pyplot as plt\nimport scanpy as sc\nov.ov_plot_set()" - }, - { - "action": "Load RNA and ATAC data from preprocessed files. These files are assumed to be outputs from GLUE cross-modal integration and contain the `obsm['X_glue']` layer. The RNA data is already annotated.", - "code": "rna=sc.read(\"data/analysis_lymph/rna-emb.h5ad\")\natac=sc.read(\"data/analysis_lymph/atac-emb.h5ad\")" - }, - { - "action": "Combine RNA and ATAC data into a single AnnData object for visualization.", - "code": "import scanpy as sc\ncombined=sc.concat([rna,atac],merge='same')\ncombined" - }, - { - "action": "Perform Minimum Distortion Embedding (MDE) on the combined data using the `X_glue` layer.", - "code": "combined.obsm['X_mde']=ov.utils.mde(combined.obsm['X_glue'])" - }, - { - "action": "Visualize the combined data using MDE, coloring by the 'domain' (RNA or ATAC) to check alignment.", - "code": "ov.utils.embedding(combined,\n basis='X_mde',\n color='domain',\n title='Layers',\n show=False,\n palette=ov.utils.red_color,\n frameon='small'\n )" - }, - { - "action": "Visualize the RNA data using MDE, coloring by the 'major_celltype' to show existing annotations.", - "code": "ov.utils.embedding(rna,\n basis='X_mde',\n color='major_celltype',\n title='Cell type',\n show=False,\n #palette=ov.utils.red_color,\n frameon='small'\n )" - }, - { - "action": "Train a weighted K-nearest neighbors (KNN) classifier using the `X_glue` features from the annotated RNA data.", - "code": "knn_transformer=ov.utils.weighted_knn_trainer(\n train_adata=rna,\n train_adata_emb='X_glue',\n n_neighbors=15,\n)" - }, - { - "action": "Transfer cell type labels from RNA to ATAC data using the trained KNN classifier. Calculate uncertainty for each prediction.", - "code": "labels,uncert=ov.utils.weighted_knn_transfer(\n query_adata=atac,\n query_adata_emb='X_glue',\n label_keys='major_celltype',\n knn_model=knn_transformer,\n ref_adata_obs=rna.obs,\n)" - }, - { - "action": "Assign the transferred cell type labels and uncertainty scores to the ATAC data.", - "code": "atac.obs[\"transf_celltype\"]=labels.loc[atac.obs.index,\"major_celltype\"]\natac.obs[\"transf_celltype_unc\"]=uncert.loc[atac.obs.index,\"major_celltype\"]" - }, - { - "action": "Copy the transferred cell type labels to the 'major_celltype' column in the ATAC data.", - "code": "atac.obs[\"major_celltype\"]=atac.obs[\"transf_celltype\"].copy()" - }, - { - "action": "Visualize the ATAC data using UMAP, coloring by the transferred cell type labels and their uncertainty.", - "code": "ov.utils.embedding(atac,\n basis='X_umap',\n color=['transf_celltype_unc','transf_celltype'],\n #title='Cell type Un',\n show=False,\n palette=ov.palette()[11:],\n frameon='small'\n )" - }, - { - "action": "Merge the RNA and ATAC data again after transferring annotations.", - "code": "import scanpy as sc\ncombined1=sc.concat([rna,atac],merge='same')\ncombined1" - }, - { - "action": "Perform MDE on the merged data after annotation transfer.", - "code": "combined1.obsm['X_mde']=ov.utils.mde(combined1.obsm['X_glue'])" - }, - { - "action": "Visualize the merged data using MDE, coloring by 'domain' and 'major_celltype' to assess the consistency of cell type annotations across modalities.", - "code": "ov.utils.embedding(combined1,\n basis='X_mde',\n color=['domain','major_celltype'],\n title=['Layers','Cell type'],\n show=False,\n palette=ov.palette()[11:],\n frameon='small'\n )" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_aucell.json b/rag_engine/ovrawmjson/t_aucell.json deleted file mode 100644 index 12aff785..00000000 --- a/rag_engine/ovrawmjson/t_aucell.json +++ /dev/null @@ -1,70 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.utils.ov_plot_set()`. Download pathway database and gene ID annotation pair using `ov.utils.download_pathway_database()` and `ov.utils.download_geneid_annotation_pair()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\n\nov.utils.ov_plot_set()\n\nov.utils.download_pathway_database()\nov.utils.download_geneid_annotation_pair()" - }, - { - "action": "Load the pancreas dataset using `scv.datasets.pancreas()`. Print the AnnData object to inspect its contents.", - "code": "adata = scv.datasets.pancreas()\nadata" - }, - { - "action": "Check the maximum value in the `adata.X` matrix.", - "code": "adata.X.max()" - }, - { - "action": "Normalize the data to a total count of 1e4 per cell and log-transform it.", - "code": "sc.pp.normalize_total(adata, target_sum=1e4)\nsc.pp.log1p(adata)" - }, - { - "action": "Check the maximum value in the `adata.X` matrix after normalization and log-transformation.", - "code": "adata.X.max()" - }, - { - "action": "Prepare the gene set dictionary from the GO Biological Process 2021 file for the Mouse organism using `ov.utils.geneset_prepare()`.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2021.txt',organism='Mouse')" - }, - { - "action": "Assess the enrichment of a single gene set ('response to vitamin (GO:0033273)') using AUCell and visualize it on a UMAP embedding.", - "code": "##Assest one geneset\ngeneset_name='response to vitamin (GO:0033273)'\nov.single.geneset_aucell(adata,\n geneset_name=geneset_name,\n geneset=pathway_dict[geneset_name])\nsc.pl.embedding(adata,\n basis='umap',\n color=[\"{}_aucell\".format(geneset_name)])" - }, - { - "action": "Assess the enrichment of multiple gene sets ('response to vitamin (GO:0033273)' and 'response to vitamin D (GO:0033280)') using AUCell and visualize them on a UMAP embedding.", - "code": "##Assest more than one geneset\ngeneset_names=['response to vitamin (GO:0033273)','response to vitamin D (GO:0033280)']\nov.single.pathway_aucell(adata,\n pathway_names=geneset_names,\n pathways_dict=pathway_dict)\nsc.pl.embedding(adata,\n basis='umap',\n color=[i+'_aucell' for i in geneset_names])" - }, - { - "action": "Assess the enrichment of a custom gene set ('Sox') using AUCell and visualize it on a UMAP embedding.", - "code": "##Assest test geneset\nov.single.geneset_aucell(adata,\n geneset_name='Sox',\n geneset=['Sox17', 'Sox4', 'Sox7', 'Sox18', 'Sox5'])\nsc.pl.embedding(adata,\n basis='umap',\n color=[\"Sox_aucell\"])" - }, - { - "action": "Calculate AUCell enrichment scores for all pathways in the `pathway_dict` using multiple workers. Then, transfer metadata from the original `adata` object to the new `adata_aucs` object.", - "code": "##Assest all pathways\nadata_aucs=ov.single.pathway_aucell_enrichment(adata,\n pathways_dict=pathway_dict,\n num_workers=8)\n\nadata_aucs.obs=adata[adata_aucs.obs.index].obs\nadata_aucs.obsm=adata[adata_aucs.obs.index].obsm\nadata_aucs.obsp=adata[adata_aucs.obs.index].obsp\nadata_aucs" - }, - { - "action": "Save the `adata_aucs` object to an H5AD file and then read it back.", - "code": "adata_aucs.write_h5ad('data/pancreas_auce.h5ad',compression='gzip')\n\nadata_aucs=sc.read('data/pancreas_auce.h5ad')" - }, - { - "action": "Visualize the AUCell enrichment scores for the previously selected gene sets on a UMAP embedding of the `adata_aucs` object.", - "code": "sc.pl.embedding(adata_aucs,\n basis='umap',\n color=geneset_names)" - }, - { - "action": "Perform differential gene expression analysis on the `adata_aucs` object using the t-test method and visualize the top 3 differentially expressed genes per cluster using a dot plot.", - "code": "#adata_aucs.uns['log1p']['base']=None\nsc.tl.rank_genes_groups(adata_aucs, 'clusters', method='t-test',n_genes=100)\nsc.pl.rank_genes_groups_dotplot(adata_aucs,groupby='clusters',\n cmap='Spectral_r',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Extract the names of differentially expressed genes in the 'Beta' cluster based on log2 fold change and p-value thresholds.", - "code": "degs = sc.get.rank_genes_groups_df(adata_aucs, group='Beta', key='rank_genes_groups', log2fc_min=2, \n pval_cutoff=0.05)['names'].squeeze()\ndegs" - }, - { - "action": "Visualize the expression of the differentially expressed genes and the cluster assignments on a UMAP embedding using `sc.pl.embedding`.", - "code": "import matplotlib.pyplot as plt\n#fig, axes = plt.subplots(4,3,figsize=(12,9))\naxes=sc.pl.embedding(adata_aucs,ncols=3,\n basis='umap',show=False,return_fig=True,wspace=0.55,hspace=0.65,\n color=['clusters']+degs.values.tolist(),\n title=[ov.utils.plot_text_set(i,3,20)for i in ['clusters']+degs.values.tolist()])\n\naxes.tight_layout()" - }, - { - "action": "Perform differential gene expression analysis on the original `adata` object using the t-test method.", - "code": "adata.uns['log1p']['base']=None\nsc.tl.rank_genes_groups(adata, 'clusters', method='t-test',n_genes=100)" - }, - { - "action": "Perform pathway enrichment analysis using `ov.single.pathway_enrichment` and visualize the results using `ov.single.pathway_enrichment_plot`.", - "code": "res=ov.single.pathway_enrichment(adata,pathways_dict=pathway_dict,organism='Mouse',\n group_by='clusters',plot=True)\n\nax=ov.single.pathway_enrichment_plot(res,plot_title='Enrichment',cmap='Reds',\n xticklabels=True,cbar=False,square=True,vmax=10,\n yticklabels=True,cbar_kws={'label': '-log10(qvalue)','shrink': 0.5,})" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_bulk_combat.json b/rag_engine/ovrawmjson/t_bulk_combat.json deleted file mode 100644 index dc6c061a..00000000 --- a/rag_engine/ovrawmjson/t_bulk_combat.json +++ /dev/null @@ -1,74 +0,0 @@ -[ - { - "action": "Import necessary libraries: anndata, pandas, and omicverse. Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import anndata\nimport pandas as pd\nimport omicverse as ov\nov.ov_plot_set()" - }, - { - "action": "Load the first dataset (GSE18520) from a pickle file, create an AnnData object, transpose it, and assign batch label '1'.", - "code": "dataset_1 = pd.read_pickle(\"data/combat/GSE18520.pickle\")\nadata1=anndata.AnnData(dataset_1.T)\nadata1.obs['batch']='1'\nadata1" - }, - { - "action": "Load the second dataset (GSE66957) from a pickle file, create an AnnData object, transpose it, and assign batch label '2'.", - "code": "dataset_2 = pd.read_pickle(\"data/combat/GSE66957.pickle\")\nadata2=anndata.AnnData(dataset_2.T)\nadata2.obs['batch']='2'\nadata2" - }, - { - "action": "Load the third dataset (GSE69428) from a pickle file, create an AnnData object, transpose it, and assign batch label '3'.", - "code": "dataset_3 = pd.read_pickle(\"data/combat/GSE69428.pickle\")\nadata3=anndata.AnnData(dataset_3.T)\nadata3.obs['batch']='3'\nadata3" - }, - { - "action": "Concatenate the three AnnData objects into a single AnnData object, keeping only the common genes.", - "code": "adata=anndata.concat([adata1,adata2,adata3],merge='same')\nadata" - }, - { - "action": "Perform batch effect correction on the combined AnnData object using the `ov.bulk.batch_correction` function, specifying 'batch' as the batch key.", - "code": "ov.bulk.batch_correction(adata,batch_key='batch')" - }, - { - "action": "Convert the raw data to a pandas DataFrame and transpose it.", - "code": "raw_data=adata.to_df().T\nraw_data.head()" - }, - { - "action": "Convert the batch-corrected data to a pandas DataFrame and transpose it.", - "code": "removing_data=adata.to_df(layer='batch_correction').T\nremoving_data.head()" - }, - { - "action": "Save the raw data and batch-corrected data to CSV files.", - "code": "raw_data.to_csv('raw_data.csv')\nremoving_data.to_csv('removing_data.csv')" - }, - { - "action": "Save the AnnData object to an H5AD file with gzip compression.", - "code": "adata.write_h5ad('adata_batch.h5ad',compression='gzip')\n#adata=ov.read('adata_batch.h5ad')" - }, - { - "action": "Define a dictionary to map batch labels to colors for visualization.", - "code": "color_dict={\n '1':ov.utils.red_color[1],\n '2':ov.utils.blue_color[1],\n '3':ov.utils.green_color[1],\n}" - }, - { - "action": "Create a boxplot of the raw data, coloring each box by its corresponding batch.", - "code": "fig,ax=plt.subplots( figsize = (20,4))\nbp=plt.boxplot(adata.to_df().T,patch_artist=True)\nfor i,batch in zip(range(adata.shape[0]),adata.obs['batch']):\n bp['boxes'][i].set_facecolor(color_dict[batch])\nax.axis(False)\nplt.show()" - }, - { - "action": "Create a boxplot of the batch-corrected data, coloring each box by its corresponding batch.", - "code": "fig,ax=plt.subplots( figsize = (20,4))\nbp=plt.boxplot(adata.to_df(layer='batch_correction').T,patch_artist=True)\nfor i,batch in zip(range(adata.shape[0]),adata.obs['batch']):\n bp['boxes'][i].set_facecolor(color_dict[batch])\nax.axis(False)\nplt.show()" - }, - { - "action": "Store a copy of the raw data in the 'raw' layer of the AnnData object.", - "code": "adata.layers['raw']=adata.X.copy()" - }, - { - "action": "Calculate principal components (PCs) for the raw data using `ov.pp.pca`.", - "code": "ov.pp.pca(adata,layer='raw',n_pcs=50)\nadata" - }, - { - "action": "Calculate principal components (PCs) for the batch-corrected data using `ov.pp.pca`.", - "code": "ov.pp.pca(adata,layer='batch_correction',n_pcs=50)\nadata" - }, - { - "action": "Create a UMAP embedding of the raw data, colored by batch.", - "code": "ov.utils.embedding(adata,\n basis='raw|original|X_pca',\n color='batch',\n frameon='small')" - }, - { - "action": "Create a UMAP embedding of the batch-corrected data, colored by batch.", - "code": "ov.utils.embedding(adata,\n basis='batch_correction|original|X_pca',\n color='batch',\n frameon='small')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cellanno.json b/rag_engine/ovrawmjson/t_cellanno.json deleted file mode 100644 index f7df50f1..00000000 --- a/rag_engine/ovrawmjson/t_cellanno.json +++ /dev/null @@ -1,126 +0,0 @@ -[ - { - "action": "Import the omicverse library and print its version. Import the scanpy library and print its version. Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nprint(f'omicverse version:{ov.__version__}')\nimport scanpy as sc\nprint(f'scanpy version:{sc.__version__}')\nov.ov_plot_set()" - }, - { - "action": "Create a directory named 'data'. Download the PBMC3K filtered gene-barcode matrices from 10x Genomics and save them in the 'data' directory. Extract the downloaded tar.gz file in the 'data' directory. Create a directory named 'write' for writing processed data.", - "code": "# !mkdir data\n# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data in Matrix Market format into an AnnData object named `adata`. Use gene symbols for variable names and cache the data for faster subsequent reading.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading" - }, - { - "action": "Perform quality control on the AnnData object `adata` using the `ov.pp.qc` function. Filter cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes.", - "code": "#adata=ov.single.scanpy_lazy(adata)\n\n#quantity control\nadata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})" - }, - { - "action": "Preprocess the AnnData object `adata` using the `ov.pp.preprocess` function. Normalize the data using the 'shiftlog|pearson' mode and calculate 2000 highly variable genes (HVGs).", - "code": "#normalize and high variable genes (HVGs) calculated\nadata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)" - }, - { - "action": "Save the whole genes in `adata.raw` and filter out non-highly variable genes from `adata`.", - "code": "#save the whole genes and filter the non-HVGs\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]" - }, - { - "action": "Scale the data in `adata.X` using `ov.pp.scale`.", - "code": "#scale the adata.X\nov.pp.scale(adata)" - }, - { - "action": "Perform Principal Component Analysis (PCA) on the scaled data in `adata` using `ov.pp.pca`. Use the 'scaled' layer and calculate 50 principal components.", - "code": "#Dimensionality Reduction\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Construct a neighborhood graph using `sc.pp.neighbors`. Use 15 neighbors, 50 principal components, and the 'scaled|original|X_pca' representation.", - "code": "#Neighbourhood graph construction\nsc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph using `sc.tl.leiden`.", - "code": "#clusters\nsc.tl.leiden(adata)" - }, - { - "action": "Calculate Minimum Distortion Embedding (MDE) for visualization using `ov.utils.mde` and store the result in `adata.obsm[\"X_mde\"]`. Use the 'scaled|original|X_pca' representation as input.", - "code": "#Dimensionality Reduction for visualization(X_mde=X_umap+GPU)\nadata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])\nadata" - }, - { - "action": "Create a pySCSA object named `scsa` from the AnnData object `adata`. Set parameters for fold change, p-value, cell type, target, tissue, and model path.", - "code": "scsa=ov.single.pySCSA(adata=adata,\n foldchange=1.5,\n pvalue=0.01,\n celltype='normal',\n target='cellmarker',\n tissue='All',\n model_path='temp/pySCSA_2023_v2_plus.db' \n)" - }, - { - "action": "Perform cell annotation using the `scsa.cell_anno` method. Specify the clustering type as 'leiden', annotate all clusters, and calculate rank genes groups.", - "code": "anno=scsa.cell_anno(clustertype='leiden',\n cluster='all',rank_rep=True)" - }, - { - "action": "Query and display only the better-annotated results using `scsa.cell_auto_anno` and store the results in adata with the key 'scsa_celltype_cellmarker'.", - "code": "scsa.cell_auto_anno(adata,key='scsa_celltype_cellmarker')" - }, - { - "action": "Create a new pySCSA object named `scsa` with the same parameters as before, but change the target to 'panglaodb'.", - "code": "scsa=ov.single.pySCSA(adata=adata,\n foldchange=1.5,\n pvalue=0.01,\n celltype='normal',\n target='panglaodb',\n tissue='All',\n model_path='temp/pySCSA_2023_v2_plus.db'\n \n)" - }, - { - "action": "Perform cell annotation using the new `scsa` object with 'panglaodb' as the target.", - "code": "res=scsa.cell_anno(clustertype='leiden',\n cluster='all',rank_rep=True)" - }, - { - "action": "Print the cell annotation results using `scsa.cell_anno_print()`.", - "code": "scsa.cell_anno_print()" - }, - { - "action": "Query and display only the better-annotated results using `scsa.cell_auto_anno` and store the results in adata with the key 'scsa_celltype_panglaodb'.", - "code": "scsa.cell_auto_anno(adata,key='scsa_celltype_panglaodb')" - }, - { - "action": "Visualize the embeddings using `ov.utils.embedding`. Display the 'leiden' clusters, 'scsa_celltype_cellmarker' annotations, and 'scsa_celltype_panglaodb' annotations on the 'X_mde' embedding. Customize the legend, frame, and color palette.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden','scsa_celltype_cellmarker','scsa_celltype_panglaodb'], \n legend_loc='on data', \n frameon='small',\n legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Assign the first 1000 cells to group 'B' and the rest to group 'A' in a new column named 'group' in `adata.obs`. Visualize the 'group' on the 'X_mde' embedding using `ov.utils.embedding`.", - "code": "#Randomly designate the first 1000 cells as group B and the rest as group A\nadata.obs['group']='A'\nadata.obs.loc[adata.obs.index[:1000],'group']='B'\n#Colored\nov.utils.embedding(adata,\n basis='X_mde',\n color=['group'], \n frameon='small',legend_fontoutline=2,\n palette=ov.utils.red_color,\n )" - }, - { - "action": "Plot the cell type proportions using `ov.utils.plot_cellproportion`. Specify 'scsa_celltype_cellmarker' as the cell type clusters, 'group' as the visual clusters, and set the figure size.", - "code": "ov.utils.plot_cellproportion(adata=adata,celltype_clusters='scsa_celltype_cellmarker',\n visual_clusters='group',\n visual_name='group',figsize=(2,4))" - }, - { - "action": "Visualize the embeddings with cell type annotations using `ov.utils.plot_embedding_celltype`. Specify the 'X_mde' embedding, 'scsa_celltype_cellmarker' as the cell type key, and customize the title and ranges.", - "code": "ov.utils.plot_embedding_celltype(adata,figsize=None,basis='X_mde',\n celltype_key='scsa_celltype_cellmarker',\n title=' Cell type',\n celltype_range=(2,6),\n embedding_range=(4,10),)" - }, - { - "action": "Calculate the ratio of observed to expected cell numbers (Ro/e) for each cluster in different groups using `ov.utils.roe`. Specify 'group' as the sample key and 'scsa_celltype_cellmarker' as the cell type key.", - "code": "roe=ov.utils.roe(adata,sample_key='group',cell_type_key='scsa_celltype_cellmarker')" - }, - { - "action": "Create a heatmap to visualize the Ro/e values using `seaborn.heatmap`. Transform the Ro/e values into categorical labels ('+++', '++', '+', '+/-') for annotation. Customize the colormap, axis labels, and title.", - "code": "import seaborn as sns\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(2,4))\n\ntransformed_roe = roe.copy()\ntransformed_roe = transformed_roe.applymap(\n lambda x: '+++' if x >= 2 else ('++' if x >= 1.5 else ('+' if x >= 1 else '+/-')))\n\nsns.heatmap(roe, annot=transformed_roe, cmap='RdBu_r', fmt='', \n cbar=True, ax=ax,vmin=0.5,vmax=1.5,cbar_kws={'shrink':0.5})\nplt.xticks(fontsize=12)\nplt.yticks(fontsize=12)\n\nplt.xlabel('Group',fontsize=13)\nplt.ylabel('Cell type',fontsize=13)\nplt.title('Ro/e',fontsize=13)" - }, - { - "action": "Prepare a dictionary `res_marker_dict` containing marker genes for different cell types.", - "code": "res_marker_dict={\n 'Megakaryocyte':['ITGA2B','ITGB3'],\n 'Dendritic cell':['CLEC10A','IDO1'],\n 'Monocyte' :['S100A8','S100A9','LST1',],\n 'Macrophage':['CSF1R','CD68'],\n 'B cell':['MS4A1','CD79A','MZB1',],\n 'NK/NKT cell':['GNLY','KLRD1'],\n 'CD8+T cell':['CD8A','CD8B'],\n 'Treg':['CD4','CD40LG','IL7R','FOXP3','IL2RA'],\n 'CD4+T cell':['PTPRC','CD3D','CD3E'],\n\n}" - }, - { - "action": "Calculate a dendrogram for the 'leiden' clusters using `sc.tl.dendrogram`. Create a dot plot using `sc.pl.dotplot` to visualize the expression of marker genes from `res_marker_dict` in each 'leiden' cluster. Include the dendrogram and standardize the scale by variable.", - "code": "sc.tl.dendrogram(adata,'leiden')\nsc.pl.dotplot(adata, res_marker_dict, 'leiden', \n dendrogram=True,standard_scale='var')" - }, - { - "action": "Create a dictionary `cluster2annotation` to map 'leiden' cluster IDs to manual annotation labels based on the dot plot. Annotate the cells in `adata` using `ov.single.scanpy_cellanno_from_dict` based on the `cluster2annotation` dictionary and 'leiden' clustering.", - "code": "# create a dictionary to map cluster to annotation label\ncluster2annotation = {\n '0': 'T cell',\n '1': 'T cell',\n '2': 'Monocyte',#Germ-cell(Oid)\n '3': 'B cell',#Germ-cell(Oid)\n '4': 'T cell',\n '5': 'Macrophage',\n '6': 'NKT cells',\n '7': 'T cell',\n '8':'Monocyte',\n '9':'Dendritic cell',\n '10':'Megakaryocyte',\n\n}\nov.single.scanpy_cellanno_from_dict(adata,anno_dict=cluster2annotation,\n clustertype='leiden')" - }, - { - "action": "Compare the automatic annotation results ('scsa_celltype_cellmarker') with the manual annotation ('major_celltype') by visualizing them on the 'X_mde' embedding using `ov.utils.embedding`. Customize the legend, frame, and color palette.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color=['major_celltype','scsa_celltype_cellmarker'], \n legend_loc='on data', frameon='small',legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Obtain the marker genes for each cell type using `ov.single.get_celltype_marker`. Specify 'scsa_celltype_cellmarker' as the cluster type.", - "code": "marker_dict=ov.single.get_celltype_marker(adata,clustertype='scsa_celltype_cellmarker')\nmarker_dict.keys()" - }, - { - "action": "Print the marker genes for 'B cell' from the `marker_dict`.", - "code": "marker_dict['B cell']" - }, - { - "action": "Retrieve the available tissues in the database using `scsa.get_model_tissue()`.", - "code": "scsa.get_model_tissue()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cellfate.json b/rag_engine/ovrawmjson/t_cellfate.json deleted file mode 100644 index f8de33cd..00000000 --- a/rag_engine/ovrawmjson/t_cellfate.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, pandas, and tqdm. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport pandas as pd\nfrom tqdm.auto import tqdm\nov.plot_set()" - }, - { - "action": "Load the mouse hematopoiesis data from Nestorowa et al. (2016) using `ov.single.mouse_hsc_nestorowa16()`.", - "code": "adata = ov.single.mouse_hsc_nestorowa16()\nadata" - }, - { - "action": "Load the human prior interaction network from the 'nichenet' dataset using `ov.single.load_human_prior_interaction_network()`.", - "code": "prior_network = ov.single.load_human_prior_interaction_network(dataset='nichenet')" - }, - { - "action": "Convert the gene symbols in the prior network from human to mouse using `ov.single.convert_human_to_mouse_network()`.", - "code": "prior_network = ov.single.convert_human_to_mouse_network(prior_network,server_name='asia')\nprior_network" - }, - { - "action": "Save the converted prior network to a compressed tab-separated file.", - "code": "prior_network.to_csv('result/combined_network_Mouse.txt.gz',sep='\t')" - }, - { - "action": "Alternatively, read the prior network from the saved file using `ov.read()`.", - "code": "prior_network=ov.read('result/combined_network_Mouse.txt.gz',index_col=0)" - }, - { - "action": "Initialize the CEFCON object with the AnnData object, prior network, number of repeats, and solver.", - "code": "CEFCON_obj = ov.single.pyCEFCON(adata, prior_network, repeats=5, solver='GUROBI')\nCEFCON_obj" - }, - { - "action": "Preprocess the data for CEFCON analysis using `CEFCON_obj.preprocess()`.", - "code": "CEFCON_obj.preprocess()" - }, - { - "action": "Train the CEFCON model using `CEFCON_obj.train()`.", - "code": "CEFCON_obj.train()" - }, - { - "action": "Identify driver regulators for each lineage using `CEFCON_obj.predicted_driver_regulators()`.", - "code": "CEFCON_obj.predicted_driver_regulators()" - }, - { - "action": "Display the top driver regulators for the 'E_pseudotime' lineage.", - "code": "CEFCON_obj.cefcon_results_dict['E_pseudotime'].driver_regulator.head()" - }, - { - "action": "Predict regulon-like gene modules (RGMs) using `CEFCON_obj.predicted_RGM()`.", - "code": "CEFCON_obj.predicted_RGM()" - }, - { - "action": "Access the results for the 'E_pseudotime' lineage.", - "code": "CEFCON_obj.cefcon_results_dict['E_pseudotime']" - }, - { - "action": "Store the lineage name and results in variables.", - "code": "lineage = 'E_pseudotime'\nresult = CEFCON_obj.cefcon_results_dict[lineage]" - }, - { - "action": "Create an AnnData object from the gene embeddings.", - "code": "gene_ad=sc.AnnData(result.gene_embedding)" - }, - { - "action": "Compute the neighborhood graph of the gene embeddings.", - "code": "sc.pp.neighbors(gene_ad, n_neighbors=30, use_rep='X')" - }, - { - "action": "Perform Leiden clustering on the gene embeddings.", - "code": "sc.tl.leiden(gene_ad, resolution=1)" - }, - { - "action": "Compute UMAP embeddings for the gene embeddings.", - "code": "sc.tl.umap(gene_ad, n_components=2, min_dist=0.3)" - }, - { - "action": "Plot the Leiden clustering results on the UMAP embeddings.", - "code": "ov.utils.embedding(gene_ad,basis='X_umap',legend_loc='on data',\n legend_fontsize=8, legend_fontoutline=2,\n color='leiden',frameon='small',title='Leiden clustering using CEFCON\\nderived gene embeddings')" - }, - { - "action": "Prepare data for plotting influence scores of driver regulators.", - "code": "import matplotlib.pyplot as plt\nimport seaborn as sns\ndata_for_plot = result.driver_regulator[result.driver_regulator['is_driver_regulator']]\ndata_for_plot = data_for_plot[0:20]" - }, - { - "action": "Create a horizontal bar plot of influence scores for the top 20 driver regulators.", - "code": "plt.figure(figsize=(2, 20 * 0.2))\nsns.set_theme(style='ticks', font_scale=0.5)\n\nax = sns.barplot(x='influence_score', y=data_for_plot.index, data=data_for_plot, orient='h',\n palette=sns.color_palette(f\"ch:start=.5,rot=-.5,reverse=1,dark=0.4\", n_colors=20))\nax.set_title(result.name)\nax.set_xlabel('Influence score')\nax.set_ylabel('Driver regulators')\n\nax.spines['left'].set_position(('outward', 10))\nax.spines['bottom'].set_position(('outward', 10))\nplt.xticks(fontsize=12)\nplt.yticks(fontsize=12)\n\nplt.grid(False)\nax.spines['top'].set_visible(False)\nax.spines['right'].set_visible(False)\nax.spines['bottom'].set_visible(True)\nax.spines['left'].set_visible(True)\n\nplt.title('E_pseudotime',fontsize=12)\nplt.xlabel('Influence score',fontsize=12)\nplt.ylabel('Driver regulon',fontsize=12)\n\nsns.despine()" - }, - { - "action": "Plot a Venn diagram of driver genes using `result.plot_driver_genes_Venn()`.", - "code": "result.plot_driver_genes_Venn()" - }, - { - "action": "Create a subset of the AnnData object containing cells from the specific lineage.", - "code": "adata_lineage = adata[adata.obs_names[adata.obs[result.name].notna()],:]" - }, - { - "action": "Plot a heatmap of the RGM activity matrix using `result.plot_RGM_activity_heatmap()`.", - "code": "result.plot_RGM_activity_heatmap(cell_label=adata_lineage.obs['cell_type_finely'],\n type='out',col_cluster=True,bbox_to_anchor=(1.48, 0.25))" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cellfate_gene.json b/rag_engine/ovrawmjson/t_cellfate_gene.json deleted file mode 100644 index 2255abde..00000000 --- a/rag_engine/ovrawmjson/t_cellfate_gene.json +++ /dev/null @@ -1,186 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scvelo, and matplotlib. Set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scvelo as scv\nimport matplotlib.pyplot as plt\nov.ov_plot_set()" - }, - { - "action": "Load the dentategyrus dataset using `scv.datasets.dentategyrus()`.", - "code": "adata = scv.datasets.dentategyrus()\nadata" - }, - { - "action": "Perform quality control on the dataset using `ov.pp.qc()`, filtering cells based on mitochondrial percentage, number of UMIs, and number of detected genes.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.15, 'nUMIs': 500, 'detected_genes': 250},\n )" - }, - { - "action": "Store the raw counts in the 'counts' layer using `ov.utils.store_layers()`.", - "code": "ov.utils.store_layers(adata,layers='counts')\nadata" - }, - { - "action": "Preprocess the dataset using `ov.pp.preprocess()` with 'shiftlog|pearson' mode and selecting 2000 highly variable genes.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',\n n_HVGs=2000)" - }, - { - "action": "Store the raw data in `adata.raw` and subset the data to include only highly variable genes.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Scale the data using `ov.pp.scale()` and perform PCA using `ov.pp.pca()` on the scaled data with 50 principal components. Then, apply MDE to the PCA results.", - "code": "ov.pp.scale(adata)\nov.pp.pca(adata,layer='scaled',n_pcs=50)\n\nadata.obsm[\"X_mde_pca\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])" - }, - { - "action": "Convert the raw data back to an AnnData object.", - "code": "adata=adata.raw.to_adata()" - }, - { - "action": "Create an embedding plot using `ov.utils.embedding()` based on 'X_mde_pca' and color the points by 'clusters'.", - "code": "fig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(adata,\n basis='X_mde_pca',frameon='small',\n color=['clusters'],show=False,ax=ax)" - }, - { - "action": "Filter out cells belonging to the 'Endothelial' cluster and initialize a SEACells model.", - "code": "import SEACells\nadata=adata[adata.obs['clusters']!='Endothelial']\nmodel = SEACells.core.SEACells(adata, \n build_kernel_on='scaled|original|X_pca', \n n_SEACells=200, \n n_waypoint_eigs=10,\n convergence_epsilon = 1e-5)" - }, - { - "action": "Construct the kernel matrix using the initialized SEACells model.", - "code": "model.construct_kernel_matrix()\nM = model.kernel_matrix\n# Initialize archetypes\nmodel.initialize_archetypes()" - }, - { - "action": "Fit the SEACells model with a minimum of 10 and a maximum of 50 iterations.", - "code": "model.fit(min_iter=10, max_iter=50)" - }, - { - "action": "Plot the convergence of the SEACells model.", - "code": "# Check for convergence \nget_ipython().run_line_magic('matplotlib', 'inline')\nmodel.plot_convergence()" - }, - { - "action": "Force the model to run additional iterations using the `model.step()` function.", - "code": "# You can force the model to run additional iterations step-wise using the .step() function\nprint(f'Run for {len(model.RSS_iters)} iterations')\nfor _ in range(10):\n model.step()\nprint(f'Run for {len(model.RSS_iters)} iterations')" - }, - { - "action": "Plot the convergence of the SEACells model again.", - "code": "# Check for convergence \nget_ipython().run_line_magic('matplotlib', 'inline')\nmodel.plot_convergence()" - }, - { - "action": "Plot a 2D representation of the Dentategyrus Metacells using `SEACells.plot.plot_2D()`.", - "code": "get_ipython().run_line_magic('matplotlib', 'inline')\nSEACells.plot.plot_2D(adata, key='X_mde_pca', colour_metacells=False,\n figsize=(4,4),cell_size=20,title='Dentategyrus Metacells',\n )" - }, - { - "action": "Set `adata.raw` to a copy of `adata`.", - "code": "adata.raw=adata.copy()" - }, - { - "action": "Summarize the data by soft SEACells using `SEACells.core.summarize_by_soft_SEACell()`.", - "code": "SEACell_soft_ad = SEACells.core.summarize_by_soft_SEACell(adata, model.A_, \n celltype_label='clusters',\n summarize_layer='raw', minimum_weight=0.05)\nSEACell_soft_ad" - }, - { - "action": "Set `SEACell_soft_ad.raw` to a copy of `SEACell_soft_ad` and identify highly variable genes.", - "code": "import scanpy as sc\nSEACell_soft_ad.raw=SEACell_soft_ad.copy()\nsc.pp.highly_variable_genes(SEACell_soft_ad, n_top_genes=2000, inplace=True)\nSEACell_soft_ad=SEACell_soft_ad[:,SEACell_soft_ad.var.highly_variable]" - }, - { - "action": "Scale the data in `SEACell_soft_ad`, perform PCA, compute neighbors, and generate a UMAP embedding.", - "code": "ov.pp.scale(SEACell_soft_ad)\nov.pp.pca(SEACell_soft_ad,layer='scaled',n_pcs=50)\nsc.pp.neighbors(SEACell_soft_ad, use_rep='scaled|original|X_pca')\nsc.tl.umap(SEACell_soft_ad)" - }, - { - "action": "Set the 'celltype' observation to a categorical type and reorder categories based on `adata.obs['clusters']`. Also, set the color palette.", - "code": "SEACell_soft_ad.obs['celltype']=SEACell_soft_ad.obs['celltype'].astype('category')\nSEACell_soft_ad.obs['celltype']=SEACell_soft_ad.obs['celltype'].cat.reorder_categories(adata.obs['clusters'].cat.categories)\nSEACell_soft_ad.uns['celltype_colors']=adata.uns['clusters_colors']" - }, - { - "action": "Create an embedding plot of the metacells using `ov.utils.embedding()` based on 'X_umap' and color the points by 'celltype'.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(SEACell_soft_ad,\n basis='X_umap',\n color=[\"celltype\"],\n title='Meta Celltype',\n frameon='small',\n legend_fontsize=12,\n #palette=ov.utils.palette()[11:],\n ax=ax,\n show=False)" - }, - { - "action": "Calculate pseudotime using pyVIA with manually adjusted parameters.", - "code": "v0 = ov.single.pyVIA(adata=SEACell_soft_ad,adata_key='scaled|original|X_pca',\n adata_ncomps=50, basis='X_umap',\n clusters='celltype',knn=10, root_user=['nIPC','Neuroblast'],\n dataset='group', \n random_seed=112,is_coarse=True, \n preserve_disconnected=True,\n piegraph_arrow_head_width=0.05,piegraph_edgeweight_scalingfactor=2.5,\n gene_matrix=SEACell_soft_ad.X,velo_weight=0.5,\n edgebundle_pruning_twice=False, edgebundle_pruning=0.15, \n jac_std_global=0.05,too_big_factor=0.05,\n cluster_graph_pruning_std=1,\n time_series=False,\n )\n\nv0.run()" - }, - { - "action": "Obtain the pseudotime values and store them in the `SEACell_soft_ad` object.", - "code": "v0.get_pseudotime(SEACell_soft_ad)" - }, - { - "action": "Create an embedding plot using `ov.utils.embedding()` based on 'X_umap' and color the points by 'pt_via' (pseudotime).", - "code": "#v0.get_pseudotime(SEACell_soft_ad)\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(SEACell_soft_ad,\n basis='X_umap',\n color=[\"pt_via\"],\n title='Pseudotime',\n frameon='small',\n cmap='Reds',\n #size=40,\n legend_fontsize=12,\n #palette=ov.utils.palette()[11:],\n ax=ax,\n show=False)" - }, - { - "action": "Save the metacell results to an h5ad file.", - "code": "SEACell_soft_ad.write_h5ad('data/tutorial_meta_den.h5ad',compression='gzip')" - }, - { - "action": "Read the metacell results from the h5ad file.", - "code": "SEACell_soft_ad=ov.utils.read('data/tutorial_meta_den.h5ad')" - }, - { - "action": "Initialize the cellfategenie object with the metacell data and pseudotime.", - "code": "cfg_obj=ov.single.cellfategenie(SEACell_soft_ad,pseudotime='pt_via')\ncfg_obj.model_init()" - }, - { - "action": "Perform Adaptive Threshold Regression (ATR) to find the minimum number of genes for accurate regression.", - "code": "cfg_obj.ATR(stop=500,flux=0.01)" - }, - { - "action": "Plot the filtering results from the ATR analysis.", - "code": "fig,ax=cfg_obj.plot_filtering(color='#5ca8dc')\nax.set_title('Dentategyrus Metacells\\nCellFateGenie')" - }, - { - "action": "Fit the model and obtain the results.", - "code": "res=cfg_obj.model_fit()" - }, - { - "action": "Plot the color fitting for the raw data type, colored by cell type.", - "code": "cfg_obj.plot_color_fitting(type='raw',cluster_key='celltype')" - }, - { - "action": "Plot the color fitting for the filtered data type, colored by cell type.", - "code": "cfg_obj.plot_color_fitting(type='filter',cluster_key='celltype')" - }, - { - "action": "Perform Kendalltau test to filter genes based on trend significance.", - "code": "kt_filter=cfg_obj.kendalltau_filter()\nkt_filter.head()" - }, - { - "action": "Extract gene names with p-value less than the mean p-value and calculate gene trends.", - "code": "var_name=kt_filter.loc[kt_filter['pvalue']=0.\n result_precision = 3, # Sets the rounding for the mean values in significan_means.\n pvalue = 0.05, # P-value threshold to employ for significance.\n subsampling = False, # To enable subsampling the data (geometri sketching).\n subsampling_log = False, # (mandatory) enable subsampling log1p for non log-transformed data inputs.\n subsampling_num_pc = 100, # Number of componets to subsample via geometric skectching (dafault: 100).\n subsampling_num_cells = 1000, # Number of cells to subsample (integer) (default: 1/3 of the dataset).\n separator = '|', # Sets the string to employ to separate cells in the results dataframes \"cellA|CellB\".\n debug = False, # Saves all intermediate tables employed during the analysis in pkl format.\n output_path = out_path, # Path to save results.\n output_suffix = None # Replaces the timestamp in the output files by a user defined string in the (default: None).\n )" - }, - { - "action": "Save the CellPhoneDB results to a pickle file.", - "code": "ov.utils.save(cpdb_results,'data/cpdb/gex_cpdb_test.pkl')" - }, - { - "action": "Load the CellPhoneDB results from a pickle file.", - "code": "cpdb_results=ov.utils.load('data/cpdb/gex_cpdb_test.pkl')" - }, - { - "action": "Calculate cell-cell interaction network using `ov.single.cpdb_network_cal`.", - "code": "interaction=ov.single.cpdb_network_cal(adata = adata,\n pvals = cpdb_results['pvalues'],\n celltype_key = \"cell_labels\",)" - }, - { - "action": "Display the head of the interaction edges DataFrame.", - "code": "interaction['interaction_edges'].head()" - }, - { - "action": "Set the plotting style using `ov.plot_set()`.", - "code": "ov.plot_set()" - }, - { - "action": "Create and display a heatmap of cell-cell interactions.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_heatmap(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n fontsize=11,\n ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',})" - }, - { - "action": "Create and display a heatmap of cell-cell interactions for specific source cells.", - "code": "fig, ax = plt.subplots(figsize=(2,4)) \nov.pl.cpdb_heatmap(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n source_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'],\n ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',})" - }, - { - "action": "Create and display a chord diagram of cell-cell interactions.", - "code": "fig=ov.pl.cpdb_chord(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n count_min=60,fontsize=12,padding=50,radius=100,save=None,)\nfig.show()" - }, - { - "action": "Create and display a network graph of cell-cell interactions.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n counts_min=60,\n nodesize_scale=5,\n ax=ax)" - }, - { - "action": "Create and display a network graph of cell-cell interactions for specific source cells.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n counts_min=60,\n nodesize_scale=5,\n source_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'],\n ax=ax)" - }, - { - "action": "Create and display a network graph of cell-cell interactions for specific target cells.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(adata,interaction['interaction_edges'],celltype_key='cell_labels',\n counts_min=60,\n nodesize_scale=5,\n target_cells=['EVT_1','EVT_2','dNK1','dNK2','dNK3'],\n ax=ax)" - }, - { - "action": "Plot a network of cell-cell interactions with customized appearance.", - "code": "ov.single.cpdb_plot_network(adata=adata,\n interaction_edges=interaction['interaction_edges'],\n celltype_key='cell_labels',\n nodecolor_dict=None,title='EVT Network',\n edgeswidth_scale=25,nodesize_scale=10,\n pos_scale=1,pos_size=10,figsize=(6,6),\n legend_ncol=3,legend_bbox=(0.8,0.2),legend_fontsize=10)" - }, - { - "action": "Extract a subnetwork of interactions based on specified cell types.", - "code": "sub_i=interaction['interaction_edges']\nsub_i=sub_i.loc[sub_i['SOURCE'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])]\nsub_i=sub_i.loc[sub_i['TARGET'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])]" - }, - { - "action": "Create a sub-anndata object containing only specified cell types.", - "code": "sub_adata=adata[adata.obs['cell_labels'].isin(['EVT_1','EVT_2','dNK1','dNK2','dNK3'])]\nsub_adata" - }, - { - "action": "Plot the sub-interaction network between the cells in scRNA-seq for the extracted subnetwork.", - "code": "ov.single.cpdb_plot_network(adata=sub_adata,\n interaction_edges=sub_i,\n celltype_key='cell_labels',\n nodecolor_dict=None,title='Sub-EVT Network',\n edgeswidth_scale=25,nodesize_scale=1,\n pos_scale=1,pos_size=10,figsize=(5,5),\n legend_ncol=3,legend_bbox=(0.8,0.2),legend_fontsize=10)" - }, - { - "action": "Create and display a chord diagram for the subnetwork.", - "code": "fig=ov.pl.cpdb_chord(sub_adata,sub_i,celltype_key='cell_labels',\n count_min=10,fontsize=12,padding=60,radius=100,save=None,)\nfig.show()" - }, - { - "action": "Create and display a network graph for the subnetwork.", - "code": "fig, ax = plt.subplots(figsize=(4,4)) \nov.pl.cpdb_network(sub_adata,sub_i,celltype_key='cell_labels',\n counts_min=10,\n nodesize_scale=5,\n ax=ax)" - }, - { - "action": "Create and display a heatmap for the subnetwork.", - "code": "fig, ax = plt.subplots(figsize=(3,3)) \nov.pl.cpdb_heatmap(sub_adata,sub_i,celltype_key='cell_labels',\n ax=ax,legend_kws={'fontsize':12,'bbox_to_anchor':(5, -0.9),'loc':'center left',})" - }, - { - "action": "Extract significant interactions where 'eEVT' and 'iEVT' are targets using `ov.single.cpdb_exact_target`.", - "code": "sub_means=ov.single.cpdb_exact_target(cpdb_results['means'],['eEVT','iEVT'])\nsub_means=ov.single.cpdb_exact_source(sub_means,['dNK1','dNK2','dNK3'])\nsub_means.head() " - }, - { - "action": "Plot a heatmap of interacting proteins between specified source and target cells.", - "code": "ov.pl.cpdb_interacting_heatmap(adata=adata,\n celltype_key='cell_labels',\n means=cpdb_results['means'],\n pvalues=cpdb_results['pvalues'],\n source_cells=['dNK1','dNK2','dNK3'],\n target_cells=['eEVT','iEVT'],\n plot_secret=True,\n min_means=3,\n nodecolor_dict=None,\n ax=None,\n figsize=(2,6),\n fontsize=10,)" - }, - { - "action": "Plot a grouped heatmap showing the expression of ligands in source cells and receptors in target cells.", - "code": "ov.pl.cpdb_group_heatmap(adata=adata,\n celltype_key='cell_labels',\n means=cpdb_results['means'],\n cmap={'Target':'Blues','Source':'Reds'},\n source_cells=['dNK1','dNK2','dNK3'],\n target_cells=['eEVT','iEVT'],\n plot_secret=True,\n min_means=3,\n nodecolor_dict=None,\n ax=None,\n figsize=(2,6),\n fontsize=10,)" - }, - { - "action": "Plot an interacting network graph showing connections between ligands, receptors, source, and target cells.", - "code": "ov.pl.cpdb_interacting_network(adata=adata,\n celltype_key='cell_labels',\n means=cpdb_results['means'],\n source_cells=['dNK1','dNK2','dNK3'],\n target_cells=['eEVT','iEVT'],\n means_min=1,\n means_sum_min=1, \n nodecolor_dict=None,\n ax=None,\n figsize=(6,6),\n fontsize=10)" - }, - { - "action": "Filter out rows with missing gene_a or gene_b, and combine gene_a and gene_b into a single list for enrichment analysis.", - "code": "sub_means=sub_means.loc[~sub_means['gene_a'].isnull()]\nsub_means=sub_means.loc[~sub_means['gene_b'].isnull()]\nenrichr_genes=sub_means['gene_a'].tolist()+sub_means['gene_b'].tolist()" - }, - { - "action": "Prepare a pathway dictionary for gene set enrichment analysis using human organism data.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Human')" - }, - { - "action": "Perform gene set enrichment analysis on the list of genes using the prepared pathway dictionary.", - "code": "#deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()\nenr=ov.bulk.geneset_enrichment(gene_list=enrichr_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='human')" - }, - { - "action": "Set the plotting style and create a gene set enrichment plot with specified parameters.", - "code": "ov.plot_set()\nov.bulk.geneset_plot(enr,figsize=(2,4),fig_title='GO-Bio(EVT)',\n cax_loc=[2, 0.45, 0.5, 0.02],num=8,\n bbox_to_anchor_used=(-0.25, -13),custom_ticks=[10,100],\n cmap='Greens')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cluster.json b/rag_engine/ovrawmjson/t_cluster.json deleted file mode 100644 index 01e9bb21..00000000 --- a/rag_engine/ovrawmjson/t_cluster.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\nov.plot_set()" - }, - { - "action": "Import the scvelo library and load the dentategyrus dataset using `scv.datasets.dentategyrus()`. The dataset is stored in the `adata` variable.", - "code": "import scvelo as scv\nadata=scv.datasets.dentategyrus()\nadata" - }, - { - "action": "Preprocess the `adata` object using `ov.pp.preprocess()`. The preprocessing steps include shifting and logging the data, applying Pearson residuals, and selecting the top 3000 highly variable genes. The preprocessed data is stored back in `adata`. The raw data is saved in `adata.raw`, and only the highly variable genes are kept. Finally, the data is scaled and PCA is performed using `ov.pp.scale()` and `ov.pp.pca()`.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=3000,)\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nov.pp.scale(adata)\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Plot the variance ratio explained by each principal component using `ov.utils.plot_pca_variance_ratio()`.", - "code": "ov.utils.plot_pca_variance_ratio(adata)" - }, - { - "action": "Compute the k-nearest neighbor graph using `sc.pp.neighbors()`. The number of neighbors is set to 15, the number of principal components is set to 50, and the representation used is 'scaled|original|X_pca'. Then, perform Leiden clustering using `ov.utils.cluster()` with `method='leiden'` and `resolution=1`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')\nov.utils.cluster(adata,method='leiden',resolution=1)" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and Leiden clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','leiden'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','leiden'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Compute the k-nearest neighbor graph using `sc.pp.neighbors()`. The number of neighbors is set to 15, the number of principal components is set to 50, and the representation used is 'scaled|original|X_pca'. Then, perform Louvain clustering using `ov.utils.cluster()` with `method='louvain'` and `resolution=1`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')\nov.utils.cluster(adata,method='louvain',resolution=1)" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and Louvain clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','louvain'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','louvain'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Perform Gaussian Mixture Model (GMM) clustering using `ov.utils.cluster()`. The representation used is 'scaled|original|X_pca', `method` is set to 'GMM', `n_components` is set to 21, `covariance_type` is set to 'full', `tol` is set to 1e-9, and `max_iter` is set to 1000.", - "code": "ov.utils.cluster(adata,use_rep='scaled|original|X_pca',\n method='GMM',n_components=21,\n covariance_type='full',tol=1e-9, max_iter=1000, )" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and GMM clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','gmm_cluster'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','gmm_cluster'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Initialize the Latent Dirichlet Allocation (LDA) model using `ov.utils.LDA_topic()`. The `feature_type` is set to 'expression', `highly_variable_key` is set to 'highly_variable_features', `layers` is set to 'counts', `batch_key` is set to None, and `learning_rate` is set to 1e-3.", - "code": "LDA_obj=ov.utils.LDA_topic(adata,feature_type='expression',\n highly_variable_key='highly_variable_features',\n layers='counts',batch_key=None,learning_rate=1e-3)" - }, - { - "action": "Plot the topic contributions for the first 6 topics using `LDA_obj.plot_topic_contributions()`.", - "code": "LDA_obj.plot_topic_contributions(6)" - }, - { - "action": "Predict the topic distribution for each cell using 13 topics with `LDA_obj.predicted()`.", - "code": "LDA_obj.predicted(13)" - }, - { - "action": "Generate UMAP embeddings and visualize the distribution of topics across cells using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to `LDA_obj.model.topic_cols`, `cmap` is set to 'BuPu', `ncols` is set to 4, `add_outline` is set to True, and `frameon` is set to 'small'.", - "code": "ov.plot_set()\nov.utils.embedding(adata, basis='X_umap',color = LDA_obj.model.topic_cols, cmap='BuPu', ncols=4,\n add_outline=True, frameon='small',)" - }, - { - "action": "Generate UMAP embeddings and visualize the clusters and LDA clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['clusters','LDA_cluster'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','LDA_cluster'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Refine the LDA clustering results using a random forest classifier. Cells with LDA greater than 0.4 are used as a primitive class. The random forest model is trained on these cells and then used to classify cells with LDA less than 0.4. This is done using `LDA_obj.get_results_rfc()`.", - "code": "LDA_obj.get_results_rfc(adata,use_rep='scaled|original|X_pca',\n LDA_threshold=0.4,num_topics=13)" - }, - { - "action": "Generate UMAP embeddings and visualize the refined LDA clustering results using `ov.utils.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['LDA_cluster_rfc','LDA_cluster_clf'], `frameon` is set to 'small', and `wspace` is set to 0.5.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['LDA_cluster_rfc','LDA_cluster_clf'],\n frameon='small',wspace=0.5)" - }, - { - "action": "Convert the sparse matrix `adata.X` to a dense array using `adata.X.toarray()`.", - "code": "adata.X.toarray()" - }, - { - "action": "Initialize and run cNMF analysis. This includes initializing the `cnmf_obj` with specified parameters, factorizing the data, combining results, and generating a k-selection plot.", - "code": "import numpy as np\n## Initialize the cnmf object that will be used to run analyses\ncnmf_obj = ov.single.cNMF(adata,components=np.arange(5,11), n_iter=20, seed=14, num_highvar_genes=2000,\n output_dir='example_dg1/cNMF', name='dg_cNMF')\n## Specify that the jobs are being distributed over a single worker (total_workers=1) and then launch that worker\ncnmf_obj.factorize(worker_i=0, total_workers=4)\ncnmf_obj.combine(skip_missing_files=True)\ncnmf_obj.k_selection_plot(close_fig=False)" - }, - { - "action": "Perform consensus clustering with a selected number of components (K=7) and a density threshold of 2.00. The results are then loaded and used to annotate the `adata` object.", - "code": "selected_K = 7\ndensity_threshold = 2.00\ncnmf_obj.consensus(k=selected_K, \n density_threshold=density_threshold, \n show_clustering=True, \n close_clustergram_fig=False)\nresult_dict = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)\ncnmf_obj.get_results(adata,result_dict)" - }, - { - "action": "Generate UMAP embeddings and visualize the cNMF usage scores using `ov.pl.embedding()`. The `basis` is set to 'X_umap', `color` is set to the columns of `result_dict['usage_norm']`, `use_raw` is set to False, `ncols` is set to 3, `vmin` is set to 0, `vmax` is set to 1, and `frameon` is set to 'small'.", - "code": "ov.pl.embedding(adata, basis='X_umap',color=result_dict['usage_norm'].columns,\n use_raw=False, ncols=3, vmin=0, vmax=1,frameon='small')" - }, - { - "action": "Refine the cNMF clustering results using a random forest classifier, similar to the LDA refinement. Cells with cNMF usage greater than 0.5 are used as a primitive class, and the random forest model is trained on these cells to classify cells with cNMF usage less than 0.5. This is done using `cnmf_obj.get_results_rfc()`.", - "code": "cnmf_obj.get_results_rfc(adata,result_dict,\n use_rep='scaled|original|X_pca',\n cNMF_threshold=0.5)" - }, - { - "action": "Generate UMAP embeddings and visualize the refined cNMF clustering results using `ov.pl.embedding()`. The `basis` is set to 'X_umap', `color` is set to ['cNMF_cluster_rfc','cNMF_cluster_clf'], `frameon` is set to 'small', and other plotting parameters are specified.", - "code": "ov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['cNMF_cluster_rfc','cNMF_cluster_clf'],\n frameon='small',\n #title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Calculate the Adjusted Rand Index (ARI) for different clustering methods (Leiden, Louvain, GMM, LDA, LDA_rfc, LDA_clf, cNMF_rfc, cNMF_clf) compared to the 'clusters' annotation in `adata.obs`. The ARI values are printed for each method.", - "code": "from sklearn.metrics.cluster import adjusted_rand_score\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['leiden'])\nprint('Leiden, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['louvain'])\nprint('Louvain, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['gmm_cluster'])\nprint('GMM, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster'])\nprint('LDA, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster_rfc'])\nprint('LDA_rfc, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['LDA_cluster_clf'])\nprint('LDA_clf, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['cNMF_cluster_rfc'])\nprint('cNMF_rfc, Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(adata.obs['clusters'], adata.obs['cNMF_cluster_clf'])\nprint('cNMF_clf, Adjusted rand index = %.2f' %ARI)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cluster_space.json b/rag_engine/ovrawmjson/t_cluster_space.json deleted file mode 100644 index 0377be68..00000000 --- a/rag_engine/ovrawmjson/t_cluster_space.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from the specified path and count file. Make variable names unique.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics. Filter out genes with total counts less than 100. Identify spatially variable genes (SVGs) using the `prost` method and other specified parameters.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)" - }, - { - "action": "Write the processed AnnData object to an H5AD file with gzip compression.", - "code": "adata.write('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "Read the processed AnnData object from the H5AD file with gzip decompression.", - "code": "adata=ov.read('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "(Optional) Read ground truth annotations from a TSV file and add them to the AnnData object's observation metadata. Visualize the spatial distribution of the ground truth annotations.", - "code": "import pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0)\nAnn_df.columns = ['Ground Truth']\nadata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])" - }, - { - "action": "Define parameters for the GraphST clustering method, including device, number of principal components. Apply GraphST clustering to the AnnData object using specified parameters and log-normalization.", - "code": "methods_kwargs={}\nmethods_kwargs['GraphST']={ \n 'device':'cuda:0',\n 'n_pcs':30\n}\n\nadata=ov.space.clusters(adata,\n methods=['GraphST'],\n methods_kwargs=methods_kwargs,\n lognorm=1e4)" - }, - { - "action": "Perform mclust clustering on the GraphST representation, refine the labels, and convert the refined labels to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust',n_components=10,\n modelNames='EEV', random_state=112,\n )\nadata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust') \nadata.obs['mclust_GraphST']=adata.obs['mclust_GraphST'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclust_GraphST' labels using a specified threshold and visualize the merging process.", - "code": "res=ov.space.merge_cluster(adata,groupby='mclust_GraphST',use_rep='graphst|original|X_pca',\n threshold=0.2,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_GraphST', 'mclust_GraphST_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_GraphST','mclust_GraphST_tree','mclust','Ground Truth'])" - }, - { - "action": "Perform mclust_R clustering on the GraphST representation, refine the labels, convert them to categorical type, and merge clusters based on the refined labels.", - "code": "ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust_R',n_components=10,\n random_state=42,\n )\nadata.obs['mclust_R_GraphST'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') \nadata.obs['mclust_R_GraphST']=adata.obs['mclust_R_GraphST'].astype('category')\nres=ov.space.merge_cluster(adata,groupby='mclust_R_GraphST',use_rep='graphst|original|X_pca',\n threshold=0.2,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_R_GraphST', 'mclust_R_GraphST_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_R_GraphST','mclust_R_GraphST_tree','mclust','Ground Truth'])" - }, - { - "action": "Define parameters for the BINARY clustering method. Apply BINARY clustering to the AnnData object using specified parameters.", - "code": "methods_kwargs={}\nmethods_kwargs['BINARY']={ \n 'use_method':'KNN',\n 'cutoff':6,\n 'obs_key':'BINARY_sample',\n 'use_list':None,\n 'pos_weight':10,\n 'device':'cuda:0',\n 'hidden_dims':[512, 30],\n 'n_epochs': 1000,\n 'lr': 0.001,\n 'key_added': 'BINARY',\n 'gradient_clipping': 5,\n 'weight_decay': 0.0001,\n 'verbose': True,\n 'random_seed':0,\n 'lognorm':1e4,\n 'n_top_genes':2000,\n}\nadata=ov.space.clusters(adata,\n methods=['BINARY'],\n methods_kwargs=methods_kwargs)" - }, - { - "action": "Perform mclust_R clustering on the BINARY representation, refine the labels, and convert them to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='BINARY',method='mclust_R',n_components=10,\n random_state=42,\n )\nadata.obs['mclust_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') \nadata.obs['mclust_BINARY']=adata.obs['mclust_BINARY'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclust_BINARY' labels using a specified threshold and visualize the merging process.", - "code": "res=ov.space.merge_cluster(adata,groupby='mclust_BINARY',use_rep='BINARY',\n threshold=0.01,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_BINARY', 'mclust_BINARY_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_BINARY','mclust_BINARY_tree','mclust','Ground Truth'])" - }, - { - "action": "Perform mclust clustering on the BINARY representation using Python's implementation, refine the labels, and convert them to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='BINARY',method='mclust',n_components=10,\n modelNames='EEV', random_state=42,\n )\nadata.obs['mclustpy_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust') \nadata.obs['mclustpy_BINARY']=adata.obs['mclustpy_BINARY'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclustpy_BINARY' labels using a specified threshold and visualize the merging process.", - "code": "adata.obs['mclustpy_BINARY']=adata.obs['mclustpy_BINARY'].astype('category')\nres=ov.space.merge_cluster(adata,groupby='mclustpy_BINARY',use_rep='BINARY',\n threshold=0.013,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclustpy_BINARY', 'mclustpy_BINARY_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclustpy_BINARY','mclustpy_BINARY_tree','mclust','Ground Truth'])" - }, - { - "action": "Define parameters for the STAGATE clustering method. Apply STAGATE clustering to the AnnData object using specified parameters.", - "code": "methods_kwargs={}\nmethods_kwargs['STAGATE']={ \n 'num_batch_x':3,'num_batch_y':2,\n 'spatial_key':['X','Y'],'rad_cutoff':200,\n 'num_epoch':1000,'lr':0.001,\n 'weight_decay':1e-4,'hidden_dims':[512, 30],\n 'device':'cuda:0',\n #'n_top_genes':2000,\n}\n\nadata=ov.space.clusters(adata,\n methods=['STAGATE'],\n methods_kwargs=methods_kwargs)" - }, - { - "action": "Perform mclust_R clustering on the STAGATE representation, refine the labels, convert them to categorical type, and merge clusters based on the refined labels.", - "code": "ov.utils.cluster(adata,use_rep='STAGATE',method='mclust_R',n_components=10,\n random_state=112,\n )\nadata.obs['mclust_R_STAGATE'] = ov.utils.refine_label(adata, radius=30, key='mclust_R') \nadata.obs['mclust_R_STAGATE']=adata.obs['mclust_R_STAGATE'].astype('category')\nres=ov.space.merge_cluster(adata,groupby='mclust_R_STAGATE',use_rep='STAGATE',\n threshold=0.005,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_R_STAGATE', 'mclust_R_STAGATE_tree', 'mclust_R', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_R_STAGATE','mclust_R_STAGATE_tree','mclust_R','Ground Truth'])" - }, - { - "action": "Sort genes by their 'PI' values in descending order and display the top 5 genes.", - "code": "adata.var.sort_values('PI',ascending=False).head(5)" - }, - { - "action": "Visualize the spatial expression of a specific gene ('MBP') in both raw and STAGATE-denoised data.", - "code": "plot_gene = 'MBP'\nimport matplotlib.pyplot as plt\nfig, axs = plt.subplots(1, 2, figsize=(8, 4))\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[0], title='RAW_'+plot_gene, vmax='p99')\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[1], title='STAGATE_'+plot_gene, layer='STAGATE_ReX', vmax='p99')" - }, - { - "action": "Define parameters for the CAST clustering method. Apply CAST clustering to the AnnData object using specified parameters.", - "code": "methods_kwargs={}\nmethods_kwargs['CAST']={ \n 'output_path_t':'result/CAST_gas/output',\n 'device':'cuda:0',\n 'gpu_t':0\n}\nadata=ov.space.clusters(adata,\n methods=['CAST'],\n methods_kwargs=methods_kwargs)" - }, - { - "action": "Perform mclust clustering on the CAST representation, refine the labels, and convert them to categorical type.", - "code": "ov.utils.cluster(adata,use_rep='X_cast',method='mclust',n_components=10,\n modelNames='EEV', random_state=42,\n )\nadata.obs['mclust_CAST'] = ov.utils.refine_label(adata, radius=50, key='mclust') \nadata.obs['mclust_CAST']=adata.obs['mclust_CAST'].astype('category')" - }, - { - "action": "Merge clusters based on the 'mclust_CAST' labels using a specified threshold and visualize the merging process.", - "code": "res=ov.space.merge_cluster(adata,groupby='mclust_CAST',use_rep='X_cast',\n threshold=0.1,plot=True)" - }, - { - "action": "Visualize the spatial distribution of the 'mclust_CAST', 'mclust_CAST_tree', 'mclust', and 'Ground Truth' labels.", - "code": "sc.pl.spatial(adata, color=['mclust_CAST','mclust_CAST_tree','mclust','Ground Truth'])" - }, - { - "action": "Display the AnnData object.", - "code": "adata" - }, - { - "action": "Calculate and print the Adjusted Rand Index (ARI) for each clustering method compared to the ground truth.", - "code": "from sklearn.metrics.cluster import adjusted_rand_score\n\nobs_df = adata.obs.dropna()\n#GraphST\nARI = adjusted_rand_score(obs_df['mclust_GraphST'], obs_df['Ground Truth'])\nprint('mclust_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_R_GraphST'], obs_df['Ground Truth'])\nprint('mclust_R_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_R_STAGATE'], obs_df['Ground Truth'])\nprint('mclust_STAGATE: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_BINARY'], obs_df['Ground Truth'])\nprint('mclust_BINARY: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclustpy_BINARY'], obs_df['Ground Truth'])\nprint('mclustpy_BINARY: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_CAST'], obs_df['Ground Truth'])\nprint('mclust_CAST: Adjusted rand index = %.2f' %ARI)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cnmf.json b/rag_engine/ovrawmjson/t_cnmf.json deleted file mode 100644 index 1384f0da..00000000 --- a/rag_engine/ovrawmjson/t_cnmf.json +++ /dev/null @@ -1,110 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy, omicverse, and scvelo. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()\nimport scvelo as scv" - }, - { - "action": "Load the dentategyrus dataset using scvelo.", - "code": "adata=scv.datasets.dentategyrus()" - }, - { - "action": "Preprocess the AnnData object using omicverse. The preprocessing steps include shiftlog normalization, Pearson residual scaling, and selecting the top 2000 highly variable genes.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Scale the data and perform Principal Component Analysis (PCA) on the preprocessed AnnData object.", - "code": "ov.pp.scale(adata)\nov.pp.pca(adata)" - }, - { - "action": "Plot a UMAP embedding of the cells, colored by their cluster assignments.", - "code": "import matplotlib.pyplot as plt\nfrom matplotlib import patheffects\nfig, ax = plt.subplots(figsize=(4,4))\nov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['clusters'],\n frameon='small',\n title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n ax=ax,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Initialize a cNMF object with specified parameters, including the range of components (K values) to explore, number of iterations, random seed, number of highly variable genes, output directory, and name.", - "code": "import numpy as np\n## Initialize the cnmf object that will be used to run analyses\ncnmf_obj = ov.single.cNMF(adata,components=np.arange(5,11), n_iter=20, seed=14, num_highvar_genes=2000,\n output_dir='example_dg/cNMF', name='dg_cNMF')" - }, - { - "action": "Run the cNMF factorization on the specified worker.", - "code": "## Specify that the jobs are being distributed over a single worker (total_workers=1) and then launch that worker\ncnmf_obj.factorize(worker_i=0, total_workers=2)" - }, - { - "action": "Combine the results from different workers, skipping missing files.", - "code": "cnmf_obj.combine(skip_missing_files=True)" - }, - { - "action": "Generate a K selection plot to visualize the stability and error at each choice of K.", - "code": "cnmf_obj.k_selection_plot(close_fig=False)" - }, - { - "action": "Set the selected K value and density threshold for consensus clustering.", - "code": "selected_K = 7\ndensity_threshold = 2.00" - }, - { - "action": "Perform consensus clustering with the specified K value and density threshold, and visualize the clustering results.", - "code": "cnmf_obj.consensus(k=selected_K, \n density_threshold=density_threshold, \n show_clustering=True, \n close_clustergram_fig=False)" - }, - { - "action": "Update the density threshold based on the initial consensus clustering results.", - "code": "density_threshold = 0.10" - }, - { - "action": "Perform consensus clustering again with the updated density threshold.", - "code": "cnmf_obj.consensus(k=selected_K, \n density_threshold=density_threshold, \n show_clustering=True, \n close_clustergram_fig=False)" - }, - { - "action": "Visualize the distance matrix of the consensus spectra using a heatmap.", - "code": "import seaborn as sns\nimport matplotlib.pyplot as plt\nfrom matplotlib import patheffects\n\nfrom matplotlib import gridspec\nimport matplotlib.pyplot as plt\n\nwidth_ratios = [0.2, 4, 0.5, 10, 1]\nheight_ratios = [0.2, 4]\nfig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios)))\ngs = gridspec.GridSpec(len(height_ratios), len(width_ratios), fig,\n 0.01, 0.01, 0.98, 0.98,\n height_ratios=height_ratios,\n width_ratios=width_ratios,\n wspace=0, hspace=0)\n \nD = cnmf_obj.topic_dist[cnmf_obj.spectra_order, :][:, cnmf_obj.spectra_order]\ndist_ax = fig.add_subplot(gs[1,1], xscale='linear', yscale='linear',\n xticks=[], yticks=[],xlabel='', ylabel='',\n frameon=True)\ndist_im = dist_ax.imshow(D, interpolation='none', cmap='viridis',\n aspect='auto', rasterized=True)\n\nleft_ax = fig.add_subplot(gs[1,0], xscale='linear', yscale='linear', xticks=[], yticks=[],\n xlabel='', ylabel='', frameon=True)\nleft_ax.imshow(cnmf_obj.kmeans_cluster_labels.values[cnmf_obj.spectra_order].reshape(-1, 1),\n interpolation='none', cmap='Spectral', aspect='auto',\n rasterized=True)\n\ntop_ax = fig.add_subplot(gs[0,1], xscale='linear', yscale='linear', xticks=[], yticks=[],\n xlabel='', ylabel='', frameon=True)\ntop_ax.imshow(cnmf_obj.kmeans_cluster_labels.values[cnmf_obj.spectra_order].reshape(1, -1),\n interpolation='none', cmap='Spectral', aspect='auto',\n rasterized=True)\n\ncbar_gs = gridspec.GridSpecFromSubplotSpec(3, 3, subplot_spec=gs[1, 2],\n wspace=0, hspace=0)\ncbar_ax = fig.add_subplot(cbar_gs[1,2], xscale='linear', yscale='linear',\n xlabel='', ylabel='', frameon=True, title='Euclidean\\nDistance')\ncbar_ax.set_title('Euclidean\\nDistance',fontsize=12)\nvmin = D.min().min()\nvmax = D.max().max()\nfig.colorbar(dist_im, cax=cbar_ax,\n ticks=np.linspace(vmin, vmax, 3),\n )\ncbar_ax.set_yticklabels(cbar_ax.get_yticklabels(),fontsize=12)\n" - }, - { - "action": "Plot a histogram of the local density values and indicate the filtering threshold.", - "code": "density_filter = cnmf_obj.local_density.iloc[:, 0] < density_threshold\nfig, hist_ax = plt.subplots(figsize=(4,4))\n\n#hist_ax = fig.add_subplot(hist_gs[0,0], xscale='linear', yscale='linear',\n # xlabel='', ylabel='', frameon=True, title='Local density histogram')\nhist_ax.hist(cnmf_obj.local_density.values, bins=np.linspace(0, 1, 50))\nhist_ax.yaxis.tick_right()\n\nxlim = hist_ax.get_xlim()\nylim = hist_ax.get_ylim()\nif density_threshold < xlim[1]:\n hist_ax.axvline(density_threshold, linestyle='--', color='k')\n hist_ax.text(density_threshold + 0.02, ylim[1] * 0.95, 'filtering\\nthreshold\\n\\n', va='top')\nhist_ax.set_xlim(xlim)\nhist_ax.set_xlabel('Mean distance to k nearest neighbors\\n\\n%d/%d (%.0f%%) spectra above threshold\\nwere removed prior to clustering'%(sum(~density_filter), len(density_filter), 100*(~density_filter).mean()))\nhist_ax.set_title('Local density histogram')" - }, - { - "action": "Load the cNMF results for the selected K value and density threshold.", - "code": "result_dict = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)" - }, - { - "action": "Display the head of the normalized usage matrix.", - "code": "result_dict['usage_norm'].head()" - }, - { - "action": "Display the head of the GEP scores matrix.", - "code": "result_dict['gep_scores'].head()" - }, - { - "action": "Display the head of the GEP TPM matrix.", - "code": "result_dict['gep_tpm'].head()" - }, - { - "action": "Display the head of the top genes matrix.", - "code": "result_dict['top_genes'].head()" - }, - { - "action": "Assign cNMF cluster labels to cells in the AnnData object based on the loaded results.", - "code": "cnmf_obj.get_results(adata,result_dict)" - }, - { - "action": "Plot UMAP embeddings of the cells, colored by the cNMF usage values for each program.", - "code": "ov.pl.embedding(adata, basis='X_umap',color=result_dict['usage_norm'].columns,\n use_raw=False, ncols=3, vmin=0, vmax=1,frameon='small')" - }, - { - "action": "Plot a UMAP embedding of the cells, colored by their assigned cNMF cluster labels.", - "code": "ov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['cNMF_cluster'],\n frameon='small',\n #title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Assign cNMF cluster labels using a random forest classifier (RFC) based on the usage values and a specified threshold.", - "code": "cnmf_obj.get_results_rfc(adata,result_dict,\n use_rep='scaled|original|X_pca',\n cNMF_threshold=0.5)" - }, - { - "action": "Plot UMAP embeddings of the cells, colored by their assigned cNMF cluster labels from both the direct assignment and the RFC-based assignment.", - "code": "ov.pl.embedding(\n adata,\n basis=\"X_umap\",\n color=['cNMF_cluster_rfc','cNMF_cluster_clf'],\n frameon='small',\n #title=\"Celltypes\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n #size=10,\n #legend_loc=True, \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n)" - }, - { - "action": "Create a list of top genes for plotting.", - "code": "plot_genes=[]\nfor i in result_dict['top_genes'].columns:\n plot_genes+=result_dict['top_genes'][i][:3].values.reshape(-1).tolist()" - }, - { - "action": "Generate a dot plot of the top genes, grouped by cNMF cluster.", - "code": "sc.pl.dotplot(adata,plot_genes,\n \"cNMF_cluster\", dendrogram=False,standard_scale='var',)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_commot_flowsig.json b/rag_engine/ovrawmjson/t_commot_flowsig.json deleted file mode 100644 index 242a5220..00000000 --- a/rag_engine/ovrawmjson/t_commot_flowsig.json +++ /dev/null @@ -1,110 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\n#print(f\"omicverse version: {ov.__version__}\")\nimport scanpy as sc\n#print(f\"scanpy version: {sc.__version__}\")\nov.plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from the specified path and count file, and make variable names unique.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics and filter genes with total counts less than 100. Then, identify spatially variable genes using the `ov.space.svg` function with the 'prost' method.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)" - }, - { - "action": "Write the processed AnnData object to a compressed H5AD file.", - "code": "adata.write('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "Load ligand-receptor database from CellChat for secreted signaling in humans using `ov.externel.commot.pp.ligand_receptor_database`.", - "code": "df_cellchat = ov.externel.commot.pp.ligand_receptor_database(species='human', \n signaling_type='Secreted Signaling', \n database='CellChat')\nprint(df_cellchat.shape)" - }, - { - "action": "Filter the ligand-receptor database to include only pairs where both ligand and receptor are expressed in at least 5% of the spots using `ov.externel.commot.pp.filter_lr_database`.", - "code": "df_cellchat_filtered = ov.externel.commot.pp.filter_lr_database(df_cellchat, \n adata, \n min_cell_pct=0.05)\nprint(df_cellchat_filtered.shape)" - }, - { - "action": "Perform spatial communication inference using `ov.externel.commot.tl.spatial_communication` with specified parameters, including distance threshold and handling of heteromeric complexes.", - "code": "ov.externel.commot.tl.spatial_communication(adata,\n database_name='cellchat', \n df_ligrec=df_cellchat_filtered, \n dis_thr=500, heteromeric=True, \n pathway_sum=True)" - }, - { - "action": "Read ground truth annotations from a file and add them to the AnnData object. Visualize the spatial distribution of ground truth annotations using `sc.pl.spatial`.", - "code": "# read the annotation\nimport pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\\t', header=None, index_col=0)\nAnn_df.columns = ['Ground_Truth']\nadata.obs['Ground_Truth'] = Ann_df.loc[adata.obs_names, 'Ground_Truth']\nLayer_color=['#283b5c', '#d8e17b', '#838e44', '#4e8991', '#d08c35', '#511a3a',\n '#c2c2c2', '#dfc648']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground_Truth\"],palette=Layer_color)" - }, - { - "action": "Create a dictionary mapping ground truth categories to their corresponding colors.", - "code": "ct_color_dict=dict(zip(adata.obs['Ground_Truth'].cat.categories,\n adata.uns['Ground_Truth_colors']))" - }, - { - "action": "Display the head of the ligand-receptor dataframe from the CellChat database information.", - "code": "adata.uns['commot-cellchat-info']['df_ligrec'].head()" - }, - { - "action": "Determine the spatial direction of the FGF signaling pathway using `ov.externel.commot.tl.communication_direction`. Visualize the cell communication for the FGF pathway using `ov.externel.commot.pl.plot_cell_communication` with specified parameters.", - "code": "import matplotlib.pyplot as plt\nscale=0.000008\nk=5\ngoal_pathway='FGF'\nov.externel.commot.tl.communication_direction(adata, database_name='cellchat', pathway_name=goal_pathway, k=k)\nov.externel.commot.pl.plot_cell_communication(adata, database_name='cellchat', \n pathway_name='FGF', plot_method='grid', \n background_legend=True,\n scale=scale, ndsize=8, grid_density=0.4, \n summary='sender', background='cluster', \n clustering='Ground_Truth', \n cluster_cmap=ct_color_dict,\n cmap='Alphabet',\n normalize_v = True, normalize_v_quantile=0.995)\nplt.title(f'Pathway:{goal_pathway}',fontsize=13)\n#plt.savefig('figures/TLE/TLE_cellchat_all_FGF.png',dpi=300,bbox_inches='tight')\n#fig.savefig('pdf/TLE/control_cellchat_all_FGF.pdf',dpi=300,bbox_inches='tight')" - }, - { - "action": "Write the AnnData object with COMMOT results to a compressed H5AD file.", - "code": "adata.write('data/151676_commot.h5ad',compression='gzip')" - }, - { - "action": "Read the AnnData object with COMMOT results from the H5AD file.", - "code": "adata=ov.read('data/151676_commot.h5ad')" - }, - { - "action": "Copy the normalized data to a new layer in the AnnData object.", - "code": "adata.layers['normalized'] = adata.X.copy()" - }, - { - "action": "Construct gene expression modules (GEMs) using non-negative matrix factorization (NMF) with `ov.externel.flowsig.pp.construct_gems_using_nmf`.", - "code": "# We construct 10 gene expression modules using the raw cell count.\nov.externel.flowsig.pp.construct_gems_using_nmf(adata,\n n_gems = 10,\n layer_key = 'counts',\n )" - }, - { - "action": "Retrieve the top genes for a specific GEM using `ov.externel.flowsig.ul.get_top_gem_genes`.", - "code": "goal_gem='GEM-5'\ngem_gene=ov.externel.flowsig.ul.get_top_gem_genes(adata=adata,\n gems=[goal_gem],\n n_genes=100,\n gene_type='all',\n method = 'nmf',\n )\ngem_gene.head()" - }, - { - "action": "Construct flow expression matrices using `ov.externel.flowsig.pp.construct_flows_from_commot` with specified parameters.", - "code": "commot_output_key = 'commot-cellchat'\n# We first construct the potential cellular flows from the commot output\nov.externel.flowsig.pp.construct_flows_from_commot(adata,\n commot_output_key,\n gem_expr_key = 'X_gem',\n scale_gem_expr = True,\n flowsig_network_key = 'flowsig_network',\n flowsig_expr_key = 'X_flow')" - }, - { - "action": "Determine informative variables for spatial data using `ov.externel.flowsig.pp.determine_informative_variables` with a Moran's I threshold.", - "code": "# Then we subset for \"spatially flowing\" inflows and outflows\nov.externel.flowsig.pp.determine_informative_variables(adata, \n flowsig_expr_key = 'X_flow',\n flowsig_network_key = 'flowsig_network',\n spatial = True,\n moran_threshold = 0.15,\n coord_type = 'grid',\n n_neighbours = 8,\n library_key = None)" - }, - { - "action": "Perform k-means clustering on spatial coordinates and add the cluster labels to the AnnData object.", - "code": "from sklearn.cluster import KMeans\nimport pandas as pd\n\nkmeans = KMeans(n_clusters=10, random_state=0).fit(adata.obsm['spatial'])\nadata.obs['spatial_kmeans'] = pd.Series(kmeans.labels_, dtype='category').values" - }, - { - "action": "Learn intercellular flows using spatial block bootstrapping with `ov.externel.flowsig.tl.learn_intercellular_flows`.", - "code": "# # Now we are ready to learn the network\nov.externel.flowsig.tl.learn_intercellular_flows(adata,\n flowsig_key = 'flowsig_network',\n flow_expr_key = 'X_flow',\n use_spatial = True,\n block_key = 'spatial_kmeans',\n n_jobs = 4,\n n_bootstraps = 500)" - }, - { - "action": "Apply biological flow constraints to the network using `ov.externel.flowsig.tl.apply_biological_flow`.", - "code": "# This part is key for reducing false positives\nov.externel.flowsig.tl.apply_biological_flow(adata,\n flowsig_network_key = 'flowsig_network',\n adjacency_key = 'adjacency',\n validated_key = 'validated')" - }, - { - "action": "Filter low-confidence edges based on bootstrapped frequencies using `ov.externel.flowsig.tl.filter_low_confidence_edges`.", - "code": "edge_threshold = 0.7\n\nov.externel.flowsig.tl.filter_low_confidence_edges(adata,\n edge_threshold = edge_threshold,\n flowsig_network_key = 'flowsig_network',\n adjacency_key = 'adjacency_validated',\n filtered_key = 'filtered')" - }, - { - "action": "Write the AnnData object with COMMOT and flowsig results to a compressed H5AD file.", - "code": "adata.write('data/cortex_commot_flowsig.h5ad',compression='gzip')" - }, - { - "action": "Construct the directed NetworkX DiGraph object from the filtered adjacency matrix using `ov.externel.flowsig.tl.construct_intercellular_flow_network`.", - "code": "flow_network = ov.externel.flowsig.tl.construct_intercellular_flow_network(adata,\n flowsig_network_key = 'flowsig_network',\n adjacency_key = 'adjacency_validated_filtered')" - }, - { - "action": "Create a subset of the AnnData object containing only GEM expression data and corresponding metadata.", - "code": "flowsig_expr_key='X_gem'\nX_flow = adata.obsm[flowsig_expr_key]\nadata_subset = sc.AnnData(X=X_flow)\nadata_subset.obs = adata.obs\nadata_subset.var.index =[f'GEM-{i}' for i in range(1,len(adata_subset.var)+1)]" - }, - { - "action": "Visualize the expression of GEMs in different cell types using a dotplot with `sc.pl.dotplot`.", - "code": "import matplotlib.pyplot as plt\nax=sc.pl.dotplot(adata_subset, adata_subset.var.index, groupby='Ground_Truth', \n dendrogram=True,standard_scale='var',cmap='Reds',show=False)\ncolor_dict=dict(zip(adata.obs['Ground_Truth'].cat.categories,adata.uns['Ground_Truth_colors']))" - }, - { - "action": "Visualize the flowsig network using `ov.pl.plot_flowsig_network` with specified parameters for node shapes, curve arguments, and axis limits.", - "code": "ov.pl.plot_flowsig_network(flow_network=flow_network,\n gem_plot=['GEM-2','GEM-7','GEM-1','GEM-3','GEM-4','GEM-5'],\n figsize=(8,4),\n curve_awarg={'eps':2},\n node_shape={'GEM':'^','Sender':'o','Receptor':'o'},\n ylim=(-0.5,0.5),xlim=(-3,3))" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_cytotrace.json b/rag_engine/ovrawmjson/t_cytotrace.json deleted file mode 100644 index 69910109..00000000 --- a/rag_engine/ovrawmjson/t_cytotrace.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters.", - "code": "import omicverse as ov\nov.plot_set()" - }, - { - "action": "Import the scvelo library and load the dentategyrus dataset into an AnnData object.", - "code": "import scvelo as scv\nadata=scv.datasets.dentategyrus()\nadata" - }, - { - "action": "Preprocess the AnnData object using the `ov.pp.preprocess` function with specified parameters, including mode, number of highly variable genes (n_HVGs), and timing the execution.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Predict CytoTRACE 2 scores using the `ov.single.cytotrace2` function with specified parameters, including the path to the pre-trained model directory, species, batch sizes, parallelization settings, maximum number of principal components, random seed, and output directory.", - "code": "results = ov.single.cytotrace2(adata,\n use_model_dir=\"cymodels/5_models_weights\",\n species=\"mouse\",\n batch_size = 10000,\n smooth_batch_size = 1000,\n disable_parallelization = False,\n max_cores = None,\n max_pcs = 200,\n seed = 14,\n output_dir = 'cytotrace2_results'\n)" - }, - { - "action": "Visualize the UMAP embeddings of the AnnData object, colored by cell clusters and CytoTRACE2 scores, with specified parameters for frame, colormap, and whitespace.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['clusters','CytoTRACE2_Score'],\n frameon='small',cmap='Reds',wspace=0.55)" - }, - { - "action": "Visualize the UMAP embeddings of the AnnData object, colored by CytoTRACE2 potency and relative order, with specified parameters for frame, colormap, and whitespace.", - "code": "ov.utils.embedding(adata,basis='X_umap',\n color=['CytoTRACE2_Potency','CytoTRACE2_Relative'],\n frameon='small',cmap='Reds',wspace=0.55)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_deg.json b/rag_engine/ovrawmjson/t_deg.json deleted file mode 100644 index ac2f5dfd..00000000 --- a/rag_engine/ovrawmjson/t_deg.json +++ /dev/null @@ -1,82 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and matplotlib.pyplot. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport matplotlib.pyplot as plt\n\nov.plot_set()" - }, - { - "action": "Download gene ID annotation pair using `ov.utils.download_geneid_annotation_pair()`. This is necessary for converting gene IDs.", - "code": "ov.utils.download_geneid_annotation_pair()" - }, - { - "action": "Read the data from a file named 'counts.txt' (or from a URL, commented out). The data is assumed to be a tab-separated file with the first column as index and the second row as header. The `.bam` suffix is removed from column names.", - "code": "#data=pd.read_csv('https://raw.githubusercontent.com/Starlitnightly/omicverse/master/sample/counts.txt',index_col=0,sep='\\t',header=1)\ndata=ov.read('data/counts.txt',index_col=0,header=1)\n#replace the columns `.bam` to `` \ndata.columns=[i.split('/')[-1].replace('.bam','') for i in data.columns]\ndata.head()" - }, - { - "action": "Perform gene ID mapping on the data using the downloaded annotation pair file for 'GRCm39'.", - "code": "data=ov.bulk.Matrix_ID_mapping(data,'genesets/pair_GRCm39.tsv')\ndata.head()" - }, - { - "action": "Initialize a pyDEG object for differential expression analysis using the `omicverse` library.", - "code": "dds=ov.bulk.pyDEG(data)" - }, - { - "action": "Drop duplicate indices in the pyDEG object, keeping only the highest expressed genes.", - "code": "dds.drop_duplicates_index()\nprint('... drop_duplicates_index success')" - }, - { - "action": "Normalize the data using the `estimateSizeFactors` method from DEseq2, likely to remove batch effects.", - "code": "dds.normalize()\nprint('... estimateSizeFactors and normalize success')" - }, - { - "action": "Perform differential expression gene analysis using the t-test method. The treatment groups are '4-3' and '4-4', and the control groups are '1--1' and '1--2'.", - "code": "treatment_groups=['4-3','4-4']\ncontrol_groups=['1--1','1--2']\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')\nresult.head()" - }, - { - "action": "Filter out genes with low expression (log2(BaseMean) <= 1).", - "code": "print(result.shape)\nresult=result.loc[result['log2(BaseMean)']>1]\nprint(result.shape)" - }, - { - "action": "Set the threshold for fold change. The threshold is calculated automatically (-1) based on the log2FC distribution. The p-value threshold is set to 0.05, and the maximum log p-value is set to 6.", - "code": "# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=6)" - }, - { - "action": "Plot a volcano plot to visualize the results of the differential expression analysis. The plot includes the top 8 differentially expressed genes and sets the font size for gene labels to 12.", - "code": "dds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot a boxplot for the genes 'Ckap2' and 'Lef1' to visualize their expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2','Lef1'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Plot a boxplot for the gene 'Ckap2' to visualize its expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Download pathway database using `ov.utils.download_pathway_database()`.", - "code": "ov.utils.download_pathway_database()" - }, - { - "action": "Prepare a pathway dictionary from the 'WikiPathways_2019_Mouse.txt' file for mouse.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt',organism='Mouse')" - }, - { - "action": "Perform gene set enrichment analysis using the prepared pathway dictionary. The `pvalue_type` is set to 'auto' to automatically determine whether to use adjusted or raw p-values. The organism is set to 'mouse'.", - "code": "deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()\nenr=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot the gene set enrichment results using a custom plot function `geneset_plot`.", - "code": "ov.bulk.geneset_plot(enr,figsize=(2,5),fig_title='Wiki Pathway enrichment',\n cax_loc=[2, 0.45, 0.5, 0.02],\n bbox_to_anchor_used=(-0.25, -13),node_diameter=10,\n custom_ticks=[5,7],text_knock=3,\n cmap='Reds')" - }, - { - "action": "Prepare pathway dictionaries for GO Biological Process, GO Molecular Function, and GO Cellular Component for mouse and perform gene set enrichment analysis for each.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Mouse')\nenr_go_bp=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Molecular_Function_2023.txt',organism='Mouse')\nenr_go_mf=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Cellular_Component_2023.txt',organism='Mouse')\nenr_go_cc=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot multiple gene set enrichment results together using `geneset_plot_multi`.", - "code": "enr_dict={'BP':enr_go_bp,\n 'MF':enr_go_mf,\n 'CC':enr_go_cc}\ncolors_dict={\n 'BP':ov.pl.red_color[1],\n 'MF':ov.pl.green_color[1],\n 'CC':ov.pl.blue_color[1],\n}\n \nov.bulk.geneset_plot_multi(enr_dict,colors_dict,num=3,\n figsize=(2,5),\n text_knock=3,fontsize=8,\n cmap='Reds'\n )" - }, - { - "action": "Define a function `geneset_plot_multi` to plot multiple gene set enrichment results.", - "code": "def geneset_plot_multi(enr_dict,colors_dict,num:int=5,fontsize=10,\n fig_title:str='',fig_xlabel:str='Fractions of genes',\n figsize:tuple=(2,4),cmap:str='YlGnBu',\n text_knock:int=5,text_maxsize:int=20,ax=None,\n ):\n from PyComplexHeatmap import HeatmapAnnotation,DotClustermapPlotter,anno_label,anno_simple,AnnotationBase\n for key in enr_dict.keys():\n enr_dict[key]['Type']=key\n enr_all=pd.concat([enr_dict[i].iloc[:num] for i in enr_dict.keys()],axis=0)\n enr_all['Term']=[ov.utils.plot_text_set(i.split('(')[0],text_knock=text_knock,text_maxsize=text_maxsize) for i in enr_all.Term.tolist()]\n enr_all.index=enr_all.Term\n enr_all['Term1']=[i for i in enr_all.index.tolist()]\n del enr_all['Term']\n\n colors=colors_dict\n\n left_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Type, merge=True,rotation=0,colors=colors,relpos=(1,0.8)),\n Category=anno_simple(enr_all.Type,cmap='Set1',\n add_text=False,legend=False,colors=colors),\n axis=0,verbose=0,label_kws={'rotation':45,'horizontalalignment':'left','visible':False})\n right_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Term1, merge=True,rotation=0,relpos=(0,0.5),arrowprops=dict(visible=True),\n colors=enr_all.assign(color=enr_all.Type.map(colors)).set_index('Term1').color.to_dict(),\n fontsize=fontsize,luminance=0.8,height=2),\n axis=0,verbose=0,#label_kws={'rotation':45,'horizontalalignment':'left'},\n orientation='right')\n if ax==None:\n fig, ax = plt.subplots(figsize=figsize) \n else:\n ax=ax\n #plt.figure(figsize=figsize)\n cm = DotClustermapPlotter(data=enr_all, x='fraction',y='Term1',value='logp',c='logp',s='num',\n cmap=cmap,\n row_cluster=True,#col_cluster=True,#hue='Group',\n #cmap={'Group1':'Greens','Group2':'OrRd'},\n vmin=-1*np.log10(0.1),vmax=-1*np.log10(1e-10),\n #colors={'Group1':'yellowgreen','Group2':'orange'},\n #marker={'Group1':'*','Group2':'$\\ast$'},\n show_rownames=True,show_colnames=False,row_dendrogram=False,\n col_names_side='top',row_names_side='right',\n xticklabels_kws={'labelrotation': 30, 'labelcolor': 'blue','labelsize':fontsize},\n #yticklabels_kws={'labelsize':10},\n #top_annotation=col_ha,left_annotation=left_ha,right_annotation=right_ha,\n left_annotation=left_ha,right_annotation=right_ha,\n spines=False,\n row_split=enr_all.Type,# row_split_gap=1,\n #col_split=df_col.Group,col_split_gap=0.5,\n verbose=1,legend_gap=10,\n #dot_legend_marker='*',\n \n xlabel='Fractions of genes',xlabel_side=\"bottom\",\n xlabel_kws=dict(labelpad=8,fontweight='normal',fontsize=fontsize+2),\n # xlabel_bbox_kws=dict(facecolor=facecolor)\n )\n tesr=plt.gcf().axes\n for ax in plt.gcf().axes:\n if hasattr(ax, 'get_xlabel'):\n if ax.get_xlabel() == 'Fractions of genes': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.grid(False)\n if ax.get_ylabel() == 'logp': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.tick_params(labelsize=fontsize+2)\n cbar.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2)\n cbar.grid(False)\n return ax" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_deseq2.json b/rag_engine/ovrawmjson/t_deseq2.json deleted file mode 100644 index a180ed11..00000000 --- a/rag_engine/ovrawmjson/t_deseq2.json +++ /dev/null @@ -1,82 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and matplotlib.pyplot. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport matplotlib.pyplot as plt\n\nov.plot_set()" - }, - { - "action": "Download gene ID annotation pair using `ov.utils.download_geneid_annotation_pair()`. This is necessary for converting gene IDs.", - "code": "ov.utils.download_geneid_annotation_pair()" - }, - { - "action": "Read the data from a file named 'counts.txt' (or from a URL, commented out). The data is assumed to be a tab-separated file with the first column as index and the second row as header. The `.bam` suffix is removed from column names.", - "code": "#data=pd.read_csv('https://raw.githubusercontent.com/Starlitnightly/omicverse/master/sample/counts.txt',index_col=0,sep='\\t',header=1)\ndata=ov.read('data/counts.txt',index_col=0,header=1)\n#replace the columns `.bam` to `` \ndata.columns=[i.split('/')[-1].replace('.bam','') for i in data.columns]\ndata.head()" - }, - { - "action": "Perform gene ID mapping on the data using the downloaded annotation pair file for 'GRCm39'.", - "code": "data=ov.bulk.Matrix_ID_mapping(data,'genesets/pair_GRCm39.tsv')\ndata.head()" - }, - { - "action": "Initialize a pyDEG object for differential expression analysis using the `omicverse` library.", - "code": "dds=ov.bulk.pyDEG(data)" - }, - { - "action": "Drop duplicate indices in the pyDEG object, keeping only the highest expressed genes.", - "code": "dds.drop_duplicates_index()\nprint('... drop_duplicates_index success')" - }, - { - "action": "Normalize the data using the `estimateSizeFactors` method from DEseq2, likely to remove batch effects.", - "code": "dds.normalize()\nprint('... estimateSizeFactors and normalize success')" - }, - { - "action": "Perform differential expression gene analysis using the t-test method. The treatment groups are '4-3' and '4-4', and the control groups are '1--1' and '1--2'.", - "code": "treatment_groups=['4-3','4-4']\ncontrol_groups=['1--1','1--2']\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')\nresult.head()" - }, - { - "action": "Filter out genes with low expression (log2(BaseMean) <= 1).", - "code": "print(result.shape)\nresult=result.loc[result['log2(BaseMean)']>1]\nprint(result.shape)" - }, - { - "action": "Set the threshold for fold change. The threshold is calculated automatically (-1) based on the log2FC distribution. The p-value threshold is set to 0.05, and the maximum log p-value is set to 6.", - "code": "# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=6)" - }, - { - "action": "Plot a volcano plot to visualize the results of the differential expression analysis. The plot includes the top 8 differentially expressed genes and sets the font size for gene labels to 12.", - "code": "dds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot a boxplot for the genes 'Ckap2' and 'Lef1' to visualize their expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2','Lef1'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Plot a boxplot for the gene 'Ckap2' to visualize its expression levels in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Ckap2'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Download pathway database using `ov.utils.download_pathway_database()`.", - "code": "ov.utils.download_pathway_database()" - }, - { - "action": "Prepare a pathway dictionary from the 'WikiPathways_2019_Mouse.txt' file for mouse.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/WikiPathways_2019_Mouse.txt',organism='Mouse')" - }, - { - "action": "Perform gene set enrichment analysis using the prepared pathway dictionary. The `pvalue_type` is set to 'auto' to automatically determine whether to use adjusted or raw p-values. The organism is set to 'mouse'.", - "code": "deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()\nenr=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot the gene set enrichment results using a custom plot function `geneset_plot`.", - "code": "ov.bulk.geneset_plot(enr,figsize=(2,5),fig_title='Wiki Pathway enrichment',\n cax_loc=[2, 0.45, 0.5, 0.02],\n bbox_to_anchor_used=(-0.25, -13),node_diameter=10,\n custom_ticks=[5,7],text_knock=3,\n cmap='Reds')" - }, - { - "action": "Prepare pathway dictionaries for GO Biological Process, GO Molecular Function, and GO Cellular Component for mouse and perform gene set enrichment analysis for each.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/GO_Biological_Process_2023.txt',organism='Mouse')\nenr_go_bp=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Molecular_Function_2023.txt',organism='Mouse')\nenr_go_mf=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')\npathway_dict=ov.utils.geneset_prepare('genesets/GO_Cellular_Component_2023.txt',organism='Mouse')\nenr_go_cc=ov.bulk.geneset_enrichment(gene_list=deg_genes,\n pathways_dict=pathway_dict,\n pvalue_type='auto',\n organism='mouse')" - }, - { - "action": "Plot multiple gene set enrichment results together using `geneset_plot_multi`.", - "code": "enr_dict={'BP':enr_go_bp,\n 'MF':enr_go_mf,\n 'CC':enr_go_cc}\ncolors_dict={\n 'BP':ov.pl.red_color[1],\n 'MF':ov.pl.green_color[1],\n 'CC':ov.pl.blue_color[1],\n}\n \nov.bulk.geneset_plot_multi(enr_dict,colors_dict,num=3,\n figsize=(2,5),\n text_knock=3,fontsize=8,\n cmap='Reds'\n )" - }, - { - "action": "Define a function `geneset_plot_multi` to plot multiple gene set enrichment results. This function takes a dictionary of enrichment results and a dictionary of colors, and plots them in a combined dot plot. It allows customization of the number of top pathways to display, font size, figure title, x-axis label, figure size, colormap, text knock, and maximum text size. It uses the `PyComplexHeatmap` library to create the plot.", - "code": "def geneset_plot_multi(enr_dict,colors_dict,num:int=5,fontsize=10,\n fig_title:str='',fig_xlabel:str='Fractions of genes',\n figsize:tuple=(2,4),cmap:str='YlGnBu',\n text_knock:int=5,text_maxsize:int=20,ax=None,\n ):\n from PyComplexHeatmap import HeatmapAnnotation,DotClustermapPlotter,anno_label,anno_simple,AnnotationBase\n for key in enr_dict.keys():\n enr_dict[key]['Type']=key\n enr_all=pd.concat([enr_dict[i].iloc[:num] for i in enr_dict.keys()],axis=0)\n enr_all['Term']=[ov.utils.plot_text_set(i.split('(')[0],text_knock=text_knock,text_maxsize=text_maxsize) for i in enr_all.Term.tolist()]\n enr_all.index=enr_all.Term\n enr_all['Term1']=[i for i in enr_all.index.tolist()]\n del enr_all['Term']\n\n colors=colors_dict\n\n left_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Type, merge=True,rotation=0,colors=colors,relpos=(1,0.8)),\n Category=anno_simple(enr_all.Type,cmap='Set1',\n add_text=False,legend=False,colors=colors),\n axis=0,verbose=0,label_kws={'rotation':45,'horizontalalignment':'left','visible':False})\n right_ha = HeatmapAnnotation(\n label=anno_label(enr_all.Term1, merge=True,rotation=0,relpos=(0,0.5),arrowprops=dict(visible=True),\n colors=enr_all.assign(color=enr_all.Type.map(colors)).set_index('Term1').color.to_dict(),\n fontsize=fontsize,luminance=0.8,height=2),\n axis=0,verbose=0,#label_kws={'rotation':45,'horizontalalignment':'left'},\n orientation='right')\n if ax==None:\n fig, ax = plt.subplots(figsize=figsize) \n else:\n ax=ax\n #plt.figure(figsize=figsize)\n cm = DotClustermapPlotter(data=enr_all, x='fraction',y='Term1',value='logp',c='logp',s='num',\n cmap=cmap,\n row_cluster=True,#col_cluster=True,#hue='Group',\n #cmap={'Group1':'Greens','Group2':'OrRd'},\n vmin=-1*np.log10(0.1),vmax=-1*np.log10(1e-10),\n #colors={'Group1':'yellowgreen','Group2':'orange'},\n #marker={'Group1':'*','Group2':'$\\ast$'},\n show_rownames=True,show_colnames=False,row_dendrogram=False,\n col_names_side='top',row_names_side='right',\n xticklabels_kws={'labelrotation': 30, 'labelcolor': 'blue','labelsize':fontsize},\n #yticklabels_kws={'labelsize':10},\n #top_annotation=col_ha,left_annotation=left_ha,right_annotation=right_ha,\n left_annotation=left_ha,right_annotation=right_ha,\n spines=False,\n row_split=enr_all.Type,# row_split_gap=1,\n #col_split=df_col.Group,col_split_gap=0.5,\n verbose=1,legend_gap=10,\n #dot_legend_marker='*',\n \n xlabel='Fractions of genes',xlabel_side=\"bottom\",\n xlabel_kws=dict(labelpad=8,fontweight='normal',fontsize=fontsize+2),\n # xlabel_bbox_kws=dict(facecolor=facecolor)\n )\n tesr=plt.gcf().axes\n for ax in plt.gcf().axes:\n if hasattr(ax, 'get_xlabel'):\n if ax.get_xlabel() == 'Fractions of genes': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.grid(False)\n if ax.get_ylabel() == 'logp': # 假设 colorbar 有一个特定的标签\n cbar = ax\n cbar.tick_params(labelsize=fontsize+2)\n cbar.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2)\n cbar.grid(False)\n return ax" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_gptanno.json b/rag_engine/ovrawmjson/t_gptanno.json deleted file mode 100644 index 4b760066..00000000 --- a/rag_engine/ovrawmjson/t_gptanno.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nprint(f'omicverse version:{ov.__version__}')\nimport scanpy as sc\nprint(f'scanpy version:{sc.__version__}')\nov.ov_plot_set()" - }, - { - "action": "Create a directory named 'data', download the PBMC3K dataset from 10x Genomics, and unpack it. Then, create a directory named 'write' for storing processed data.", - "code": "# !mkdir data\n# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the count matrix from the 10x Genomics data into an AnnData object, using gene symbols for variable names and caching the data for faster reading.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading" - }, - { - "action": "Perform quality control on the AnnData object, filtering cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})" - }, - { - "action": "Preprocess the data by normalizing and identifying highly variable genes (HVGs) using the 'shiftlog|pearson' mode, selecting the top 2000 HVGs.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)" - }, - { - "action": "Store the raw data in `adata.raw` and filter the AnnData object to keep only the highly variable genes.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]" - }, - { - "action": "Scale the data in `adata.X`.", - "code": "ov.pp.scale(adata)" - }, - { - "action": "Perform Principal Component Analysis (PCA) on the scaled data, reducing the dimensionality to 50 principal components.", - "code": "ov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Construct a neighborhood graph using the top 50 principal components, considering 15 nearest neighbors.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph.", - "code": "sc.tl.leiden(adata)" - }, - { - "action": "Calculate a dendrogram for the Leiden clusters and identify marker genes for each cluster using the Wilcoxon rank-sum test.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='wilcoxon',use_raw=False,)" - }, - { - "action": "Perform dimensionality reduction for visualization using Minimum Distortion Embedding (MDE) based on the PCA results.", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])" - }, - { - "action": "Plot the MDE embedding, coloring cells by their Leiden cluster assignments, with the legend placed on the data points and a custom color palette.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'], \n legend_loc='on data', \n frameon='small',\n legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Manually define a dictionary of marker genes for two clusters and use `ov.single.gptcelltype` to annotate cell types using the Qwen model through its API, specifying 'PBMC' as the tissue and 'human' as the species.", - "code": "import os\nall_markers={'cluster1':['CD3D','CD3E'],\n 'cluster2':['MS4A1']}\n\nos.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='qwen-plus', provider='qwen',\n topgenenumber=5)\nresult" - }, - { - "action": "Automatically identify marker genes for each cluster in the AnnData object using `ov.single.get_celltype_marker`, considering genes with a fold change greater than 2 and selecting the top 5 genes.", - "code": "all_markers=ov.single.get_celltype_marker(adata,clustertype='leiden',rank=True,\n key='rank_genes_groups',\n foldchange=2,topgenenumber=5)\nall_markers" - }, - { - "action": "Use `ov.single.gptcelltype` to annotate cell types using the Qwen model through its API, specifying 'PBMC' as the tissue and 'human' as the species, based on automatically identified marker genes.", - "code": "import os\nos.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='qwen-plus', provider='qwen',\n topgenenumber=5)\nresult" - }, - { - "action": "Extract the cell type annotations from the `gptcelltype` output, removing extra information and keeping only the cell type names.", - "code": "new_result={}\nfor key in result.keys():\n new_result[key]=result[key].split(': ')[-1].split(' (')[0].split('. ')[1]\nnew_result" - }, - { - "action": "Map the extracted cell type annotations to the 'leiden' clusters in the AnnData object and store them in a new observation called 'gpt_celltype'.", - "code": "adata.obs['gpt_celltype'] = adata.obs['leiden'].map(new_result).astype('category')" - }, - { - "action": "Plot the MDE embedding, coloring cells by both their 'leiden' cluster assignments and the new 'gpt_celltype' annotations, with the legend placed on the data points and a custom color palette.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden','gpt_celltype'], \n legend_loc='on data', \n frameon='small',\n legend_fontoutline=2,\n palette=ov.utils.palette()[14:],\n )" - }, - { - "action": "Use `ov.single.gptcelltype` with the OpenAI API to annotate cell types for a given set of marker genes, specifying 'gpt-4o' as the model and 'openai' as the provider.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='gpt-4o', provider='openai',\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype` with the Qwen API to annotate cell types for a given set of marker genes, specifying 'qwen-plus' as the model and 'qwen' as the provider.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='qwen-plus', provider='qwen',\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype` with the Kimi API to annotate cell types for a given set of marker genes, specifying 'moonshot-v1-8k' as the model and 'kimi' as the provider.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='moonshot-v1-8k', provider='kimi',\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype` with a custom `base_url` to annotate cell types, demonstrating the flexibility to use other models that support the OpenAI API format.", - "code": "os.environ['AGI_API_KEY'] = 'sk-**' # Replace with your actual API key\nresult = ov.single.gptcelltype(all_markers, tissuename='PBMC', speciename='human',\n model='moonshot-v1-8k', base_url=\"https://api.moonshot.cn/v1\",\n topgenenumber=5)\nresult" - }, - { - "action": "Use `ov.single.gptcelltype_local` to annotate cell types using a local large language model (LLM), specifying the path to the local model.", - "code": "anno_model = 'path/to/your/local/LLM' # '~/models/Qwen2-7B-Instruct'\n\nresult = ov.single.gptcelltype_local(all_markers, tissuename='PBMC', speciename='human', \n model_name=anno_model, topgenenumber=5)\nresult" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_mapping.json b/rag_engine/ovrawmjson/t_mapping.json deleted file mode 100644 index 723497e6..00000000 --- a/rag_engine/ovrawmjson/t_mapping.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Read single-cell data from a file, then create and display a UMAP plot colored by 'Subset' to visualize the different subsets within the data.", - "code": "adata_sc=ov.read('data/sc.h5ad')\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(3,3))\nov.utils.embedding(\n adata_sc,\n basis=\"X_umap\",\n color=['Subset'],\n title='Subset',\n frameon='small',\n wspace=0.65,\n show=False,\n ax=ax\n)" - }, - { - "action": "Print the maximum value of the raw data, preprocess the single-cell data using shiftlog and Pearson residuals, select the top 3000 highly variable genes, normalize the data to a target sum of 1e4, and then print the maximum value of the normalized data.", - "code": "print(\"RAW\",adata_sc.X.max())\nadata_sc=ov.pp.preprocess(adata_sc,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4)\nadata_sc.raw = adata_sc\nadata_sc = adata_sc[:, adata_sc.var.highly_variable_features]\nprint(\"Normalize\",adata_sc.X.max())" - }, - { - "action": "Load spatial transcriptomics data from 10X Genomics for the 'V1_Human_Lymph_Node' sample, assign sample ID, and ensure unique variable names.", - "code": "adata = sc.datasets.visium_sge(sample_id=\"V1_Human_Lymph_Node\")\nadata.obs['sample'] = list(adata.uns['spatial'].keys())[0]\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics for the spatial data, filter out genes with total counts less than 100, compute spatially variable genes using the 'prost' method, select the top 3000 spatially variable genes, normalize the data, and create a copy for further analysis.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)\nadata.raw = adata\nadata = adata[:, adata.var.space_variable_features]\nadata_sp=adata.copy()\nadata_sp" - }, - { - "action": "Initialize the Tangram model with single-cell and spatial data, using 'Subset' as the clustering key.", - "code": "tg=ov.space.Tangram(adata_sc,adata_sp,clusters='Subset')" - }, - { - "action": "Train the Tangram model in 'clusters' mode for 500 epochs using a CUDA device.", - "code": "tg.train(mode=\"clusters\",num_epochs=500,device=\"cuda:0\")" - }, - { - "action": "Use the trained Tangram model to infer cell locations in spatial spots and store the result in `adata_plot`.", - "code": "adata_plot=tg.cell2location()\nadata_plot.obs.columns" - }, - { - "action": "Create a spatial plot showing the distribution of specified cell types using the 'magma' colormap.", - "code": "annotation_list=['B_Cycling', 'B_GC_LZ', 'T_CD4+_TfH_GC', 'FDC',\n 'B_naive', 'T_CD4+_naive', 'B_plasma', 'Endo']\n\nsc.pl.spatial(adata_plot, cmap='magma',\n color=annotation_list,\n ncols=4, size=1.3,\n img_key='hires'\n )" - }, - { - "action": "Create a dictionary mapping cell type categories to their corresponding colors from the single-cell data.", - "code": "color_dict=dict(zip(adata_sc.obs['Subset'].cat.categories,\n adata_sc.uns['Subset_colors']))" - }, - { - "action": "Create a spatial plot of the first 5 cell types from `annotation_list`, using specified colors and adjusting the color scale and circle size.", - "code": "import matplotlib as mpl\nclust_labels = annotation_list[:5]\nclust_col = ['' + str(i) for i in clust_labels]\n\nwith mpl.rc_context({'figure.figsize': (8, 8),'axes.grid': False}):\n fig = ov.pl.plot_spatial(\n adata=adata_plot,\n color=clust_col, labels=clust_labels,\n show_img=True,\n style='fast',\n max_color_quantile=0.992,\n circle_diameter=3,\n reorder_cmap = [1,2,3,4,6],\n colorbar_position='right',\n palette=color_dict\n )" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_metacells.json b/rag_engine/ovrawmjson/t_metacells.json deleted file mode 100644 index d9b7fa15..00000000 --- a/rag_engine/ovrawmjson/t_metacells.json +++ /dev/null @@ -1,94 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\n\nov.plot_set()" - }, - { - "action": "Load the pancreas dataset using `scv.datasets.pancreas()`.", - "code": "adata = scv.datasets.pancreas()\nadata" - }, - { - "action": "Perform quality control on the AnnData object `adata` using `ov.pp.qc()`, filtering cells based on mitochondrial percentage, number of UMIs, and number of detected genes.", - "code": "#quantity control\nadata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.20, 'nUMIs': 500, 'detected_genes': 250},\n mt_startswith='mt-')" - }, - { - "action": "Preprocess the AnnData object `adata` using `ov.pp.preprocess()`, normalizing and calculating highly variable genes (HVGs).", - "code": "#normalize and high variable genes (HVGs) calculated\nadata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)" - }, - { - "action": "Save the whole genes in `adata.raw` and filter out non-HVGs from `adata`.", - "code": "#save the whole genes and filter the non-HVGs\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]" - }, - { - "action": "Scale the `adata.X` using `ov.pp.scale()`.", - "code": "#scale the adata.X\nov.pp.scale(adata)" - }, - { - "action": "Perform dimensionality reduction using PCA on the scaled data with `ov.pp.pca()`, keeping the top 50 principal components.", - "code": "#Dimensionality Reduction\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Construct a metacellular object using `ov.single.MetaCell` with specified parameters.", - "code": "meta_obj=ov.single.MetaCell(adata,use_rep='scaled|original|X_pca',\n n_metacells=None,\n use_gpu='cuda:0')" - }, - { - "action": "Initialize archetypes for the metacellular object.", - "code": "get_ipython().run_cell_magic('time', '', 'meta_obj.initialize_archetypes()\\n')" - }, - { - "action": "Train the SEACells model with specified minimum and maximum iterations.", - "code": "get_ipython().run_cell_magic('time', '', 'meta_obj.train(min_iter=10, max_iter=50)\\n')" - }, - { - "action": "Save the trained model to a file.", - "code": "meta_obj.save('seacells/model.pkl')" - }, - { - "action": "Load the trained model from a file.", - "code": "meta_obj.load('seacells/model.pkl')" - }, - { - "action": "Predict metacells using the `predicted` method with 'soft' aggregation and summarize the 'lognorm' layer.", - "code": "ad=meta_obj.predicted(method='soft',celltype_label='clusters',\n summarize_layer='lognorm')" - }, - { - "action": "Compute cell type purity, separation, and compactness for benchmarking.", - "code": "SEACell_purity = meta_obj.compute_celltype_purity('clusters')\nseparation = meta_obj.separation(use_rep='scaled|original|X_pca',nth_nbr=1)\ncompactness = meta_obj.compactness(use_rep='scaled|original|X_pca')" - }, - { - "action": "Create box plots to visualize cell type purity, compactness, and separation using `seaborn` and `matplotlib`.", - "code": "import seaborn as sns\nimport matplotlib.pyplot as plt\nov.plot_set()\nfig, axes = plt.subplots(1,3,figsize=(4,4))\nsns.boxplot(data=SEACell_purity, y='clusters_purity',ax=axes[0],\n color=ov.utils.blue_color[3])\nsns.boxplot(data=compactness, y='compactness',ax=axes[1],\n color=ov.utils.blue_color[4])\nsns.boxplot(data=separation, y='separation',ax=axes[2],\n color=ov.utils.blue_color[4])\nplt.tight_layout()\nplt.suptitle('Evaluate of MetaCells',fontsize=13,y=1.05)\nfor ax in axes:\n ax.grid(False)\n ax.spines['top'].set_visible(False)\n ax.spines['right'].set_visible(False)\n ax.spines['bottom'].set_visible(True)\n ax.spines['left'].set_visible(True)" - }, - { - "action": "Plot UMAP embedding of metacells colored by cluster labels and overlay metacell centers.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\nov.pl.embedding(\n meta_obj.adata,\n basis=\"X_umap\",\n color=['clusters'],\n frameon='small',\n title=\"Meta cells\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n size=10,\n ax=ax,\n alpha=0.2,\n #legend_loc='', \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n #palette=ov.utils.blue_color[:],\n #legend_fontweight='normal'\n)\nov.single.plot_metacells(ax,meta_obj.adata,color='#CB3E35',\n )" - }, - { - "action": "Get the mean value of 'S_score' from the original `adata` and store it in the metacell AnnData object `ad`.", - "code": "ov.single.get_obs_value(ad,adata,groupby='S_score',\n type='mean')\nad.obs.head()" - }, - { - "action": "Identify highly variable genes in the metacell AnnData object `ad`.", - "code": "import scanpy as sc\nad.raw=ad.copy()\nsc.pp.highly_variable_genes(ad, n_top_genes=2000, inplace=True)\nad=ad[:,ad.var.highly_variable]" - }, - { - "action": "Scale the metacell data and perform PCA.", - "code": "ov.pp.scale(ad)\nov.pp.pca(ad,layer='scaled',n_pcs=30)" - }, - { - "action": "Compute nearest neighbors for the metacell data.", - "code": "ov.pp.neighbors(ad, n_neighbors=15, n_pcs=20,\n use_rep='scaled|original|X_pca')" - }, - { - "action": "Compute UMAP for the metacell data.", - "code": "ov.pp.umap(ad)" - }, - { - "action": "Set the 'celltype' observation to be categorical and reorder categories to match the original data. Also, set the color palette for 'celltype' to match the original data.", - "code": "ad.obs['celltype']=ad.obs['celltype'].astype('category')\nad.obs['celltype']=ad.obs['celltype'].cat.reorder_categories(adata.obs['clusters'].cat.categories)\nad.uns['celltype_colors']=adata.uns['clusters_colors']" - }, - { - "action": "Plot UMAP embedding of metacells colored by 'celltype' and 'S_score'.", - "code": "ov.pl.embedding(ad, basis='X_umap',\n color=[\"celltype\",\"S_score\"],\n frameon='small',cmap='RdBu_r',\n wspace=0.5)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_metatime.json b/rag_engine/ovrawmjson/t_metatime.json deleted file mode 100644 index abcae38b..00000000 --- a/rag_engine/ovrawmjson/t_metatime.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Import the scanpy library and read the 'TiME_adata_scvi.h5ad' file into an AnnData object named `adata`.", - "code": "import scanpy as sc\nadata=sc.read('TiME_adata_scvi.h5ad')\nadata" - }, - { - "action": "Calculate the neighborhood graph of the cells in `adata` using the 'X_scVI' representation.", - "code": "sc.pp.neighbors(adata, use_rep=\"X_scVI\")" - }, - { - "action": "Calculate the Minimum Distortion Embedding (MDE) of the 'X_scVI' representation and store it in `adata.obsm[\"X_mde\"]`.", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"X_scVI\"])" - }, - { - "action": "Plot the MDE embedding, colored by the 'patient' variable.", - "code": "sc.pl.embedding(\n adata,\n basis=\"X_mde\",\n color=[\"patient\"],\n frameon=False,\n ncols=1,\n)" - }, - { - "action": "Initialize a MetaTiME object with the AnnData object `adata` and mode set to 'table'.", - "code": "TiME_object=ov.single.MetaTiME(adata,mode='table')" - }, - { - "action": "Overcluster the cells in the `TiME_object` with a resolution of 8 and store the cluster labels in `adata.obs['overcluster']`.", - "code": "TiME_object.overcluster(resolution=8,clustercol = 'overcluster',)" - }, - { - "action": "Predict the cell types in the tumor microenvironment (TME) using `TiME_object.predictTiME()` and store the results in `adata.obs['MetaTiME']` and `adata.obs['Major_MetaTiME']`.", - "code": "TiME_object.predictTiME(save_obs_name='MetaTiME')" - }, - { - "action": "Plot the predicted cell types on the MDE embedding using `TiME_object.plot()`.", - "code": "fig,ax=TiME_object.plot(cluster_key='MetaTiME',basis='X_mde',dpi=80)" - }, - { - "action": "Plot the major cell types on the MDE embedding using `sc.pl.embedding()`.", - "code": "sc.pl.embedding(\n adata,\n basis=\"X_mde\",\n color=[\"Major_MetaTiME\"],\n frameon=False,\n ncols=1,\n)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_mofa.json b/rag_engine/ovrawmjson/t_mofa.json deleted file mode 100644 index a3166f4d..00000000 --- a/rag_engine/ovrawmjson/t_mofa.json +++ /dev/null @@ -1,78 +0,0 @@ -[ - { - "action": "Import the omicverse library and read scRNA-seq and scATAC-seq data from specified files.", - "code": "import omicverse as ov\nrna=ov.utils.read('data/sample/rna_p_n_raw.h5ad')\natac=ov.utils.read('data/sample/atac_p_n_raw.h5ad')" - }, - { - "action": "Display the loaded scRNA-seq and scATAC-seq data.", - "code": "rna,atac" - }, - { - "action": "Create a MOFA model using the omicverse library, incorporating both scRNA-seq and scATAC-seq data, and assigning names to each omics type.", - "code": "test_mofa=ov.single.pyMOFA(omics=[rna,atac],\n omics_name=['RNA','ATAC'])" - }, - { - "action": "Preprocess the MOFA model and run it, saving the output to a specified HDF5 file.", - "code": "test_mofa.mofa_preprocess()\ntest_mofa.mofa_run(outfile='models/brac_rna_atac.hdf5')" - }, - { - "action": "Import the omicverse library and set plotting parameters using `ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Read scRNA-seq data from a specified file.", - "code": "rna=ov.utils.read('data/sample/rna_test.h5ad')" - }, - { - "action": "Extract factor values from a pre-computed MOFA model (stored in an HDF5 file) and add them to the scRNA-seq AnnData object.", - "code": "rna=ov.single.factor_exact(rna,hdf5_path='data/sample/MOFA_POS.hdf5')\nrna" - }, - { - "action": "Calculate and display the correlation between factors and cell types in the scRNA-seq data.", - "code": "ov.single.factor_correlation(adata=rna,cluster='cell_type',factor_list=[1,2,3,4,5])" - }, - { - "action": "Retrieve and display the gene/feature weights for a specific factor and view from the MOFA model.", - "code": "ov.single.get_weights(hdf5_path='data/sample/MOFA_POS.hdf5',view='RNA',factor=1)" - }, - { - "action": "Initialize a MOFA visualization object using a pre-computed MOFA model from a specified HDF5 file.", - "code": "pymofa_obj=ov.single.pyMOFAART(model_path='data/sample/MOFA_POS.hdf5')" - }, - { - "action": "Extract the factor values for each cell in the scRNA-seq data using the MOFA visualization object.", - "code": "pymofa_obj.get_factors(rna)\nrna" - }, - { - "action": "Plot the variance explained (R-squared) for each factor in each view of the MOFA model.", - "code": "pymofa_obj.plot_r2()" - }, - { - "action": "Retrieve and display the R-squared values for each factor in each view.", - "code": "pymofa_obj.get_r2()" - }, - { - "action": "Plot the correlation between factors and cell types using the MOFA visualization object.", - "code": "pymofa_obj.plot_cor(rna,'cell_type')" - }, - { - "action": "Plot the values of two specified factors against each other, colored by a specific cell type ('Epi').", - "code": "pymofa_obj.plot_factor(rna,'cell_type','Epi',figsize=(3,3),\n factor1=6,factor2=10,)" - }, - { - "action": "Calculate and visualize UMAP embeddings of the scRNA-seq data, colored by 'factor6' and 'cell_type'.", - "code": "import scanpy as sc\nsc.pp.neighbors(rna)\nsc.tl.umap(rna)\nsc.pl.embedding(\n rna,\n basis=\"X_umap\",\n color=[\"factor6\",\"cell_type\"],\n frameon=False,\n ncols=2,\n #palette=ov.utils.pyomic_palette(),\n show=False,\n cmap='Greens',\n vmin=0,\n)\n#plt.savefig(\"figures/umap_factor6.png\",dpi=300,bbox_inches = 'tight')" - }, - { - "action": "Plot the weights of genes/features for two specified factors in a scatter plot, highlighting the top weighted genes.", - "code": "pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=6,factor2=10,)" - }, - { - "action": "Plot the weights of genes/features for a specific factor, ordered by weight and colored.", - "code": "pymofa_obj.plot_weights(view='RNA',factor=6,color='#5de25d',\n ascending=True)" - }, - { - "action": "Plot a heatmap showing the top weighted features for each factor in a specific view ('RNA').", - "code": "pymofa_obj.plot_top_feature_heatmap(view='RNA')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_mofa_glue.json b/rag_engine/ovrawmjson/t_mofa_glue.json deleted file mode 100644 index 53b79b02..00000000 --- a/rag_engine/ovrawmjson/t_mofa_glue.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Load RNA and ATAC data from 'h5ad' files using `ov.utils.read()`.", - "code": "rna=ov.utils.read(\"chen_rna-emb.h5ad\")\natac=ov.utils.read(\"chen_atac-emb.h5ad\")" - }, - { - "action": "Create a GLUE_pair object to pair cells between RNA and ATAC data based on the Pearson correlation coefficient of their feature vectors (X_glue).", - "code": "pair_obj=ov.single.GLUE_pair(rna,atac)\npair_obj.correlation()" - }, - { - "action": "Find the top 20 highly correlated cells in the other omics layer for each cell, with a default minimum correlation threshold of 0.9. Save the results to a CSV file.", - "code": "res_pair=pair_obj.find_neighbor_cell(depth=20)\nres_pair.to_csv('models/chen_pair_res.csv')" - }, - { - "action": "Filter the original RNA and ATAC datasets to keep only the paired cells identified in the previous step. Rename the index of the filtered datasets to match the paired cell indices.", - "code": "rna1=rna[res_pair['omic_1']]\natac1=atac[res_pair['omic_2']]\nrna1.obs.index=res_pair.index\natac1.obs.index=res_pair.index\nrna1,atac1" - }, - { - "action": "Create a MuData object to store the paired RNA and ATAC data.", - "code": "from mudata import MuData\n\nmdata = MuData({'rna': rna1, 'atac': atac1})\nmdata" - }, - { - "action": "Write the MuData object to a compressed 'h5mu' file.", - "code": "mdata.write(\"chen_mu.h5mu\",compression='gzip')" - }, - { - "action": "Filter the RNA and ATAC data to keep only highly variable genes.", - "code": "rna1=mdata['rna']\nrna1=rna1[:,rna1.var['highly_variable']==True]\natac1=mdata['atac']\natac1=atac1[:,atac1.var['highly_variable']==True]\nrna1.obs.index=res_pair.index\natac1.obs.index=res_pair.index" - }, - { - "action": "Randomly select 5000 cells from rna1 data", - "code": "import random\nrandom_obs_index=random.sample(list(rna1.obs.index),5000)" - }, - { - "action": "Calculate the adjusted rand index (ARI) between the cell types of the randomly selected cells and all cells in the paired RNA and ATAC data.", - "code": "from sklearn.metrics import adjusted_rand_score as ari\nari_random=ari(rna1[random_obs_index].obs['cell_type'], atac1[random_obs_index].obs['cell_type'])\nari_raw=ari(rna1.obs['cell_type'], atac1.obs['cell_type'])\nprint('raw ari:{}, random ari:{}'.format(ari_raw,ari_random))" - }, - { - "action": "Construct a MOFA model using the paired RNA and ATAC data.", - "code": "test_mofa=ov.single.pyMOFA(omics=[rna1,atac1],\n omics_name=['RNA','ATAC'])" - }, - { - "action": "Preprocess the data for MOFA and run the MOFA algorithm, saving the results to an HDF5 file.", - "code": "test_mofa.mofa_preprocess()\ntest_mofa.mofa_run(outfile='models/chen_rna_atac.hdf5')" - }, - { - "action": "Create a pyMOFAART object to analyze the MOFA results.", - "code": "pymofa_obj=ov.single.pyMOFAART(model_path='models/chen_rna_atac.hdf5')" - }, - { - "action": "Extract the learned factors from the MOFA model and add them to the RNA AnnData object.", - "code": "pymofa_obj.get_factors(rna1)\nrna1" - }, - { - "action": "Plot the variance explained (R^2) by each factor for each view.", - "code": "pymofa_obj.plot_r2()" - }, - { - "action": "Get the R^2 values for each factor and view.", - "code": "pymofa_obj.get_r2()" - }, - { - "action": "Plot the correlation between factors and a specified metadata column ('cell_type') in the RNA AnnData object.", - "code": "pymofa_obj.plot_cor(rna1,'cell_type',figsize=(4,6))" - }, - { - "action": "Get the correlation values between factors and the specified metadata column.", - "code": "pymofa_obj.get_cor(rna1,'cell_type')" - }, - { - "action": "Plot a scatter plot of two specified factors, colored by a specified metadata column and highlighting a specific cell type.", - "code": "pymofa_obj.plot_factor(rna1,'cell_type','Ast',figsize=(3,3),\n factor1=1,factor2=3,)" - }, - { - "action": "Calculate and store the Minimum Description Length (MDE) embedding of the data using the 'X_glue' representation.", - "code": "from scvi.model.utils import mde\nimport scanpy as sc\nsc.pp.neighbors(rna1, use_rep=\"X_glue\", metric=\"cosine\")\nrna1.obsm[\"X_mde\"] = mde(rna1.obsm[\"X_glue\"])" - }, - { - "action": "Plot the MDE embedding, colored by specified factors and cell type.", - "code": "sc.pl.embedding(\n rna1,\n basis=\"X_mde\",\n color=[\"factor1\",\"factor3\",\"cell_type\"],\n frameon=False,\n ncols=3,\n #palette=ov.utils.pyomic_palette(),\n show=False,\n cmap='Greens',\n vmin=0,\n)" - }, - { - "action": "Plot the weights of genes for two specified factors in a specified view.", - "code": "pymofa_obj.plot_weight_gene_d1(view='RNA',factor1=1,factor2=3,)" - }, - { - "action": "Plot the weights of genes for a specified factor in a specified view, sorted in ascending or descending order.", - "code": "pymofa_obj.plot_weights(view='RNA',factor=1,\n ascending=False)" - }, - { - "action": "Plot a heatmap of the top features for each factor in a specified view.", - "code": "pymofa_obj.plot_top_feature_heatmap(view='RNA')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_network.json b/rag_engine/ovrawmjson/t_network.json deleted file mode 100644 index a8626fad..00000000 --- a/rag_engine/ovrawmjson/t_network.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "action": "Import the omicverse library and set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nov.utils.ov_plot_set()" - }, - { - "action": "Define a list of genes to be analyzed, representing FAA4 and its ten most confident interactors in Saccharomyces cerevisiae.", - "code": "gene_list=['FAA4','POX1','FAT1','FAS2','FAS1','FAA1','OLE1','YJU3','TGL3','INA1','TGL5']" - }, - { - "action": "Create dictionaries to store gene type and color information for visualization. The top 5 genes are assigned 'Type1' and a specific color, while the rest are assigned 'Type2' and another color.", - "code": "gene_type_dict=dict(zip(gene_list,['Type1']*5+['Type2']*6))\ngene_color_dict=dict(zip(gene_list,['#F7828A']*5+['#9CCCA4']*6))" - }, - { - "action": "Perform STRING interaction analysis using `ov.bulk.string_interaction()`. This function retrieves protein-protein interaction data from the STRING database for the given gene list and species (4932 for Saccharomyces cerevisiae). The result is stored in the `G_res` variable.", - "code": "G_res=ov.bulk.string_interaction(gene_list,4932)\nG_res.head()" - }, - { - "action": "Initialize a `pyPPI` object from `omicverse.bulk` to handle protein-protein interaction network analysis. The object is configured with the gene list, gene type dictionary, gene color dictionary, and species ID.", - "code": "ppi=ov.bulk.pyPPI(gene=gene_list,\n gene_type_dict=gene_type_dict,\n gene_color_dict=gene_color_dict,\n species=4932)" - }, - { - "action": "Connect to the STRING database and calculate the protein-protein interactions using the `interaction_analysis()` method of the `pyPPI` object.", - "code": "ppi.interaction_analysis()" - }, - { - "action": "Plot the protein-protein interaction network using the `plot_network()` method of the `pyPPI` object. This function visualizes the network based on the calculated interactions and the provided gene type and color information.", - "code": "ppi.plot_network()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_nocd.json b/rag_engine/ovrawmjson/t_nocd.json deleted file mode 100644 index d73002ca..00000000 --- a/rag_engine/ovrawmjson/t_nocd.json +++ /dev/null @@ -1,38 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, anndata, scanpy, matplotlib.pyplot, numpy, and pandas. Also, enable inline plotting for matplotlib.", - "code": "import omicverse as ov\nimport anndata\nimport scanpy as sc\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')" - }, - { - "action": "Set scanpy settings for verbosity and figure parameters.", - "code": "sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\nsc.settings.set_figure_params(dpi=80, facecolor='white')" - }, - { - "action": "Define a custom colormap for visualizations.", - "code": "from matplotlib.colors import LinearSegmentedColormap\nsc_color=['#7CBB5F','#368650','#A499CC','#5E4D9A','#78C2ED','#866017', '#9F987F','#E0DFED',\n '#EF7B77', '#279AD7','#F0EEF0', '#1F577B', '#A56BA7', '#E0A7C8', '#E069A6', '#941456', '#FCBC10',\n '#EAEFC5', '#01A0A7', '#75C8CC', '#F0D7BC', '#D5B26C', '#D5DA48', '#B6B812', '#9DC3C3', '#A89C92', '#FEE00C', '#FEF2A1']\nsc_color_cmap = LinearSegmentedColormap.from_list('Custom', sc_color, len(sc_color))" - }, - { - "action": "Read the single-cell RNA sequencing data from an h5ad file.", - "code": "adata = anndata.read('sample/rna.h5ad')\nadata" - }, - { - "action": "Apply lazy preprocessing using omicverse's scanpy_lazy function.", - "code": "adata=ov.single.scanpy_lazy(adata)" - }, - { - "action": "Initialize, configure, and run the scNOCD model for overlapping community detection.", - "code": "scbrca=ov.single.scnocd(adata)\nscbrca.matrix_transform()\nscbrca.matrix_normalize()\nscbrca.GNN_configure()\nscbrca.GNN_preprocess()\nscbrca.GNN_model()\nscbrca.GNN_result()\nscbrca.GNN_plot()\n#scbrca.calculate_nocd()\nscbrca.cal_nocd()" - }, - { - "action": "Calculate the non-overlapping community detection (NOCD) results.", - "code": "scbrca.calculate_nocd()" - }, - { - "action": "Visualize the UMAP embeddings colored by Leiden clustering and NOCD results.", - "code": "sc.pl.umap(scbrca.adata, color=['leiden','nocd'],wspace=0.4,palette=sc_color)" - }, - { - "action": "Visualize the UMAP embeddings colored by Leiden clustering and the number of communities each cell belongs to (nocd_n).", - "code": "sc.pl.umap(scbrca.adata, color=['leiden','nocd_n'],wspace=0.4,palette=sc_color)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_preprocess.json b/rag_engine/ovrawmjson/t_preprocess.json deleted file mode 100644 index 684f6504..00000000 --- a/rag_engine/ovrawmjson/t_preprocess.json +++ /dev/null @@ -1,130 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.ov_plot_set()" - }, - { - "action": "Create directories for data storage and download the 10x Genomics PBMC3k dataset.", - "code": "# !mkdir data\n# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data into an AnnData object using `sc.read_10x_mtx()`.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading\nadata" - }, - { - "action": "Make variable and observation names unique.", - "code": "adata.var_names_make_unique()\nadata.obs_names_make_unique()" - }, - { - "action": "Perform quality control on the AnnData object using `ov.pp.qc()`, filtering cells based on mitochondrial gene percentage, number of UMIs, and number of detected genes.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})\nadata" - }, - { - "action": "Store the original counts in `adata.uns['layers_counts']` using `ov.utils.store_layers()`.", - "code": "ov.utils.store_layers(adata,layers='counts')\nadata" - }, - { - "action": "Preprocess the data using `ov.pp.preprocess()`, applying `shiftlog` normalization and Pearson residuals for highly variable gene detection.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\nadata" - }, - { - "action": "Set the `.raw` attribute of the AnnData object to the normalized and logarithmized raw gene expression.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Retrieve the original counts from `adata.uns['layers_counts']` and compare the maximum values of normalized and raw count data.", - "code": "adata_counts=adata.copy()\nov.utils.retrieve_layers(adata_counts,layers='counts')\nprint('normalize adata:',adata.X.max())\nprint('raw count adata:',adata_counts.X.max())" - }, - { - "action": "Display the AnnData object with raw counts.", - "code": "adata_counts" - }, - { - "action": "Retrieve the original count matrix at the whole gene level.", - "code": "adata_counts=adata.raw.to_adata().copy()\nov.utils.retrieve_layers(adata_counts,layers='counts')\nprint('normalize adata:',adata.X.max())\nprint('raw count adata:',adata_counts.X.max())\nadata_counts" - }, - { - "action": "Scale the data and store the results in a layer using `ov.pp.scale()`.", - "code": "ov.pp.scale(adata)\nadata" - }, - { - "action": "Perform principal component analysis (PCA) on the scaled data using `ov.pp.pca()`.", - "code": "ov.pp.pca(adata,layer='scaled',n_pcs=50)\nadata" - }, - { - "action": "Visualize the PCA embeddings using `ov.utils.embedding()`, coloring by the 'CST3' gene.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']\nov.utils.embedding(adata,\n basis='X_pca',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute the neighborhood graph using `sc.pp.neighbors()`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')" - }, - { - "action": "Calculate Minimum Distortion Embedding (MDE) using `ov.utils.mde()`.", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])\nadata" - }, - { - "action": "Visualize the MDE embeddings using `ov.utils.embedding()`, coloring by the 'CST3' gene.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute UMAP embeddings using `sc.tl.umap()`.", - "code": "sc.tl.umap(adata)" - }, - { - "action": "Visualize the UMAP embeddings using `ov.utils.embedding()`, coloring by the 'CST3' gene.", - "code": "ov.utils.embedding(adata,\n basis='X_umap',\n color='CST3',\n frameon='small')" - }, - { - "action": "Perform Leiden clustering using `sc.tl.leiden()`.", - "code": "sc.tl.leiden(adata)" - }, - { - "action": "Visualize the MDE embeddings using `ov.utils.embedding()`, coloring by 'leiden', 'CST3', and 'NKG7'.", - "code": "ov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden', 'CST3', 'NKG7'],\n frameon='small')" - }, - { - "action": "Visualize specific clusters using `ov.utils.plot_ConvexHull()`.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots( figsize = (4,4))\n\nov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.utils.plot_ConvexHull(adata,\n basis='X_mde',\n cluster_key='leiden',\n hull_cluster='0',\n ax=ax)" - }, - { - "action": "Generate and display labels for Leiden clusters using `ov.utils.gen_mpl_labels()` with custom styling.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.utils.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.utils.gen_mpl_labels(\n adata,\n 'leiden',\n exclude=(\"None\",), \n basis='X_mde',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize= 12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Define a list of marker genes.", - "code": "marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',\n 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',\n 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']" - }, - { - "action": "Create a dot plot of the marker genes using `sc.pl.dotplot()`.", - "code": "sc.pl.dotplot(adata, marker_genes, groupby='leiden',\n standard_scale='var');" - }, - { - "action": "Calculate a dendrogram and rank genes using t-test with `sc.tl.dendrogram()` and `sc.tl.rank_genes_groups()`, then visualize the results with `sc.pl.rank_genes_groups_dotplot()`.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='leiden_ttest')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_ttest',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Rank genes using t-test and COSG with `sc.tl.rank_genes_groups()` and `ov.single.cosg()`, then visualize the results with `sc.pl.rank_genes_groups_dotplot()`.", - "code": "sc.tl.rank_genes_groups(adata, groupby='leiden', \n method='t-test',use_rep='scaled|original|X_pca',)\nov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_cosg',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Prepare a dictionary of DataFrames for the stacked volcano plot, where each DataFrame contains gene names, log fold changes, and adjusted p-values for each Leiden cluster.", - "code": "data_dict={}\nfor i in adata.obs['leiden'].cat.categories:\n data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest',\n pval_cutoff=None,log2fc_min=None)" - }, - { - "action": "Print keys of the data_dict", - "code": "data_dict.keys()" - }, - { - "action": "Print the head of the DataFrame for a specific cluster", - "code": "data_dict[i].head()" - }, - { - "action": "Prepare a dictionary mapping Leiden cluster names to colors.", - "code": "type_color_dict=dict(zip(adata.obs['leiden'].cat.categories,\n adata.uns['leiden_colors']))\ntype_color_dict" - }, - { - "action": "Create a stacked volcano plot using `ov.utils.stacking_vol()` with specified parameters.", - "code": "fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict,\n pval_threshold=0.01,\n log2fc_threshold=2,\n figsize=(8,4),\n sig_color='#a51616',\n normal_color='#c7c7c7',\n plot_genes_num=2,\n plot_genes_fontsize=6,\n plot_genes_weight='bold',\n )\n\n#The following code will be removed in future\ny_min,y_max=0,0\nfor i in data_dict.keys():\n y_min=min(y_min,data_dict[i]['logfoldchanges'].min())\n y_max=max(y_max,data_dict[i]['logfoldchanges'].max())\nfor i in adata.obs['leiden'].cat.categories:\n axes[i].set_ylim(y_min,y_max)\nplt.suptitle('Stacking_vol',fontsize=12) " - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_preprocess_cpu.json b/rag_engine/ovrawmjson/t_preprocess_cpu.json deleted file mode 100644 index eae87fbd..00000000 --- a/rag_engine/ovrawmjson/t_preprocess_cpu.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy and omicverse. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Download and unpack the PBMC3k dataset from 10x Genomics.", - "code": "# !mkdir data\nget_ipython().system('wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz')\nget_ipython().system('cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz')\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data into an AnnData object.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading\nadata" - }, - { - "action": "Make variable and observation names unique.", - "code": "adata.var_names_make_unique()\nadata.obs_names_make_unique()" - }, - { - "action": "Perform quality control on the AnnData object.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.qc(adata,\\n tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},\\n doublets_method='sccomposite',\\n batch_key=None)\\nadata\\n\")" - }, - { - "action": "Preprocess the AnnData object, including normalization and highly variable gene detection.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Store the normalized and logarithmized raw gene expression in the .raw attribute of the AnnData object.", - "code": "get_ipython().run_cell_magic('time', '', 'adata.raw = adata\\nadata = adata[:, adata.var.highly_variable_features]\\nadata\\n')" - }, - { - "action": "Scale the data for principal component analysis.", - "code": "get_ipython().run_cell_magic('time', '', 'ov.pp.scale(adata)\\nadata\\n')" - }, - { - "action": "Perform principal component analysis (PCA) on the scaled data.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.pca(adata,layer='scaled',n_pcs=50)\\nadata\\n\")" - }, - { - "action": "Visualize the PCA embedding, coloring cells by the expression of the CST3 gene.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']\nov.pl.embedding(adata,\n basis='X_pca',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute the neighborhood graph of the cells.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\\n use_rep='scaled|original|X_pca')\\n\")" - }, - { - "action": "Embed the neighborhood graph using UMAP.", - "code": "get_ipython().run_cell_magic('time', '', 'ov.pp.umap(adata)\\n')" - }, - { - "action": "Visualize the UMAP embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color='CST3',\n frameon='small')" - }, - { - "action": "Calculate mde embeddings", - "code": "ov.pp.mde(adata,embedding_dim=2,n_neighbors=15, basis='X_mde',\n n_pcs=50, use_rep='scaled|original|X_pca',)" - }, - { - "action": "Visualize the mde embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color='CST3',\n frameon='small')" - }, - { - "action": "Score cell cycle genes in the AnnData object.", - "code": "adata_raw=adata.raw.to_adata()\nov.pp.score_genes_cell_cycle(adata_raw,species='human')" - }, - { - "action": "Visualize the mde embedding, coloring cells by cell cycle phase.", - "code": "ov.pl.embedding(adata_raw,\n basis='X_mde',\n color='phase',\n frameon='small')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph.", - "code": "ov.pp.leiden(adata,resolution=1)" - }, - { - "action": "Visualize the mde embedding, coloring cells by Leiden cluster, CST3 expression, and NKG7 expression.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden', 'CST3', 'NKG7'],\n frameon='small')" - }, - { - "action": "Visualize specific clusters using a convex hull.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots( figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.ConvexHull(adata,\n basis='X_mde',\n cluster_key='leiden',\n hull_cluster='0',\n ax=ax)" - }, - { - "action": "Generate labels for the mde embedding, improving text overlap.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.utils.gen_mpl_labels(\n adata,\n 'leiden',\n exclude=(\"None\",), \n basis='X_mde',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize= 12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Define a list of marker genes.", - "code": "marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',\n 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',\n 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']" - }, - { - "action": "Create a dot plot of the marker genes, grouped by Leiden cluster.", - "code": "sc.pl.dotplot(adata, marker_genes, groupby='leiden',\n standard_scale='var');" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using a t-test.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='leiden_ttest')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_ttest',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using the COSG method.", - "code": "sc.tl.rank_genes_groups(adata, groupby='leiden', \n method='t-test',use_rep='scaled|original|X_pca',)\nov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_cosg',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Prepare data for the Stacked Volcano Chart by creating a dictionary of DataFrames, each containing gene names, log fold changes, and adjusted p-values for a specific Leiden cluster.", - "code": "data_dict={}\nfor i in adata.obs['leiden'].cat.categories:\n data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest',\n pval_cutoff=None,log2fc_min=None)" - }, - { - "action": "Display the keys of data_dict", - "code": "data_dict.keys()" - }, - { - "action": "Show the head of the DataFrame for a specific cluster.", - "code": "data_dict[i].head()" - }, - { - "action": "Create a dictionary mapping Leiden cluster names to their corresponding colors.", - "code": "type_color_dict=dict(zip(adata.obs['leiden'].cat.categories,\n adata.uns['leiden_colors']))\ntype_color_dict" - }, - { - "action": "Generate and display a Stacked Volcano Chart.", - "code": "fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict,\n pval_threshold=0.01,\n log2fc_threshold=2,\n figsize=(8,4),\n sig_color='#a51616',\n normal_color='#c7c7c7',\n plot_genes_num=2,\n plot_genes_fontsize=6,\n plot_genes_weight='bold',\n )\n\n#The following code will be removed in future\ny_min,y_max=0,0\nfor i in data_dict.keys():\n y_min=min(y_min,data_dict[i]['logfoldchanges'].min())\n y_max=max(y_max,data_dict[i]['logfoldchanges'].max())\nfor i in adata.obs['leiden'].cat.categories:\n axes[i].set_ylim(y_min,y_max)\nplt.suptitle('Stacking_vol',fontsize=12) " - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_preprocess_gpu.json b/rag_engine/ovrawmjson/t_preprocess_gpu.json deleted file mode 100644 index f3940d6f..00000000 --- a/rag_engine/ovrawmjson/t_preprocess_gpu.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.plot_set()`. Initialize GPU settings using `ov.settings.gpu_init()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()\nov.settings.gpu_init()" - }, - { - "action": "Download and unpack the PBMC3k dataset from 10x Genomics.", - "code": "# !mkdir data\n#!wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz\n#!cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz\n# !mkdir write" - }, - { - "action": "Read the 10x Genomics data into an AnnData object.", - "code": "adata = sc.read_10x_mtx(\n 'data/filtered_gene_bc_matrices/hg19/', # the directory with the `.mtx` file\n var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)\n cache=True) # write a cache file for faster subsequent reading\nadata" - }, - { - "action": "Make variable and observation names unique.", - "code": "adata.var_names_make_unique()\nadata.obs_names_make_unique()" - }, - { - "action": "Convert the AnnData object to a GPU-compatible format.", - "code": "ov.pp.anndata_to_GPU(adata)" - }, - { - "action": "Perform quality control on the AnnData object.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.qc(adata,\\n tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},\\n batch_key=None)\\nadata\\n\")" - }, - { - "action": "Preprocess the AnnData object, including normalization and highly variable gene detection.", - "code": "get_ipython().run_cell_magic('time', '', \"adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\\nadata\\n\")" - }, - { - "action": "Set the .raw attribute of the AnnData object to the normalized and logarithmized raw gene expression.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Scale the data for principal component analysis.", - "code": "get_ipython().run_cell_magic('time', '', 'ov.pp.scale(adata)\\nadata\\n')" - }, - { - "action": "Perform principal component analysis (PCA) on the scaled data.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.pca(adata,layer='scaled',n_pcs=50)\\nadata\\n\")" - }, - { - "action": "Visualize the PCA embedding, coloring cells by the expression of the CST3 gene.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca']\nov.utils.embedding(adata,\n basis='X_pca',\n color='CST3',\n frameon='small')" - }, - { - "action": "Compute the neighborhood graph of the cells using cagra method.", - "code": "get_ipython().run_cell_magic('time', '', \"ov.pp.neighbors(adata, n_neighbors=15, n_pcs=50,\\n use_rep='scaled|original|X_pca',method='cagra')\\n\")" - }, - { - "action": "Calculate mde embeddings", - "code": "adata.obsm[\"X_mde\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])\nadata" - }, - { - "action": "Visualize the mde embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color='CST3',\n frameon='small')" - }, - { - "action": "Embed the neighborhood graph using UMAP.", - "code": "ov.pp.umap(adata)" - }, - { - "action": "Visualize the UMAP embedding, coloring cells by the expression of the CST3 gene.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color='CST3',\n frameon='small')" - }, - { - "action": "Perform Leiden clustering on the neighborhood graph.", - "code": "ov.pp.leiden(adata)" - }, - { - "action": "Convert the AnnData object back to a CPU-compatible format.", - "code": "ov.pp.anndata_to_CPU(adata)" - }, - { - "action": "Visualize the mde embedding, coloring cells by Leiden cluster, CST3 expression, and NKG7 expression.", - "code": "ov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden', 'CST3', 'NKG7'],\n frameon='small')" - }, - { - "action": "Visualize specific clusters using a convex hull.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots( figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.ConvexHull(adata,\n basis='X_mde',\n cluster_key='leiden',\n hull_cluster='0',\n ax=ax)" - }, - { - "action": "Generate labels for the mde embedding, improving text overlap.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.pl.embedding(adata,\n basis='X_mde',\n color=['leiden'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.utils.gen_mpl_labels(\n adata,\n 'leiden',\n exclude=(\"None\",), \n basis='X_mde',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize= 12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Define a list of marker genes.", - "code": "marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',\n 'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',\n 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']" - }, - { - "action": "Create a dot plot of the marker genes, grouped by Leiden cluster.", - "code": "sc.pl.dotplot(adata, marker_genes, groupby='leiden',\n standard_scale='var');" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using a t-test.", - "code": "sc.tl.dendrogram(adata,'leiden',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(adata, 'leiden', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='leiden_ttest')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_ttest',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Compute a ranking of differentially expressed genes for each Leiden cluster using the COSG method.", - "code": "sc.tl.rank_genes_groups(adata, groupby='leiden', \n method='t-test',use_rep='scaled|original|X_pca',)\nov.single.cosg(adata, key_added='leiden_cosg', groupby='leiden')\nsc.pl.rank_genes_groups_dotplot(adata,groupby='leiden',\n cmap='Spectral_r',key='leiden_cosg',\n standard_scale='var',n_genes=3)" - }, - { - "action": "Prepare data for the Stacked Volcano Chart by creating a dictionary of DataFrames, each containing gene names, log fold changes, and adjusted p-values for a specific Leiden cluster.", - "code": "data_dict={}\nfor i in adata.obs['leiden'].cat.categories:\n data_dict[i]=sc.get.rank_genes_groups_df(adata, group=i, key='leiden_ttest',\n pval_cutoff=None,log2fc_min=None)" - }, - { - "action": "Display the keys of data_dict", - "code": "data_dict.keys()" - }, - { - "action": "Show the head of the DataFrame for a specific cluster.", - "code": "data_dict[i].head()" - }, - { - "action": "Create a dictionary mapping Leiden cluster names to their corresponding colors.", - "code": "type_color_dict=dict(zip(adata.obs['leiden'].cat.categories,\n adata.uns['leiden_colors']))\ntype_color_dict" - }, - { - "action": "Generate and display a Stacked Volcano Chart.", - "code": "fig,axes=ov.utils.stacking_vol(data_dict,type_color_dict,\n pval_threshold=0.01,\n log2fc_threshold=2,\n figsize=(8,4),\n sig_color='#a51616',\n normal_color='#c7c7c7',\n plot_genes_num=2,\n plot_genes_fontsize=6,\n plot_genes_weight='bold',\n )\n\n#The following code will be removed in future\ny_min,y_max=0,0\nfor i in data_dict.keys():\n y_min=min(y_min,data_dict[i]['logfoldchanges'].min())\n y_max=max(y_max,data_dict[i]['logfoldchanges'].max())\nfor i in adata.obs['leiden'].cat.categories:\n axes[i].set_ylim(y_min,y_max)\nplt.suptitle('Stacking_vol',fontsize=12) " - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_scdeg.json b/rag_engine/ovrawmjson/t_scdeg.json deleted file mode 100644 index 5580f874..00000000 --- a/rag_engine/ovrawmjson/t_scdeg.json +++ /dev/null @@ -1,122 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and scvelo. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport scvelo as scv\n\nov.utils.ov_plot_set()" - }, - { - "action": "Load the pancreas dataset using `scv.datasets.pancreas()`.", - "code": "adata = scv.datasets.pancreas()\nadata" - }, - { - "action": "Check the maximum value in the `adata.X` matrix.", - "code": "adata.X.max()" - }, - { - "action": "Perform quality control, normalization, and calculate highly variable genes (HVGs). Save the whole genes and filter non-HVGs. Scale the `adata.X` matrix and perform dimensionality reduction using PCA.", - "code": "#quantity control\nadata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.05, 'nUMIs': 500, 'detected_genes': 250})\n#normalize and high variable genes (HVGs) calculated\nadata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\n\n#save the whole genes and filter the non-HVGs\nadata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\n\n#scale the adata.X\nov.pp.scale(adata)\n\n#Dimensionality Reduction\nov.pp.pca(adata,layer='scaled',n_pcs=50)" - }, - { - "action": "Check the maximum value in the scaled `adata.X` matrix.", - "code": "adata.X.max()" - }, - { - "action": "Select target cells ('Alpha' and 'Beta') for analysis, derive the expression matrix using `to_df()`, and build the differential expression analysis module using `pyDEG`.", - "code": "test_adata=adata[adata.obs['clusters'].isin(['Alpha','Beta'])]\ntest_adata\n\n\ndds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T)" - }, - { - "action": "Drop duplicate indices in the `dds` object.", - "code": "dds.drop_duplicates_index()\nprint('... drop_duplicates_index success')" - }, - { - "action": "Set up treatment and control groups based on cell types ('Alpha' and 'Beta') and perform differential expression analysis using the t-test method.", - "code": "treatment_groups=test_adata.obs[test_adata.obs['clusters']=='Alpha'].index.tolist()\ncontrol_groups=test_adata.obs[test_adata.obs['clusters']=='Beta'].index.tolist()\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')" - }, - { - "action": "Display the top differentially expressed genes sorted by q-value.", - "code": "result.sort_values('qvalue').head()" - }, - { - "action": "Set fold change threshold, p-value threshold, and maximum -log10(p-value) for visualization.", - "code": "# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=10)" - }, - { - "action": "Plot a volcano plot of the differential expression analysis results.", - "code": "dds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot box plots for specific genes ('Irx1' and 'Adra2a') in the treatment and control groups.", - "code": "dds.plot_boxplot(genes=['Irx1','Adra2a'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Visualize cell clusters and gene expression ('Irx1' and 'Adra2a') on a UMAP embedding.", - "code": "ov.utils.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=['clusters','Irx1','Adra2a'])" - }, - { - "action": "Create a MetaCell object using `ov.single.MetaCell` for constructing metacells.", - "code": "meta_obj=ov.single.MetaCell(adata,use_rep='scaled|original|X_pca',n_metacells=150,\n use_gpu=True)" - }, - { - "action": "Initialize archetypes for the MetaCell object.", - "code": "meta_obj.initialize_archetypes()" - }, - { - "action": "Train the MetaCell model.", - "code": "meta_obj.train(min_iter=10, max_iter=50)" - }, - { - "action": "Save the trained MetaCell model.", - "code": "meta_obj.save('seacells/model.pkl')" - }, - { - "action": "Load a saved MetaCell model.", - "code": "meta_obj.load('seacells/model.pkl')" - }, - { - "action": "Predict metacells using the trained model with the 'soft' method and summarize the 'lognorm' layer.", - "code": "ad=meta_obj.predicted(method='soft',celltype_label='clusters',\n summarize_layer='lognorm')" - }, - { - "action": "Check the minimum and maximum values of the predicted metacell matrix.", - "code": "ad.X.min(),ad.X.max()" - }, - { - "action": "Plot the metacells on the UMAP embedding of the original data.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\nov.utils.embedding(\n meta_obj.adata,\n basis=\"X_umap\",\n color=['clusters'],\n frameon='small',\n title=\"Meta cells\",\n #legend_loc='on data',\n legend_fontsize=14,\n legend_fontoutline=2,\n size=10,\n ax=ax,\n alpha=0.2,\n #legend_loc='', \n add_outline=False, \n #add_outline=True,\n outline_color='black',\n outline_width=1,\n show=False,\n #palette=ov.utils.blue_color[:],\n #legend_fontweight='normal'\n)\nov.single._metacell.plot_metacells(ax,meta_obj.adata,color='#CB3E35',\n )" - }, - { - "action": "Select metacells of types 'Alpha' and 'Beta' for differential expression analysis.", - "code": "test_adata=ad[ad.obs['celltype'].isin(['Alpha','Beta'])]\ntest_adata" - }, - { - "action": "Create a `pyDEG` object for differential expression analysis using metacell data.", - "code": "dds_meta=ov.bulk.pyDEG(test_adata.to_df().T)" - }, - { - "action": "Drop duplicate indices in the `dds_meta` object.", - "code": "dds_meta.drop_duplicates_index()\nprint('... drop_duplicates_index success')" - }, - { - "action": "Set up treatment and control groups based on metacell types ('Alpha' and 'Beta') and perform differential expression analysis using the t-test method.", - "code": "treatment_groups=test_adata.obs[test_adata.obs['celltype']=='Alpha'].index.tolist()\ncontrol_groups=test_adata.obs[test_adata.obs['celltype']=='Beta'].index.tolist()\nresult=dds_meta.deg_analysis(treatment_groups,control_groups,method='ttest')" - }, - { - "action": "Display the top differentially expressed genes in metacells sorted by q-value.", - "code": "result.sort_values('qvalue').head()" - }, - { - "action": "Set fold change threshold, p-value threshold, and maximum -log10(p-value) for visualization in metacell analysis.", - "code": "# -1 means automatically calculates\ndds_meta.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=10)" - }, - { - "action": "Plot a volcano plot of the differential expression analysis results for metacells.", - "code": "dds_meta.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Plot box plots for specific genes ('Ctxn2' and 'Mnx1') in the treatment and control metacell groups.", - "code": "dds_meta.plot_boxplot(genes=['Ctxn2','Mnx1'],treatment_groups=treatment_groups,\n control_groups=control_groups,figsize=(2,3),fontsize=12,\n legend_bbox=(2,0.55))" - }, - { - "action": "Visualize cell clusters and gene expression ('Ctxn2' and 'Mnx1') on a UMAP embedding for the original data.", - "code": "ov.utils.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=['clusters','Ctxn2','Mnx1'])" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_scdrug.json b/rag_engine/ovrawmjson/t_scdrug.json deleted file mode 100644 index c62be4e4..00000000 --- a/rag_engine/ovrawmjson/t_scdrug.json +++ /dev/null @@ -1,74 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, infercnvpy, and matplotlib. Set plotting parameters and verbosity level.", - "code": "import omicverse as ov\nimport scanpy as sc\nimport infercnvpy as cnv\nimport matplotlib.pyplot as plt\nimport os\n\nsc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)\nsc.settings.set_figure_params(dpi=80, facecolor='white')" - }, - { - "action": "Load the maynard2020_3k dataset using infercnvpy's datasets module.", - "code": "adata = cnv.datasets.maynard2020_3k()" - }, - { - "action": "Annotate gene coordinates using a GTF file. This step adds chromosomal location information to the AnnData object.", - "code": "ov.utils.get_gene_annotation(\n adata, gtf=\"gencode.v43.basic.annotation.gtf.gz\",\n gtf_by=\"gene_name\"\n)" - }, - { - "action": "Filter out genes with missing chromosome information and add chromosome, start, end, and ensg information to the var attribute of the AnnData object.", - "code": "adata=adata[:,~adata.var['chrom'].isnull()]\nadata.var['chromosome']=adata.var['chrom']\nadata.var['start']=adata.var['chromStart']\nadata.var['end']=adata.var['chromEnd']\nadata.var['ensg']=adata.var['gene_id']\nadata.var.loc[:, [\"ensg\", \"chromosome\", \"start\", \"end\"]].head()" - }, - { - "action": "Display the structure and content of the AnnData object, showing the number of cells, genes, and other associated data.", - "code": "adata" - }, - { - "action": "Infer copy number variations (CNVs) using infercnvpy. This step identifies potential tumor cells based on CNV profiles.", - "code": "# We provide all immune cell types as \"normal cells\".\ncnv.tl.infercnv(\n adata,\n reference_key=\"cell_type\",\n reference_cat=[\n \"B cell\",\n \"Macrophage\",\n \"Mast cell\",\n \"Monocyte\",\n \"NK cell\",\n \"Plasma cell\",\n \"T cell CD4\",\n \"T cell CD8\",\n \"T cell regulatory\",\n \"mDC\",\n \"pDC\",\n ],\n window_size=250,\n)\ncnv.tl.pca(adata)\ncnv.pp.neighbors(adata)\ncnv.tl.leiden(adata)\ncnv.tl.umap(adata)\ncnv.tl.cnv_score(adata)" - }, - { - "action": "Visualize the CNV score on a UMAP plot. This helps in identifying cells with high CNV scores, which are likely tumor cells.", - "code": "sc.pl.umap(adata, color=\"cnv_score\", show=False)" - }, - { - "action": "Annotate cells as 'normal' or 'tumor' based on their CNV score. A threshold of 0.03 is used to classify cells as tumor.", - "code": "adata.obs[\"cnv_status\"] = \"normal\"\nadata.obs.loc[\n adata.obs[\"cnv_score\"]>0.03, \"cnv_status\"\n] = \"tumor\"" - }, - { - "action": "Visualize the CNV status ('normal' or 'tumor') on a UMAP plot.", - "code": "sc.pl.umap(adata, color=\"cnv_status\", show=False)" - }, - { - "action": "Subset the AnnData object to include only tumor cells for further analysis.", - "code": "tumor=adata[adata.obs['cnv_status']=='tumor']\ntumor.X.max()" - }, - { - "action": "Preprocess the tumor AnnData object. This includes filtering cells and genes, identifying mitochondrial genes, calculating QC metrics, and identifying highly variable genes.", - "code": "adata=tumor\nprint('Preprocessing...')\nsc.pp.filter_cells(adata, min_genes=200)\nsc.pp.filter_genes(adata, min_cells=3)\nadata.var['mt'] = adata.var_names.str.startswith('MT-')\nsc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\nif not (adata.obs.pct_counts_mt == 0).all():\n adata = adata[adata.obs.pct_counts_mt < 30, :]\n\nadata.raw = adata.copy()\n\nsc.pp.highly_variable_genes(adata)\nadata = adata[:, adata.var.highly_variable]\nsc.pp.scale(adata)\nsc.tl.pca(adata, svd_solver='arpack')" - }, - { - "action": "Perform dimensionality reduction using PCA and UMAP, and compute nearest neighbors for clustering.", - "code": "sc.pp.neighbors(adata, n_pcs=20)\nsc.tl.umap(adata)" - }, - { - "action": "Download necessary data for drug response prediction, including the GDSC drug database and CaDRReS model.", - "code": "ov.utils.download_GDSC_data()\nov.utils.download_CaDRReS_model()" - }, - { - "action": "Apply single-cell data analysis to perform sub-clustering on the tumor clusters at an automatically determined resolution.", - "code": "adata, res,plot_df = ov.single.autoResolution(adata,cpus=4)" - }, - { - "action": "Save the AnnData object to an H5AD file.", - "code": "results_file = os.path.join('./', 'scanpyobj.h5ad')\nadata.write(results_file)" - }, - { - "action": "Reload the AnnData object from the H5AD file.", - "code": "results_file = os.path.join('./', 'scanpyobj.h5ad')\nadata=sc.read(results_file)" - }, - { - "action": "Clone the CaDRReS-Sc repository from GitHub. This repository contains the code for drug response prediction.", - "code": "get_ipython().system('git clone https://github.com/CSB5/CaDRReS-Sc')" - }, - { - "action": "Initialize and run the drug response prediction using the `ov.single.Drug_Response` function. This step predicts the IC50 values for each cell cluster.", - "code": "import ov\njob=ov.single.Drug_Response(adata,scriptpath='CaDRReS-Sc',\n modelpath='models/',\n output='result')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_scmulan.json b/rag_engine/ovrawmjson/t_scmulan.json deleted file mode 100644 index f8a94de0..00000000 --- a/rag_engine/ovrawmjson/t_scmulan.json +++ /dev/null @@ -1,82 +0,0 @@ -[ - { - "action": "Import necessary libraries and set plotting parameters.", - "code": "import os\nimport scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Load the liver dataset from an h5ad file.", - "code": "adata = sc.read('./data/liver_test.h5ad')" - }, - { - "action": "Display the AnnData object to inspect its structure.", - "code": "adata" - }, - { - "action": "Convert the sparse matrix format of `adata.X` to Compressed Sparse Column format for compatibility.", - "code": "from scipy.sparse import csc_matrix\nadata.X = csc_matrix(adata.X)" - }, - { - "action": "Transform the gene symbols in the input AnnData object to a uniform set of 42117 gene symbols, matching the pre-trained scMulan model.", - "code": "adata_GS_uniformed = ov.externel.scMulan.GeneSymbolUniform(input_adata=adata,\n output_dir=\"./data\",\n output_prefix='liver')" - }, - { - "action": "Load the uniformed AnnData object from the saved file.", - "code": "adata_GS_uniformed=sc.read_h5ad('./data/liver_uniformed.h5ad')" - }, - { - "action": "Display the uniformed AnnData object.", - "code": "adata_GS_uniformed" - }, - { - "action": "Normalize and log-transform the count matrix if the maximum value is greater than 10.", - "code": "if adata_GS_uniformed.X.max() > 10:\n sc.pp.normalize_total(adata_GS_uniformed, target_sum=1e4) \n sc.pp.log1p(adata_GS_uniformed)" - }, - { - "action": "Specify the path to the pre-trained scMulan model checkpoint.", - "code": "ckp_path = './ckpt/ckpt_scMulan.pt'" - }, - { - "action": "Initialize the scMulan model for inference and prepare it for CUDA processing.", - "code": "scml = ov.externel.scMulan.model_inference(ckp_path, adata_GS_uniformed)\nbase_process = scml.cuda_count()" - }, - { - "action": "Predict cell types and obtain cell embeddings using the scMulan model, with optional parallel processing.", - "code": "scml.get_cell_types_and_embds_for_adata(parallel=True, n_process = 1)" - }, - { - "action": "Copy the AnnData object with scMulan results for further analysis.", - "code": "adata_mulan = scml.adata.copy()" - }, - { - "action": "Scale the data, perform PCA, and then compute a 2-D embedding using pyMDE for visualization.", - "code": "ov.pp.scale(adata_mulan)\nov.pp.pca(adata_mulan)\nov.pp.mde(adata_mulan,embedding_dim=2,n_neighbors=15, basis='X_mde',\n n_pcs=10, use_rep='scaled|original|X_pca',)" - }, - { - "action": "Visualize the cell type annotations from scMulan using the computed 2-D embedding.", - "code": "ov.pl.embedding(adata_mulan,basis='X_mde',\n color=[\"cell_type_from_scMulan\",],\n ncols=1,frameon='small')" - }, - { - "action": "Copy the 'X_mde' embeddings to 'X_umap' for compatibility with other functions.", - "code": "adata_mulan.obsm['X_umap']=adata_mulan.obsm['X_mde']" - }, - { - "action": "Apply a smoothing function to filter false positives in the cell type predictions.", - "code": "ov.externel.scMulan.cell_type_smoothing(adata_mulan, threshold=0.1)" - }, - { - "action": "Visualize both the smoothed cell type predictions and the original annotations on the 2-D embedding.", - "code": "ov.pl.embedding(adata_mulan,basis='X_mde',\n color=[\"cell_type_from_mulan_smoothing\",\"cell_type\"],\n ncols=1,frameon='small')" - }, - { - "action": "Display the AnnData object with smoothed cell type annotations.", - "code": "adata_mulan" - }, - { - "action": "Get the top 20 most frequent cell types from scMulan's predictions.", - "code": "top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20]" - }, - { - "action": "Visualize selected cell types on the UMAP embedding, optionally with smoothing.", - "code": "selected_cell_types = top_celltypes\nov.externel.scMulan.visualize_selected_cell_types(adata_mulan,selected_cell_types,smoothing=True)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_simba.json b/rag_engine/ovrawmjson/t_simba.json deleted file mode 100644 index b13d90e5..00000000 --- a/rag_engine/ovrawmjson/t_simba.json +++ /dev/null @@ -1,50 +0,0 @@ -[ - { - "action": "Import necessary libraries and set up the working directory. `ov.utils.ov_plot_set()` sets default plotting parameters.", - "code": "import omicverse as ov\nfrom omicverse.utils import mde\nworkdir = 'result_human_pancreas'\nov.utils.ov_plot_set()" - }, - { - "action": "Installation instructions for SIMBA, a tool for single-cell data integration and batch correction.", - "code": "# We need to install simba at first\n# \n# ```\n# conda install -c bioconda simba\n# ```\n# \n# or\n# \n# ```\n# pip install git+https://github.com/huidongchen/simba\n# pip install git+https://github.com/pinellolab/simba_pbg\n# ```" - }, - { - "action": "Read the combined AnnData object from a file. This object contains three scRNA-seq human pancreas datasets.", - "code": "adata=ov.utils.read('simba_adata_raw.h5ad')" - }, - { - "action": "Initialize a pySIMBA object with the AnnData object and the working directory.", - "code": "simba_object=ov.single.pySIMBA(adata,workdir)" - }, - { - "action": "Preprocess the data using default parameters. This includes filtering cells, normalizing library sizes, selecting highly variable genes, and binning genes.", - "code": "simba_object.preprocess(batch_key='batch',min_n_cells=3,\n method='lib_size',n_top_genes=3000,n_bins=5)" - }, - { - "action": "Generate a graph for training. The graph represents cells and genes as nodes, with edges connecting them based on relationships in the data.", - "code": "simba_object.gen_graph()" - }, - { - "action": "Train the PyTorch BigGraph (PBG) model using the generated graph. The `num_workers` parameter specifies the number of CPU cores to use for training.", - "code": "simba_object.train(num_workers=6)" - }, - { - "action": "Load a pre-trained model from a specified directory.", - "code": "simba_object.load('result_human_pancreas/pbg/graph0')" - }, - { - "action": "Perform batch correction using the `batch_correction()` method. This aligns the datasets to reduce batch effects.", - "code": "adata=simba_object.batch_correction()\nadata" - }, - { - "action": "Visualize the batch-corrected data using Minimum Distortion Embedding (MDE) instead of UMAP.", - "code": "adata.obsm[\"X_mde\"] = mde(adata.obsm[\"X_simba\"])" - }, - { - "action": "Plot the MDE visualization, coloring cells by cell type and batch.", - "code": "sc.pl.embedding(adata,basis='X_mde',color=['cell_type1','batch'])" - }, - { - "action": "Visualize the batch-corrected data using UMAP.", - "code": "import scanpy as sc\nsc.pp.neighbors(adata, use_rep=\"X_simba\")\nsc.tl.umap(adata)\nsc.pl.umap(adata,color=['cell_type1','batch'])" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_single_batch.json b/rag_engine/ovrawmjson/t_single_batch.json deleted file mode 100644 index 230e4890..00000000 --- a/rag_engine/ovrawmjson/t_single_batch.json +++ /dev/null @@ -1,138 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Read individual datasets (s1d3, s2d1, s3d7) from H5AD files, assigning batch labels.", - "code": "adata1=ov.read('neurips2021_s1d3.h5ad')\nadata1.obs['batch']='s1d3'\nadata2=ov.read('neurips2021_s2d1.h5ad')\nadata2.obs['batch']='s2d1'\nadata3=ov.read('neurips2021_s3d7.h5ad')\nadata3.obs['batch']='s3d7'" - }, - { - "action": "Concatenate the three AnnData objects into a single object, merging common variables.", - "code": "adata=sc.concat([adata1,adata2,adata3],merge='same')\nadata" - }, - { - "action": "Display the unique batch labels present in the combined dataset.", - "code": "adata.obs['batch'].unique()" - }, - { - "action": "Convert the data type of the `.X` attribute (gene expression matrix) to `np.int64`.", - "code": "import numpy as np\nadata.X=adata.X.astype(np.int64)" - }, - { - "action": "Perform quality control (QC) on the AnnData object, filtering cells based on mitochondrial percentage, number of UMIs, and detected genes. Considers batch information during QC.", - "code": "adata=ov.pp.qc(adata,\n tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250},\n batch_key='batch')\nadata" - }, - { - "action": "Preprocess the data using shiftlog and pearson normalization, selecting the top 3000 highly variable genes (HVGs).", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',\n n_HVGs=3000,batch_key=None)\nadata" - }, - { - "action": "Store the raw counts in `adata.raw` and subset the data to include only highly variable genes.", - "code": "adata.raw = adata\nadata = adata[:, adata.var.highly_variable_features]\nadata" - }, - { - "action": "Save the preprocessed data to an H5AD file with gzip compression.", - "code": "adata.write_h5ad('neurips2021_batch_normlog.h5ad',compression='gzip')" - }, - { - "action": "Scale the data, perform Principal Component Analysis (PCA) on the scaled data, and compute Minimum Distortion Embedding (MDE) based on the PCA results.", - "code": "ov.pp.scale(adata)\nov.pp.pca(adata,layer='scaled',n_pcs=50,mask_var='highly_variable_features')\n\nadata.obsm[\"X_mde_pca\"] = ov.utils.mde(adata.obsm[\"scaled|original|X_pca\"])" - }, - { - "action": "Visualize the data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_pca',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using Harmony, specifying 'batch' as the batch key and using 50 principal components.", - "code": "adata_harmony=ov.single.batch_correction(adata,batch_key='batch',\n methods='harmony',n_pcs=50)\nadata" - }, - { - "action": "Compute MDE based on the Harmony-corrected data.", - "code": "adata.obsm[\"X_mde_harmony\"] = ov.utils.mde(adata.obsm[\"X_harmony\"])" - }, - { - "action": "Visualize the Harmony-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_harmony',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using Combat, specifying 'batch' as the batch key and using 50 principal components.", - "code": "adata_combat=ov.single.batch_correction(adata,batch_key='batch',\n methods='combat',n_pcs=50)\nadata" - }, - { - "action": "Compute MDE based on the Combat-corrected data.", - "code": "adata.obsm[\"X_mde_combat\"] = ov.utils.mde(adata.obsm[\"X_combat\"])" - }, - { - "action": "Visualize the Combat-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_combat',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using Scanorama, specifying 'batch' as the batch key and using 50 principal components.", - "code": "adata_scanorama=ov.single.batch_correction(adata,batch_key='batch',\n methods='scanorama',n_pcs=50)\nadata" - }, - { - "action": "Compute MDE based on the Scanorama-corrected data.", - "code": "adata.obsm[\"X_mde_scanorama\"] = ov.utils.mde(adata.obsm[\"X_scanorama\"])" - }, - { - "action": "Visualize the Scanorama-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_scanorama',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Perform batch correction using scVI, specifying 'batch' as the batch key, using 2 layers, 30 latent dimensions, and the negative binomial gene likelihood.", - "code": "adata_scvi=ov.single.batch_correction(adata,batch_key='batch',\n methods='scVI',n_layers=2, n_latent=30, gene_likelihood=\"nb\")\nadata" - }, - { - "action": "Compute MDE based on the scVI-corrected data.", - "code": "adata.obsm[\"X_mde_scVI\"] = ov.utils.mde(adata.obsm[\"X_scVI\"])" - }, - { - "action": "Visualize the scVI-corrected data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_scVI',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Initialize an LDA topic model object, specifying the feature type, highly variable key, layers, batch key, and learning rate.", - "code": "LDA_obj=ov.utils.LDA_topic(adata,feature_type='expression',\n highly_variable_key='highly_variable_features',\n layers='counts',batch_key='batch',learning_rate=1e-3)" - }, - { - "action": "Plot the topic contributions for topic 6.", - "code": "LDA_obj.plot_topic_contributions(6)" - }, - { - "action": "Predict topic compositions for 15 topics.", - "code": "LDA_obj.predicted(15)" - }, - { - "action": "Compute MDE based on the topic compositions and feature embeddings from the MIRA model.", - "code": "adata.obsm[\"X_mde_mira_topic\"] = ov.utils.mde(adata.obsm[\"X_topic_compositions\"])\nadata.obsm[\"X_mde_mira_feature\"] = ov.utils.mde(adata.obsm[\"X_umap_features\"])" - }, - { - "action": "Visualize the MIRA topic-based data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_mira_topic',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Visualize the MIRA feature-based data using MDE embeddings, coloring by batch and cell type.", - "code": "ov.utils.embedding(adata,\n basis='X_mde_mira_feature',frameon='small',\n color=['batch','cell_type'],show=False)" - }, - { - "action": "Save the AnnData object with all batch correction results to an H5AD file with gzip compression.", - "code": "adata.write_h5ad('neurips2021_batch_all.h5ad',compression='gzip')" - }, - { - "action": "Reload the saved AnnData object from the H5AD file.", - "code": "adata=sc.read('neurips2021_batch_all.h5ad')" - }, - { - "action": "Copy specific embeddings to the `.obsm` attribute for benchmarking.", - "code": "adata.obsm['X_pca']=adata.obsm['scaled|original|X_pca'].copy()\nadata.obsm['X_mira_topic']=adata.obsm['X_topic_compositions'].copy()\nadata.obsm['X_mira_feature']=adata.obsm['X_umap_features'].copy()" - }, - { - "action": "Initialize and run a Benchmarker object from the `scib_metrics` package to evaluate the performance of different batch correction methods.", - "code": "from scib_metrics.benchmark import Benchmarker\nbm = Benchmarker(\n adata,\n batch_key=\"batch\",\n label_key=\"cell_type\",\n embedding_obsm_keys=[\"X_pca\", \"X_combat\", \"X_harmony\",\n 'X_scanorama','X_mira_topic','X_mira_feature','X_scVI'],\n n_jobs=8,\n)\nbm.benchmark()" - }, - { - "action": "Plot the benchmarking results as a table.", - "code": "bm.plot_results_table(min_max_scale=False)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_slat.json b/rag_engine/ovrawmjson/t_slat.json deleted file mode 100644 index a2dba742..00000000 --- a/rag_engine/ovrawmjson/t_slat.json +++ /dev/null @@ -1,130 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, os, scanpy, numpy, pandas, and torch. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport os\n\nimport scanpy as sc\nimport numpy as np\nimport pandas as pd\nimport torch\nov.plot_set()" - }, - { - "action": "Import specific functions and classes from the `omicverse.externel.scSLAT` module. These include functions for data loading, spatial network calculation, SLAT execution, scanpy workflow, spatial matching, visualization tools (e.g., 3D matching, histograms, Sankey diagrams), and region statistics.", - "code": "#import scSLAT\nfrom omicverse.externel.scSLAT.model import load_anndatas, Cal_Spatial_Net, run_SLAT, scanpy_workflow, spatial_match\nfrom omicverse.externel.scSLAT.viz import match_3D_multi, hist, Sankey, match_3D_celltype, Sankey,Sankey_multi,build_3D\nfrom omicverse.externel.scSLAT.metrics import region_statistics" - }, - { - "action": "Load two AnnData objects from H5AD files. `adata1` represents the E11.5 mouse embryo dataset, and `adata2` represents the E12.5 mouse embryo dataset.", - "code": "adata1 = sc.read_h5ad('data/E115_Stereo.h5ad')\nadata2 = sc.read_h5ad('data/E125_Stereo.h5ad')" - }, - { - "action": "Add a 'week' column to the observation metadata (`.obs`) of each AnnData object, indicating the developmental stage (E11.5 or E12.5).", - "code": "adata1.obs['week']='E11.5'\nadata2.obs['week']='E12.5'" - }, - { - "action": "Generate spatial plots for `adata1` and `adata2`, coloring the spots by the 'annotation' variable and setting the spot size to 3.", - "code": "sc.pl.spatial(adata1, color='annotation', spot_size=3)\nsc.pl.spatial(adata2, color='annotation', spot_size=3)" - }, - { - "action": "Calculate spatial networks for `adata1` and `adata2` using the KNN model with a k_cutoff of 20. Load the AnnData objects into a format suitable for SLAT, using 'DPCA' as the feature representation and ensuring the order of features is not checked.", - "code": "Cal_Spatial_Net(adata1, k_cutoff=20, model='KNN')\nCal_Spatial_Net(adata2, k_cutoff=20, model='KNN')\nedges, features = load_anndatas([adata1, adata2], feature='DPCA', check_order=False)" - }, - { - "action": "Run the SLAT algorithm with the specified features and edges. The `LGCN_layer` parameter is set to 5. The function returns embeddings for each dataset (`embd0`, `embd1`) and the computation time.", - "code": "embd0, embd1, time = run_SLAT(features, edges, LGCN_layer=5)" - }, - { - "action": "Perform spatial matching between the embeddings `embd0` and `embd1`. The `reorder` parameter is set to False, and the original AnnData objects are provided. The function returns the best match indices, the index array, and the distances between matched points.", - "code": "best, index, distance = spatial_match([embd0, embd1], reorder=False, adatas=[adata1,adata2])" - }, - { - "action": "Create a matching array from the best match indices. Calculate region statistics for the best matches, starting from 0.5 with 10 intervals.", - "code": "matching = np.array([range(index.shape[0]), best])\nbest_match = distance[:,0]\nregion_statistics(best_match, start=0.5, number_of_interval=10)" - }, - { - "action": "Import the `matplotlib.pyplot` module. Build a 3D model from `adata1` and `adata2` using the provided matching list. The model is subsampled to 300 points. The `draw_3D` function visualizes the model with specified parameters.", - "code": "import matplotlib.pyplot as plt\nmatching_list=[matching]\nmodel = build_3D([adata1,adata2], matching_list,subsample_size=300, )\nax=model.draw_3D(hide_axis=True, line_color='#c2c2c2', height=1, size=[6,6], line_width=1)" - }, - { - "action": "Add a 'low_quality_index' column to `adata2.obs`, representing the quality of the alignment. Convert the column to float type.", - "code": "adata2.obs['low_quality_index']= best_match\nadata2.obs['low_quality_index'] = adata2.obs['low_quality_index'].astype(float)" - }, - { - "action": "Display the spatial coordinates stored in `adata2.obsm['spatial']`.", - "code": "adata2.obsm['spatial']" - }, - { - "action": "Generate a spatial plot for `adata2`, coloring the spots by the 'low_quality_index' variable, setting the spot size to 3, and adding the title 'Quality'.", - "code": "sc.pl.spatial(adata2, color='low_quality_index', spot_size=3, title='Quality')" - }, - { - "action": "Generate a Sankey diagram to visualize the correspondence between cell types in `adata1` and `adata2`. The diagram is customized with various parameters, including node and link opacity, layout, font size, and color. The `return_fig` parameter is set to True to return the figure object.", - "code": "fig=Sankey_multi(adata_li=[adata1,adata2],\n prefix_li=['E11.5','E12.5'],\n matching_li=[matching],\n clusters='annotation',filter_num=10,\n node_opacity = 0.8,\n link_opacity = 0.2,\n layout=[800,500],\n font_size=12,\n font_color='Black',\n save_name=None,\n format='png',\n width=1200,\n height=1000,\n return_fig=True)\nfig.show()" - }, - { - "action": "Save the generated Sankey diagram as an HTML file named \"slat_sankey.html\".", - "code": "fig.write_html(\"slat_sankey.html\")" - }, - { - "action": "Create DataFrames (`adata1_df`, `adata2_df`) from the AnnData objects, including spatial coordinates, cell type annotations, and corresponding colors. The colors are mapped from the `.uns` attribute of each AnnData object.", - "code": "color_dict1=dict(zip(adata1.obs['annotation'].cat.categories,\n adata1.uns['annotation_colors'].tolist()))\nadata1_df = pd.DataFrame({'index':range(embd0.shape[0]),\n 'x': adata1.obsm['spatial'][:,0],\n 'y': adata1.obsm['spatial'][:,1],\n 'celltype':adata1.obs['annotation'],\n 'color':adata1.obs['annotation'].map(color_dict1)\n }\n )\ncolor_dict2=dict(zip(adata2.obs['annotation'].cat.categories,\n adata2.uns['annotation_colors'].tolist()))\nadata2_df = pd.DataFrame({'index':range(embd1.shape[0]),\n 'x': adata2.obsm['spatial'][:,0],\n 'y': adata2.obsm['spatial'][:,1],\n 'celltype':adata2.obs['annotation'],\n 'color':adata2.obs['annotation'].map(color_dict2)\n }\n )" - }, - { - "action": "Use the `match_3D_celltype` function to visualize the alignment of specific cell types ('Urogenital ridge', 'Kidney', and 'Ovary') between `adata1` and `adata2`. The visualization is customized with parameters for subsampling, highlighting, and coordinate scaling. The `draw_3D` function then displays the 3D alignment.", - "code": "kidney_align = match_3D_celltype(adata1_df, adata2_df, matching, meta='celltype', \n highlight_celltype = [['Urogenital ridge'],['Kidney','Ovary']],\n subsample_size=10000, highlight_line = ['blue'], scale_coordinate = True )\nkidney_align.draw_3D(size= [6, 6], line_width =0.8, point_size=[0.6,0.6], hide_axis=True)" - }, - { - "action": "Define a function `cal_matching_cell` to find the cells in `target_adata` that are matched to a specific `query_cell` type in `query_adata` based on the provided `matching` information. The function returns a subset of `target_adata` containing the matched cells.", - "code": "def cal_matching_cell(target_adata,query_adata,matching,query_cell,clusters='annotation',):\n adata1_df = pd.DataFrame({'index':range(target_adata.shape[0]),\n 'x': target_adata.obsm['spatial'][:,0],\n 'y': target_adata.obsm['spatial'][:,1],\n 'celltype':target_adata.obs[clusters]})\n adata2_df = pd.DataFrame({'index':range(query_adata.shape[0]),\n 'x': query_adata.obsm['spatial'][:,0],\n 'y': query_adata.obsm['spatial'][:,1],\n 'celltype':query_adata.obs[clusters]})\n query_adata = target_adata[matching[1,adata2_df.loc[adata2_df.celltype==query_cell,'index'].values],:]\n #adata2_df['target_celltype'] = adata1_df.iloc[matching[1,:],:]['celltype'].to_list()\n #adata2_df['target_obs_names'] = adata1_df.iloc[matching[1,:],:].index.to_list()\n \n #query_obs=adata2_df.loc[adata2_df['celltype']==query_cell,'target_obs_names'].tolist()\n return query_adata" - }, - { - "action": "Call the `cal_matching_cell` function to find the cells in `adata1` that match the 'Kidney' cells in `adata2`. The result is stored in `query_adata`.", - "code": "query_adata=cal_matching_cell(target_adata=adata1,\n query_adata=adata2,\n matching=matching,\n query_cell='Kidney',clusters='annotation')\nquery_adata" - }, - { - "action": "Add a 'kidney_anno' column to `adata1.obs` and assign the 'annotation' values from `query_adata` to the corresponding cells in `adata1`.", - "code": "adata1.obs['kidney_anno']=''\nadata1.obs.loc[query_adata.obs.index,'kidney_anno']=query_adata.obs['annotation']" - }, - { - "action": "Generate a spatial plot for `adata1`, coloring the spots by the 'kidney_anno' variable. A custom palette is used to highlight specific annotations.", - "code": "sc.pl.spatial(adata1, color='kidney_anno', spot_size=3,\n palette=['#F5F5F5','#ff7f0e', 'green',])" - }, - { - "action": "Concatenate `query_adata` and the 'Kidney' cells from `adata2` into a new AnnData object `kidney_lineage_ad`. Preprocess the combined data using `ov.pp.preprocess`, selecting the top 3000 highly variable genes and normalizing the data. Store the raw data in `.raw`, select highly variable genes, scale the data, perform PCA, compute a nearest neighbor graph, cluster the data using Leiden clustering, and compute UMAP embeddings.", - "code": "kidney_lineage_ad=sc.concat([query_adata,adata2[adata2.obs['annotation']=='Kidney']],merge='same')\nkidney_lineage_ad=ov.pp.preprocess(kidney_lineage_ad,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4)\nkidney_lineage_ad.raw = kidney_lineage_ad\nkidney_lineage_ad = kidney_lineage_ad[:, kidney_lineage_ad.var.highly_variable_features]\nov.pp.scale(kidney_lineage_ad)\nov.pp.pca(kidney_lineage_ad)\nov.pp.neighbors(kidney_lineage_ad,use_rep='scaled|original|X_pca',metric=\"cosine\")\nov.utils.cluster(kidney_lineage_ad,method='leiden',resolution=1)\nov.pp.umap(kidney_lineage_ad)" - }, - { - "action": "Generate UMAP plots for `kidney_lineage_ad`, coloring the cells by 'annotation', 'week', and 'leiden' clustering.", - "code": "ov.pl.embedding(kidney_lineage_ad,basis='X_umap',\n color=['annotation','week','leiden'],\n frameon='small')" - }, - { - "action": "Generate a dot plot showing the expression of specific genes associated with nephron progenitors and metanephric/kidney development in the 'leiden' clusters of `kidney_lineage_ad`. The dot plot is customized with a color bar title and without a dendrogram.", - "code": "sc.pl.dotplot(kidney_lineage_ad,{'nephron progenitors':['Wnt9b','Osr1','Nphs1','Lhx1','Pax2','Pax8'],\n 'metanephric':['Eya1','Shisa3','Foxc1'], \n 'kidney':['Wt1','Wnt4','Nr2f2','Dach1','Cd44']} ,\n 'leiden',dendrogram=False,colorbar_title='Expression')" - }, - { - "action": "Re-annotate the 'leiden' clusters in `kidney_lineage_ad.obs` based on their developmental stage and cluster identity. Clusters 4, 2, 3, 1, and 5 are labeled as 'Nephron progenitors (E11.5)' and 'Metanephron progenitors (E11.5)', respectively. Cluster 0 is labeled as 'Kidney (E12.5)'.", - "code": "kidney_lineage_ad.obs['re_anno'] = 'Unknown'\nkidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden.isin(['4']),'re_anno'] = 'Nephron progenitors (E11.5)'\nkidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden.isin(['2','3','1','5']),'re_anno'] = 'Metanephron progenitors (E11.5)'\nkidney_lineage_ad.obs.loc[kidney_lineage_ad.obs.leiden=='0','re_anno'] = 'Kidney (E12.5)'" - }, - { - "action": "Generate UMAP plots for `kidney_lineage_ad`, coloring the cells by 'annotation' and the newly assigned 're_anno' labels.", - "code": "kidney_lineage_ad.obs.leiden = list(kidney_lineage_ad.obs.leiden)\nov.pl.embedding(kidney_lineage_ad,basis='X_umap',\n color=['annotation','re_anno'],\n frameon='small')" - }, - { - "action": "Assign the 're_anno' labels from `kidney_lineage_ad` (specifically the E11.5 cells) to the corresponding cells in `adata1.obs['kidney_anno']`.", - "code": "adata1.obs['kidney_anno']=''\nadata1.obs.loc[kidney_lineage_ad[kidney_lineage_ad.obs['week']=='E11.5'].obs.index,'kidney_anno']=kidney_lineage_ad[kidney_lineage_ad.obs['week']=='E11.5'].obs['re_anno']" - }, - { - "action": "Generate a spatial plot for `adata1`, coloring the spots by the 'kidney_anno' variable. A custom palette is used to highlight specific annotations, and the plot is displayed with a specified figure size.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(1, 1, figsize=(8, 8))\nsc.pl.spatial(adata1, color='kidney_anno', spot_size=1.5,\n palette=['#F5F5F5','#ff7f0e', 'green',],show=False,ax=ax)" - }, - { - "action": "Perform differential expression analysis between E12.5 and E11.5 cells in `kidney_lineage_ad` using `ov.bulk.pyDEG`. The analysis uses the t-test method and sets thresholds for fold change, p-value, and -log10(p-value). A volcano plot is generated to visualize the results, highlighting the top 8 differentially expressed genes.", - "code": "test_adata=kidney_lineage_ad\ndds=ov.bulk.pyDEG(test_adata.to_df(layer='lognorm').T)\ndds.drop_duplicates_index()\nprint('... drop_duplicates_index success')\ntreatment_groups=test_adata.obs[test_adata.obs['week']=='E12.5'].index.tolist()\ncontrol_groups=test_adata.obs[test_adata.obs['week']=='E11.5'].index.tolist()\nresult=dds.deg_analysis(treatment_groups,control_groups,method='ttest')\n# -1 means automatically calculates\ndds.foldchange_set(fc_threshold=-1,\n pval_threshold=0.05,\n logp_max=10)\n\n\ndds.plot_volcano(title='DEG Analysis',figsize=(4,4),\n plot_genes_num=8,plot_genes_fontsize=12,)" - }, - { - "action": "Extract the top 3 up-regulated and down-regulated genes from the differential expression analysis results based on q-value. Combine these genes into a single list `deg_gene`.", - "code": "up_gene=dds.result.loc[dds.result['sig']=='up'].sort_values('qvalue')[:3].index.tolist()\ndown_gene=dds.result.loc[dds.result['sig']=='down'].sort_values('qvalue')[:3].index.tolist()\ndeg_gene=up_gene+down_gene" - }, - { - "action": "Generate a dot plot showing the expression of the differentially expressed genes (`deg_gene`) in the 're_anno' groups of `kidney_lineage_ad`.", - "code": "sc.pl.dotplot(kidney_lineage_ad,deg_gene,\n groupby='re_anno')" - }, - { - "action": "Calculate a dendrogram for `kidney_lineage_ad` based on the 're_anno' groups and the specified representation. Perform a t-test to rank genes based on their differential expression between the 're_anno' groups. Generate a dot plot showing the top 3 ranked genes for each group, using a specified color map and scaling method.", - "code": "sc.tl.dendrogram(kidney_lineage_ad,'re_anno',use_rep='scaled|original|X_pca')\nsc.tl.rank_genes_groups(kidney_lineage_ad, 're_anno', use_rep='scaled|original|X_pca',\n method='t-test',use_raw=False,key_added='re_anno_ttest')\nsc.pl.rank_genes_groups_dotplot(kidney_lineage_ad,groupby='re_anno',\n cmap='RdBu_r',key='re_anno_ttest',\n standard_scale='var',n_genes=3)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_spaceflow.json b/rag_engine/ovrawmjson/t_spaceflow.json deleted file mode 100644 index ab57d05e..00000000 --- a/rag_engine/ovrawmjson/t_spaceflow.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from a specified path and file, and make variable names unique.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics, filter genes with total counts less than 100, identify spatially variable genes using the 'prost' method, and subset the AnnData object to include only spatially variable features.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)\nadata.raw = adata\nadata = adata[:, adata.var.space_variable_features]\nadata" - }, - { - "action": "Read ground truth annotations from a text file and assign them to the 'Ground Truth' column in the observation metadata of the AnnData object. Visualize the spatial distribution of the ground truth annotations.", - "code": "import pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\t', header=None, index_col=0)\nAnn_df.columns = ['Ground Truth']\nadata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])" - }, - { - "action": "Initialize a SpaceFlow object using the AnnData object.", - "code": "sf_obj=ov.space.pySpaceFlow(adata)" - }, - { - "action": "Train the SpaceFlow model with specified parameters for spatial regularization, embedding dimension, learning rate, epochs, patience, stopping criteria, random seed, GPU usage, and regularization acceleration.", - "code": "sf_obj.train(spatial_regularization_strength=0.1, \n z_dim=50, lr=1e-3, epochs=1000, \n max_patience=50, min_stop=100, \n random_seed=42, gpu=0, \n regularization_acceleration=True, edge_subset_sz=1000000)" - }, - { - "action": "Calculate the Pseudo-Spatial Map (pSM) using the trained SpaceFlow model with specified parameters for the number of neighbors, resolution, maximum cells for subsampling, and the key to store the pSM results.", - "code": "sf_obj.cal_pSM(n_neighbors=20,resolution=1,\n max_cell_for_subsampling=5000,psm_key='pSM_spaceflow')" - }, - { - "action": "Visualize the spatial distribution of the calculated pSM and the ground truth annotations.", - "code": "sc.pl.spatial(adata, color=['pSM_spaceflow','Ground Truth'],cmap='RdBu_r')" - }, - { - "action": "Cluster the spatial data using Gaussian Mixture Model (GMM) with specified parameters for the number of components, covariance type, tolerance, maximum iterations, and random state, using the 'spaceflow' representation.", - "code": "ov.utils.cluster(adata,use_rep='spaceflow',method='GMM',n_components=7,covariance_type='full',\n tol=1e-9, max_iter=1000, random_state=3607)" - }, - { - "action": "Visualize the spatial distribution of the GMM clusters and the ground truth annotations.", - "code": "sc.pl.spatial(adata, color=['gmm_cluster',\"Ground Truth\"])" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_stagate.json b/rag_engine/ovrawmjson/t_stagate.json deleted file mode 100644 index c400f9a2..00000000 --- a/rag_engine/ovrawmjson/t_stagate.json +++ /dev/null @@ -1,90 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy, and set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Read 10x Visium spatial transcriptomics data from a directory, specifying the path and count file. Ensure unique variable names.", - "code": "adata = sc.read_visium(path='data', count_file='151676_filtered_feature_bc_matrix.h5')\nadata.var_names_make_unique()" - }, - { - "action": "Calculate quality control metrics and filter genes with total counts less than 100. Identify spatially variable genes (SVGs) using the `prost` method, targeting 3000 SVGs and setting the target sum for normalization to 1e4.", - "code": "sc.pp.calculate_qc_metrics(adata, inplace=True)\nadata = adata[:,adata.var['total_counts']>100]\nadata=ov.space.svg(adata,mode='prost',n_svgs=3000,target_sum=1e4,platform=\"visium\",)" - }, - { - "action": "Write the processed AnnData object to an H5AD file with gzip compression.", - "code": "adata.write('data/cluster_svg.h5ad',compression='gzip')" - }, - { - "action": "Read ground truth annotations from a tab-separated file, assign them to the AnnData object, and visualize the spatial distribution of the ground truth labels.", - "code": "import pandas as pd\nimport os\nAnn_df = pd.read_csv(os.path.join('data', '151676_truth.txt'), sep='\\t', header=None, index_col=0)\nAnn_df.columns = ['Ground Truth']\nadata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']\nsc.pl.spatial(adata, img_key=\"hires\", color=[\"Ground Truth\"])" - }, - { - "action": "Define a GraphST model with the AnnData object and specify the device for computation.", - "code": "model = ov.externel.GraphST.GraphST(adata, device='cuda:0')" - }, - { - "action": "Train the GraphST model, specifying the number of principal components (n_pcs) to use.", - "code": "adata = model.train(n_pcs=30)" - }, - { - "action": "Cluster the spatial data using the `mclust` method with specified parameters, including the number of components and model name. Refine the cluster labels using `ov.utils.refine_label`.", - "code": "ov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='mclust',n_components=10,\n modelNames='EEV', random_state=112,\n )\nadata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust')" - }, - { - "action": "Compute a neighborhood graph using the specified representation and cluster the data using `louvain` and `leiden` methods. Refine the cluster labels using `ov.utils.refine_label`.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20,\n use_rep='graphst|original|X_pca')\nov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='louvain',resolution=0.7)\nov.utils.cluster(adata,use_rep='graphst|original|X_pca',method='leiden',resolution=0.7)\nadata.obs['louvain_GraphST'] = ov.utils.refine_label(adata, radius=50, key='louvain') \nadata.obs['leiden_GraphST'] = ov.utils.refine_label(adata, radius=50, key='leiden')" - }, - { - "action": "Visualize the spatial distribution of the cluster labels obtained from `mclust`, `leiden`, and `louvain` methods, along with the ground truth.", - "code": "sc.pl.spatial(adata, color=['mclust_GraphST','leiden_GraphST',\n 'louvain_GraphST',\"Ground Truth\"])" - }, - { - "action": "Assign spatial coordinates from `adata.obsm['spatial']` to `adata.obs['X']` and `adata.obs['Y']`.", - "code": "adata.obs['X'] = adata.obsm['spatial'][:,0]\nadata.obs['Y'] = adata.obsm['spatial'][:,1]" - }, - { - "action": "Construct a STAGATE object with specified parameters, including the number of batches, spatial keys, radius cutoff, number of epochs, learning rate, weight decay, and hidden dimensions.", - "code": "STA_obj=ov.space.pySTAGATE(adata,num_batch_x=3,num_batch_y=2,\n spatial_key=['X','Y'],rad_cutoff=200,num_epoch = 1000,lr=0.001,\n weight_decay=1e-4,hidden_dims = [512, 30],\n device='cuda:0')" - }, - { - "action": "Train the STAGATE model.", - "code": "STA_obj.train()" - }, - { - "action": "Predict latent embeddings and denoised expressions using the trained STAGATE model.", - "code": "STA_obj.predicted()" - }, - { - "action": "Cluster the spatial data using the `mclust` method on the STAGATE embeddings and refine the cluster labels.", - "code": "ov.utils.cluster(adata,use_rep='STAGATE',method='mclust',n_components=8,\n modelNames='EEV', random_state=112,\n )\nadata.obs['mclust_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='mclust')" - }, - { - "action": "Compute a neighborhood graph using the STAGATE embeddings and cluster the data using `louvain` and `leiden` methods. Refine the cluster labels.", - "code": "sc.pp.neighbors(adata, n_neighbors=15, n_pcs=20,\n use_rep='STAGATE')\nov.utils.cluster(adata,use_rep='STAGATE',method='louvain',resolution=0.5)\nov.utils.cluster(adata,use_rep='STAGATE',method='leiden',resolution=0.5)\nadata.obs['louvain_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='louvain') \nadata.obs['leiden_STAGATE'] = ov.utils.refine_label(adata, radius=50, key='leiden')" - }, - { - "action": "Visualize the spatial distribution of the cluster labels obtained from `mclust`, `leiden`, and `louvain` methods on the STAGATE embeddings, along with the ground truth.", - "code": "sc.pl.spatial(adata, color=['mclust_STAGATE','leiden_STAGATE',\n 'louvain_STAGATE',\"Ground Truth\"])" - }, - { - "action": "Sort genes by their spatial information score (PI) in descending order and display the top 10 genes.", - "code": "adata.var.sort_values('PI',ascending=False).head(10)" - }, - { - "action": "Plot the spatial expression of a specific gene (e.g., 'MBP') in both raw and denoised (STAGATE) forms.", - "code": "plot_gene = 'MBP'\nimport matplotlib.pyplot as plt\nfig, axs = plt.subplots(1, 2, figsize=(8, 4))\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[0], title='RAW_'+plot_gene, vmax='p99')\nsc.pl.spatial(adata, img_key=\"hires\", color=plot_gene, show=False, ax=axs[1], title='STAGATE_'+plot_gene, layer='STAGATE_ReX', vmax='p99')" - }, - { - "action": "Calculate the pseudo-spatial map (pSM) using the STAGATE model with specified parameters.", - "code": "STA_obj.cal_pSM(n_neighbors=20,resolution=1,\n max_cell_for_subsampling=5000)" - }, - { - "action": "Visualize the spatial distribution of the ground truth and the calculated pSM.", - "code": "sc.pl.spatial(adata, color=['Ground Truth','pSM_STAGATE'],\n cmap='RdBu_r')" - }, - { - "action": "Evaluate the clustering performance using the Adjusted Rand Index (ARI) for different clustering methods and models (GraphST and STAGATE) compared to the ground truth.", - "code": "from sklearn.metrics.cluster import adjusted_rand_score\n\nobs_df = adata.obs.dropna()\n#GraphST\nARI = adjusted_rand_score(obs_df['mclust_GraphST'], obs_df['Ground Truth'])\nprint('mclust_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['leiden_GraphST'], obs_df['Ground Truth'])\nprint('leiden_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['louvain_GraphST'], obs_df['Ground Truth'])\nprint('louvain_GraphST: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['mclust_STAGATE'], obs_df['Ground Truth'])\nprint('mclust_STAGATE: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['leiden_STAGATE'], obs_df['Ground Truth'])\nprint('leiden_STAGATE: Adjusted rand index = %.2f' %ARI)\n\nARI = adjusted_rand_score(obs_df['louvain_STAGATE'], obs_df['Ground Truth'])\nprint('louvain_STAGATE: Adjusted rand index = %.2f' %ARI)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_staligner.json b/rag_engine/ovrawmjson/t_staligner.json deleted file mode 100644 index 3136035c..00000000 --- a/rag_engine/ovrawmjson/t_staligner.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "action": "Import necessary libraries: scipy.sparse, omicverse, scanpy, anndata, pandas, and os. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "from scipy.sparse import csr_matrix\nimport omicverse as ov\nimport scanpy as sc\nimport anndata as ad\nimport pandas as pd\nimport os\n\nov.utils.ov_plot_set()" - }, - { - "action": "Initialize empty lists `Batch_list` and `adj_list` to store AnnData objects and adjacency matrices, respectively. Define `section_ids` list containing the names of the datasets to be processed. Set the `pathway` variable to the directory containing the data files.", - "code": "Batch_list = []\nadj_list = []\nsection_ids = ['Slide-seqV2_MoB', 'Stereo-seq_MoB']\nprint(section_ids)\npathway = '/storage/zengjianyangLab/hulei/scRNA-seq/scripts/STAligner'" - }, - { - "action": "Iterate through each `section_id` in `section_ids`. Load the corresponding h5ad file into an AnnData object `adata`. Check if `adata.X` is a pandas DataFrame and convert it to a sparse matrix if necessary. Make variable names unique. Prepend `section_id` to each observation name to ensure uniqueness across datasets.", - "code": "for section_id in section_ids:\n print(section_id)\n adata = sc.read_h5ad(os.path.join(pathway,section_id+\".h5ad\"))\n\n # check whether the adata.X is sparse matrix\n if isinstance(adata.X, pd.DataFrame):\n adata.X = csr_matrix(adata.X)\n else:\n pass\n\n adata.var_names_make_unique(join=\"++\")\n\n # make spot name unique\n adata.obs_names = [x+'_'+section_id for x in adata.obs_names]" - }, - { - "action": "Construct the spatial network using `ov.space.Cal_Spatial_Net` with a specified `rad_cutoff`. Perform normalization by selecting highly variable genes using `sc.pp.highly_variable_genes`, normalizing total counts with `sc.pp.normalize_total`, and applying log transformation with `sc.pp.log1p`. Subset `adata` to include only highly variable genes. Append the adjacency matrix and the processed `adata` to `adj_list` and `Batch_list`, respectively.", - "code": " # Constructing the spatial network\n ov.space.Cal_Spatial_Net(adata, rad_cutoff=50) # the spatial network are saved in adata.uns[‘adj’]\n\n # Normalization\n sc.pp.highly_variable_genes(adata, flavor=\"seurat_v3\", n_top_genes=10000)\n sc.pp.normalize_total(adata, target_sum=1e4)\n sc.pp.log1p(adata)\n\n adata = adata[:, adata.var['highly_variable']]\n adj_list.append(adata.uns['adj'])\n Batch_list.append(adata)" - }, - { - "action": "Print the `Batch_list` which now contains the processed AnnData objects for each dataset.", - "code": "Batch_list" - }, - { - "action": "Concatenate the AnnData objects in `Batch_list` into a single AnnData object `adata_concat`. Assign `slice_name` as the label for concatenation and use `section_ids` as keys. Add a new column `batch_name` to `adata_concat.obs` and set it as a categorical variable with the same values as `slice_name`. Print the shape of the concatenated AnnData object.", - "code": "adata_concat = ad.concat(Batch_list, label=\"slice_name\", keys=section_ids)\nadata_concat.obs[\"batch_name\"] = adata_concat.obs[\"slice_name\"].astype('category')\nprint('adata_concat.shape: ', adata_concat.shape)" - }, - { - "action": "Train the STAligner model using the `ov.space.pySTAligner` function. Set parameters for the model, including the number of nearest neighbors (`knn_neigh`), number of epochs (`n_epochs`), integration order (`iter_comb`), batch key (`batch_key`), and the key to add the results (`key_added`). Also, pass the list of AnnData objects (`Batch_list`) to the function.", - "code": "get_ipython().run_cell_magic('time', '', \"# iter_comb is used to specify the order of integration. For example, (0, 1) means slice 0 will be algined with slice 1 as reference.\\niter_comb = [(i, i + 1) for i in range(len(section_ids) - 1)]\\n\\n# Here, to reduce GPU memory usage, each slice is considered as a subgraph for training.\\nSTAligner_obj = ov.space.pySTAligner(adata_concat, verbose=True, knn_neigh = 100, n_epochs = 600, iter_comb = iter_comb,\\n batch_key = 'batch_name', key_added='STAligner', Batch_list = Batch_list)\\n\")" - }, - { - "action": "Train the STAligner model by calling the `train()` method on the `STAligner_obj`.", - "code": "STAligner_obj.train()" - }, - { - "action": "Retrieve the predicted AnnData object with the latent embedding stored in `adata.obsm['STAligner']` by calling the `predicted()` method on the `STAligner_obj`.", - "code": "adata = STAligner_obj.predicted()" - }, - { - "action": "Compute the neighbor graph using the 'STAligner' representation with `sc.pp.neighbors`. Perform clustering using the Leiden algorithm with `ov.utils.cluster` and a specified resolution. Calculate UMAP embeddings with `sc.tl.umap`. Visualize the UMAP embeddings colored by 'batch_name' and 'leiden' clusters using `sc.pl.umap`.", - "code": "sc.pp.neighbors(adata, use_rep='STAligner', random_state=666)\nov.utils.cluster(adata,use_rep='STAligner',method='leiden',resolution=0.4)\nsc.tl.umap(adata, random_state=666)\nsc.pl.umap(adata, color=['batch_name',\"leiden\"],wspace=0.5)" - }, - { - "action": "Create a spatial plot of the clustering results. Define `spot_size` and `title_size` for plot aesthetics. Generate a subplot with two axes. Plot the spatial distribution of 'leiden' clusters for 'Slide-seqV2_MoB' and 'Stereo-seq_MoB' datasets using `sc.pl.spatial`. Adjust the title size and invert the y-axis for the 'Stereo-seq' plot.", - "code": "import matplotlib.pyplot as plt\nspot_size = 50\ntitle_size = 15\nfig, ax = plt.subplots(1, 2, figsize=(6, 3), gridspec_kw={'wspace': 0.05, 'hspace': 0.2})\n_sc_0 = sc.pl.spatial(adata[adata.obs['batch_name'] == 'Slide-seqV2_MoB'], img_key=None, color=['leiden'], title=['Slide-seqV2'],\n legend_fontsize=10, show=False, ax=ax[0], frameon=False, spot_size=spot_size, legend_loc=None)\n_sc_0[0].set_title('Slide-seqV2', size=title_size)\n\n_sc_1 = sc.pl.spatial(adata[adata.obs['batch_name'] == 'Stereo-seq_MoB'], img_key=None, color=['leiden'], title=['Stereo-seq'],\n legend_fontsize=10, show=False, ax=ax[1], frameon=False, spot_size=spot_size)\n_sc_1[0].set_title('Stereo-seq',size=title_size)\n_sc_1[0].invert_yaxis()\nplt.show()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_starfysh.json b/rag_engine/ovrawmjson/t_starfysh.json deleted file mode 100644 index 0e9a3ce8..00000000 --- a/rag_engine/ovrawmjson/t_starfysh.json +++ /dev/null @@ -1,126 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy and omicverse. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Import specific modules from the omicverse package related to Starfysh, including Archetypal Analysis (AA), utility functions (utils), plotting utilities (plot_utils), and post-analysis tools (post_analysis). Also, import the Starfysh model itself (_starfysh).", - "code": "from omicverse.externel.starfysh import (AA, utils, plot_utils, post_analysis)\nfrom omicverse.externel.starfysh import _starfysh as sf_model" - }, - { - "action": "Define file paths for the data, sample ID, and signature gene set file name.", - "code": "# Specify data paths\ndata_path = 'data/star_data'\nsample_id = 'CID44971_TNBC'\nsig_name = 'bc_signatures_version_1013.csv'" - }, - { - "action": "Load spatial transcriptomics data and signature gene sets using utility functions. The `load_adata` function reads the data, and `filter_gene_sig` filters the signature gene sets to include only genes present in the spatial transcriptomics data.", - "code": "# Load expression counts and signature gene sets\nadata, adata_normed = utils.load_adata(data_folder=data_path,\n sample_id=sample_id, # sample id\n n_genes=2000 # number of highly variable genes to keep\n )" - }, - { - "action": "Import pandas and os libraries. Read the signature gene sets from a CSV file into a pandas DataFrame. Filter the gene signatures to keep only those genes that are also present in the spatial transcriptomics data.", - "code": "import pandas as pd\nimport os\ngene_sig = pd.read_csv(os.path.join(data_path, sig_name))\ngene_sig = utils.filter_gene_sig(gene_sig, adata.to_df())\ngene_sig.head()" - }, - { - "action": "Load spatial information and preprocess the histology image associated with the spatial transcriptomics data. This includes reading the image, extracting mapping information, and calculating scale factors.", - "code": "# Load spatial information\nimg_metadata = utils.preprocess_img(data_path,\n sample_id,\n adata_index=adata.obs.index,\n #hchannel=False\n )\nimg, map_info, scalefactor = img_metadata['img'], img_metadata['map_info'], img_metadata['scalefactor']\numap_df = utils.get_umap(adata, display=True)" - }, - { - "action": "Import the matplotlib.pyplot module for plotting. Create a new figure and display the histology image using `imshow`.", - "code": "import matplotlib.pyplot as plt\nplt.figure(figsize=(6, 6), dpi=80)\nplt.imshow(img)" - }, - { - "action": "Display the first few rows of the `map_info` DataFrame, which contains spatial mapping information for the spots in the spatial transcriptomics data.", - "code": "map_info.head()" - }, - { - "action": "Prepare arguments for the Visium data processing, including raw and normalized count data, filtered signature genes, image metadata, number of anchor spots, window size for spatial smoothing, and sample ID.", - "code": "# Parameters for training\nvisium_args = utils.VisiumArguments(adata,\n adata_normed,\n gene_sig,\n img_metadata,\n n_anchors=60,\n window_size=3,\n sample_id=sample_id\n )\n\nadata, adata_normed = visium_args.get_adata()\nanchors_df = visium_args.get_anchors()" - }, - { - "action": "Add new columns to the `adata.obs` DataFrame for log-transformed library size and windowed log-transformed library size, which are calculated during the Visium data processing.", - "code": "adata.obs['log library size']=visium_args.log_lib\nadata.obs['windowed log library size']=visium_args.win_loglib" - }, - { - "action": "Use scanpy's `sc.pl.spatial` function to visualize the log library size on the spatial map. The plot is colored using the 'magma' colormap, and the size of the spots is adjusted for better visualization.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the windowed log library size on the spatial map using `sc.pl.spatial`. This plot shows the spatially smoothed library size, which can help in understanding the spatial distribution of sequencing depth.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='windowed log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the raw gene expression of the 'IL7R' gene on the spatial map using `sc.pl.spatial`. This plot helps in understanding the spatial expression pattern of a specific gene.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='IL7R',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Plot the anchor spots identified during the preprocessing step. Anchor spots are locations in the tissue that are representative of specific cell types. The plot shows the UMAP representation of the data with anchor spots highlighted.", - "code": "plot_utils.plot_anchor_spots(umap_df,\n visium_args.pure_spots,\n visium_args.sig_mean,\n bbox_x=2\n )" - }, - { - "action": "Initialize the Archetypal Analysis (AA) model using the normalized spatial transcriptomics data. Compute archetypes, which represent the 'purest' cell types in the data. Find archetypal spots and marker genes associated with each archetype. Assign archetypes to the closest anchor spots and identify distant archetypes that may represent novel cell types or states.", - "code": "aa_model = AA.ArchetypalAnalysis(adata_orig=adata_normed)\narchetype, arche_dict, major_idx, evs = aa_model.compute_archetypes(cn=40)\n# (1). Find archetypal spots & archetypal clusters\narche_df = aa_model.find_archetypal_spots(major=True)\n\n# (2). Find marker genes associated with each archetypal cluster\nmarkers_df = aa_model.find_markers(n_markers=30, display=False)\n\n# (3). Map archetypes to closest anchors (1-1 per cell type)\nmap_df, map_dict = aa_model.assign_archetypes(anchors_df)\n\n# (4). Optional: Find the most distant archetypes that are not assigned to any annotated cell types\ndistant_arches = aa_model.find_distant_archetypes(anchors_df, n=3)" - }, - { - "action": "Plot the explained variances (evs) from the Archetypal Analysis to help determine the optimal number of archetypes. The plot shows the cumulative explained variance as a function of the number of archetypes.", - "code": "plot_utils.plot_evs(evs, kmin=aa_model.kmin)" - }, - { - "action": "Visualize the archetypes in a 2D UMAP representation. The plot shows the distribution of archetypes and their relationships to each other.", - "code": "aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False)" - }, - { - "action": "Visualize the mapping between archetypes and cell types. This plot helps in understanding how the identified archetypes correspond to known cell types based on the anchor spots.", - "code": "aa_model.plot_mapping(map_df)" - }, - { - "action": "Refine the anchor spots by appending marker genes from the best-aligned archetypes. This step updates the signature genes and anchor spots based on the Archetypal Analysis results.", - "code": "visium_args = utils.refine_anchors(\n visium_args,\n aa_model,\n #thld=0.7, # alignment threshold\n n_genes=5,\n #n_iters=1\n)\n\n# Get updated adata & signatures\nadata, adata_normed = visium_args.get_adata()\ngene_sig = visium_args.gene_sig\ncell_types = gene_sig.columns" - }, - { - "action": "Import the torch library. Set the number of random restarts for model training, the number of epochs, and the patience for early stopping. Define the device for model training (CPU or GPU).", - "code": "import torch\nn_repeats = 3\nepochs = 200\npatience = 50\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - }, - { - "action": "Train the Starfysh model without histology integration. The `run_starfysh` function performs the model training with the specified parameters and returns the trained model and the training loss.", - "code": "# Run models\nmodel, loss = utils.run_starfysh(visium_args,\n n_repeats=n_repeats,\n epochs=epochs,\n #patience=patience,\n device=device\n )" - }, - { - "action": "Evaluate the trained Starfysh model and obtain inference and generative outputs. The `model_eval` function processes the model outputs and returns the inferred cell type proportions and other relevant parameters.", - "code": "adata, adata_normed = visium_args.get_adata()\ninference_outputs, generative_outputs,adata_ = sf_model.model_eval(model,\n adata,\n visium_args,\n poe=False,\n device=device)" - }, - { - "action": "Import numpy. Select a random cell type index and use the `gene_mean_vs_inferred_prop` function to compare the signature gene mean expression with the inferred cell type proportion for the selected cell type.", - "code": "import numpy as np\nn_cell_types = gene_sig.shape[1]\nidx = np.random.randint(0, n_cell_types)\npost_analysis.gene_mean_vs_inferred_prop(inference_outputs,\n visium_args,\n idx=idx,\n figsize=(4,4)\n )" - }, - { - "action": "Use the `pl_spatial_inf_feature` function to visualize an inferred feature (e.g., 'ql_m') on the spatial map. The plot is colored using the 'Blues' colormap.", - "code": "plot_utils.pl_spatial_inf_feature(adata_, feature='ql_m', cmap='Blues')" - }, - { - "action": "Define a function `cell2proportion` to extract cell type proportion data from the `adata_` object and create a new AnnData object (`adata_plot`) for plotting.", - "code": "def cell2proportion(adata):\n adata_plot=sc.AnnData(adata.X)\n adata_plot.obs=utils.extract_feature(adata_, 'qc_m').obs.copy()\n adata_plot.var=adata.var.copy()\n adata_plot.obsm=adata.obsm.copy()\n adata_plot.obsp=adata.obsp.copy()\n adata_plot.uns=adata.uns.copy()\n return adata_plot\nadata_plot=cell2proportion(adata_)" - }, - { - "action": "Display a summary of the `adata_plot` object, which contains the cell type proportion data.", - "code": "adata_plot" - }, - { - "action": "Visualize the inferred cell type proportions for specific cell types ('Basal', 'LumA', 'LumB') on the spatial map using `sc.pl.spatial`. The plot is colored using the 'Spectral_r' colormap, and the color scale is limited to the 90th percentile.", - "code": "sc.pl.spatial(adata_plot, cmap='Spectral_r',\n # show first 8 cell types\n color=['Basal','LumA','LumB'],\n ncols=4, size=1.3,\n img_key='hires',\n vmin=0, vmax='p90'\n )" - }, - { - "action": "Use `ov.pl.embedding` to visualize the cell type proportions in a 2D UMAP representation. The plot shows the distribution of cell types ('Basal', 'LumA', 'MBC', 'Normal epithelial') and their relationships to each other.", - "code": "ov.pl.embedding(adata_plot,\n basis='z_umap',\n color=['Basal', 'LumA', 'MBC', 'Normal epithelial'],\n frameon='small',\n vmin=0, vmax='p90',\n cmap='Spectral_r',\n )" - }, - { - "action": "Calculate the predicted expression of specific genes in each cell type using the `model_ct_exp` function. This function processes the model outputs and returns the predicted gene expression values.", - "code": "pred_exprs = sf_model.model_ct_exp(model,\n adata,\n visium_args,\n device=device)" - }, - { - "action": "Select a specific gene ('IL7R') and cell type ('Tem'). Add a new layer to the `adata_` object with the predicted expression values for the selected gene in the selected cell type. Visualize the predicted expression on the spatial map using `sc.pl.spatial`.", - "code": "gene='IL7R'\ngene_celltype='Tem'\nadata_.layers[f'infer_{gene_celltype}']=pred_exprs[gene_celltype]\n\nsc.pl.spatial(adata_, cmap='Spectral_r',\n # show first 8 cell types\n color=gene,\n title=f'{gene} (Predicted expression)\\n{gene_celltype}',\n layer=f'infer_{gene_celltype}',\n ncols=4, size=1.3,\n img_key='hires',\n #vmin=0, vmax='p90'\n )" - }, - { - "action": "Specify the output directory for saving the model and inferred parameters. Create the directory if it doesn't exist. Save the trained Starfysh model's state dictionary to a .pt file. Save the `adata` object with inferred parameters to a .h5ad file.", - "code": "# Specify output directory\noutdir = './results/'\nif not os.path.exists(outdir):\n os.mkdir(outdir)\n\n# save the model\ntorch.save(model.state_dict(), os.path.join(outdir, 'starfysh_model.pt'))\n\n# save `adata` object with inferred parameters\nadata.write(os.path.join(outdir, 'st.h5ad'))" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_stt.json b/rag_engine/ovrawmjson/t_stt.json deleted file mode 100644 index 31b451c4..00000000 --- a/rag_engine/ovrawmjson/t_stt.json +++ /dev/null @@ -1,218 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy and omicverse. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nov.plot_set()" - }, - { - "action": "Import specific modules from the omicverse.external.starfysh subpackage: AA (Archetypal Analysis), utils (utility functions), plot_utils (plotting utilities), post_analysis (post-analysis functions), and _starfysh (Starfysh model).", - "code": "from omicverse.externel.starfysh import (AA, utils, plot_utils, post_analysis)\nfrom omicverse.externel.starfysh import _starfysh as sf_model" - }, - { - "action": "Define file paths for the data, sample ID, and signature gene set file name.", - "code": "# Specify data paths\ndata_path = 'data/star_data'\nsample_id = 'CID44971_TNBC'\nsig_name = 'bc_signatures_version_1013.csv'" - }, - { - "action": "Load expression count data and signature gene sets using custom utility functions. The `load_adata` function reads spatial transcriptomics data, and `filter_gene_sig` filters the signature gene sets based on the expression data.", - "code": "# Load expression counts and signature gene sets\nadata, adata_normed = utils.load_adata(data_folder=data_path,\n sample_id=sample_id, # sample id\n n_genes=2000 # number of highly variable genes to keep\n )" - }, - { - "action": "Import pandas and os libraries. Load signature gene sets from a CSV file into a pandas DataFrame. Filter the gene signatures to include only genes present in the expression data.", - "code": "import pandas as pd\nimport os\ngene_sig = pd.read_csv(os.path.join(data_path, sig_name))\ngene_sig = utils.filter_gene_sig(gene_sig, adata.to_df())\ngene_sig.head()" - }, - { - "action": "Load and preprocess spatial information associated with the expression data. This includes image data, mapping information, and scaling factors. Calculate a UMAP representation of the data for visualization.", - "code": "# Load spatial information\nimg_metadata = utils.preprocess_img(data_path,\n sample_id,\n adata_index=adata.obs.index,\n #hchannel=False\n )\nimg, map_info, scalefactor = img_metadata['img'], img_metadata['map_info'], img_metadata['scalefactor']\numap_df = utils.get_umap(adata, display=True)" - }, - { - "action": "Import the matplotlib.pyplot module for plotting. Create a new figure and display the image data loaded in the previous step.", - "code": "import matplotlib.pyplot as plt\nplt.figure(figsize=(6, 6), dpi=80)\nplt.imshow(img)" - }, - { - "action": "Display the first few rows of the `map_info` DataFrame, which contains spatial mapping information.", - "code": "map_info.head()" - }, - { - "action": "Define parameters for Starfysh model training using the `VisiumArguments` class. This includes raw and normalized expression data, filtered signature genes, image metadata, number of anchor spots, window size for spatial smoothing, and sample ID. Prepare the AnnData objects and calculate anchor spots.", - "code": "# Parameters for training\nvisium_args = utils.VisiumArguments(adata,\n adata_normed,\n gene_sig,\n img_metadata,\n n_anchors=60,\n window_size=3,\n sample_id=sample_id\n )\n\nadata, adata_normed = visium_args.get_adata()\nanchors_df = visium_args.get_anchors()" - }, - { - "action": "Add log-transformed library size and windowed log-transformed library size to the observation metadata of the `adata` object.", - "code": "adata.obs['log library size']=visium_args.log_lib\nadata.obs['windowed log library size']=visium_args.win_loglib" - }, - { - "action": "Use scanpy's `sc.pl.spatial` function to visualize the log library size on a spatial map. The plot is colored using the 'magma' colormap.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the windowed log library size on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'magma' colormap.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='windowed log library size',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Visualize the raw gene expression of the gene 'IL7R' on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'magma' colormap.", - "code": "sc.pl.spatial(adata, cmap='magma',\n # show first 8 cell types\n color='IL7R',\n ncols=4, size=1.3,\n img_key='hires',\n #palette=Layer_color\n # limit color scale at 99.2% quantile of cell abundance\n #vmin=0, vmax='p99.2'\n )" - }, - { - "action": "Plot anchor spots on a UMAP representation of the data using the `plot_anchor_spots` function from `plot_utils`. This visualization helps to identify the locations of anchor spots for each cell type.", - "code": "plot_utils.plot_anchor_spots(umap_df,\n visium_args.pure_spots,\n visium_args.sig_mean,\n bbox_x=2\n )" - }, - { - "action": "Initialize an Archetypal Analysis (AA) model using the normalized AnnData object. Compute archetypes, find archetypal spots and clusters, define marker genes for each archetypal cluster, map archetypes to the closest anchor spots, and optionally find distant archetypes not assigned to any annotated cell types.", - "code": "aa_model = AA.ArchetypalAnalysis(adata_orig=adata_normed)\narchetype, arche_dict, major_idx, evs = aa_model.compute_archetypes(cn=40)\n# (1). Find archetypal spots & archetypal clusters\narche_df = aa_model.find_archetypal_spots(major=True)\n\n# (2). Find marker genes associated with each archetypal cluster\nmarkers_df = aa_model.find_markers(n_markers=30, display=False)\n\n# (3). Map archetypes to closest anchors (1-1 per cell type)\nmap_df, map_dict = aa_model.assign_archetypes(anchors_df)\n\n# (4). Optional: Find the most distant archetypes that are not assigned to any annotated cell types\ndistant_arches = aa_model.find_distant_archetypes(anchors_df, n=3)" - }, - { - "action": "Plot the explained variances (evs) from the Archetypal Analysis model using the `plot_evs` function. This helps to determine the optimal number of archetypes.", - "code": "plot_utils.plot_evs(evs, kmin=aa_model.kmin)" - }, - { - "action": "Visualize the archetypes in a 2D or 3D plot using the `plot_archetypes` function from the `aa_model`. This helps to understand the geometric structure of the data and the identified archetypes.", - "code": "aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False)" - }, - { - "action": "Visualize the mapping between archetypes and cell types using the `plot_mapping` function from the `aa_model`. This shows how archetypes correspond to known cell types.", - "code": "aa_model.plot_mapping(map_df)" - }, - { - "action": "Refine the anchor spots and update the signature genes by appending archetypal marker genes with the best-aligned anchors. This step uses the `refine_anchors` function from `utils` and updates the `visium_args` object.", - "code": "visium_args = utils.refine_anchors(\n visium_args,\n aa_model,\n #thld=0.7, # alignment threshold\n n_genes=5,\n #n_iters=1\n)\n\n# Get updated adata & signatures\nadata, adata_normed = visium_args.get_adata()\ngene_sig = visium_args.gene_sig\ncell_types = gene_sig.columns" - }, - { - "action": "Import the torch library. Define parameters for model training, including the number of random restarts (`n_repeats`), number of epochs, patience for early stopping, and the device to use for training (GPU if available, otherwise CPU).", - "code": "import torch\nn_repeats = 3\nepochs = 200\npatience = 50\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - }, - { - "action": "Train the Starfysh model using the `run_starfysh` function from `utils`. This function runs the model training with specified parameters and returns the trained model and the training loss.", - "code": "# Run models\nmodel, loss = utils.run_starfysh(visium_args,\n n_repeats=n_repeats,\n epochs=epochs,\n #patience=patience,\n device=device\n )" - }, - { - "action": "Evaluate the trained Starfysh model using the `model_eval` function from `sf_model`. This function parses the model inference outputs and generative outputs, and updates the `adata` object with these results.", - "code": "adata, adata_normed = visium_args.get_adata()\ninference_outputs, generative_outputs,adata_ = sf_model.model_eval(model,\n adata,\n visium_args,\n poe=False,\n device=device)" - }, - { - "action": "Import the numpy library. Select a random cell type index and use the `gene_mean_vs_inferred_prop` function from `post_analysis` to compare the signature gene means with the inferred proportions for that cell type.", - "code": "import numpy as np\nn_cell_types = gene_sig.shape[1]\nidx = np.random.randint(0, n_cell_types)\npost_analysis.gene_mean_vs_inferred_prop(inference_outputs,\n visium_args,\n idx=idx,\n figsize=(4,4)\n )" - }, - { - "action": "Visualize the inferred feature 'ql_m' on a spatial map using the `pl_spatial_inf_feature` function from `plot_utils`. The plot is colored using the 'Blues' colormap.", - "code": "plot_utils.pl_spatial_inf_feature(adata_, feature='ql_m', cmap='Blues')" - }, - { - "action": "Define a function `cell2proportion` to extract cell type proportions from the `adata_` object and create a new AnnData object (`adata_plot`) for visualization.", - "code": "def cell2proportion(adata):\n adata_plot=sc.AnnData(adata.X)\n adata_plot.obs=utils.extract_feature(adata_, 'qc_m').obs.copy()\n adata_plot.var=adata.var.copy()\n adata_plot.obsm=adata.obsm.copy()\n adata_plot.obsp=adata.obsp.copy()\n adata_plot.uns=adata.uns.copy()\n return adata_plot\nadata_plot=cell2proportion(adata_)" - }, - { - "action": "Display a summary of the `adata_plot` object, which contains the cell type proportions extracted from `adata_`.", - "code": "adata_plot" - }, - { - "action": "Visualize the inferred cell type proportions for specific cell types ('Basal', 'LumA', 'LumB') on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'Spectral_r' colormap and displays values up to the 90th percentile.", - "code": "sc.pl.spatial(adata_plot, cmap='Spectral_r',\n # show first 8 cell types\n color=['Basal','LumA','LumB'],\n ncols=4, size=1.3,\n img_key='hires',\n vmin=0, vmax='p90'\n )" - }, - { - "action": "Visualize the inferred cell type proportions for specific cell types ('Basal', 'LumA', 'MBC', 'Normal epithelial') on a UMAP representation using `ov.pl.embedding`. The plot is colored using the 'Spectral_r' colormap and displays values up to the 90th percentile.", - "code": "ov.pl.embedding(adata_plot,\n basis='z_umap',\n color=['Basal', 'LumA', 'MBC', 'Normal epithelial'],\n frameon='small',\n vmin=0, vmax='p90',\n cmap='Spectral_r',\n )" - }, - { - "action": "Predict cell type-specific gene expression using the `model_ct_exp` function from `sf_model`. This function calculates the predicted expression levels for each cell type based on the trained model.", - "code": "pred_exprs = sf_model.model_ct_exp(model,\n adata,\n visium_args,\n device=device)" - }, - { - "action": "Visualize the predicted expression of the gene 'IL7R' for the cell type 'Tem' on a spatial map using scanpy's `sc.pl.spatial` function. The plot is colored using the 'Spectral_r' colormap and displays the predicted expression values.", - "code": "gene='IL7R'\ngene_celltype='Tem'\nadata_.layers[f'infer_{gene_celltype}']=pred_exprs[gene_celltype]\n\nsc.pl.spatial(adata_, cmap='Spectral_r',\n # show first 8 cell types\n color=gene,\n title=f'{gene} (Predicted expression)\\n{gene_celltype}',\n layer=f'infer_{gene_celltype}',\n ncols=4, size=1.3,\n img_key='hires',\n #vmin=0, vmax='p90'\n )" - }, - { - "action": "Specify an output directory to save the model and inferred parameters. Create the directory if it does not exist. Save the trained model's state dictionary to a .pt file and write the `adata` object with inferred parameters to a .h5ad file.", - "code": "# Specify output directory\noutdir = './results/'\nif not os.path.exists(outdir):\n os.mkdir(outdir)\n\n# save the model\ntorch.save(model.state_dict(), os.path.join(outdir, 'starfysh_model.pt'))\n\n# save `adata` object with inferred parameters\nadata.write(os.path.join(outdir, 'st.h5ad'))" - }, - { - "action": "Import the omicverse, scvelo, and scanpy libraries. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\n#import omicverse.STT as st\nimport scvelo as scv\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Read the 'mouse_brain.h5ad' file into an AnnData object using `sc.read_h5ad`.", - "code": "adata = sc.read_h5ad('mouse_brain.h5ad')\nadata" - }, - { - "action": "Construct an STT object using the `ov.space.STT` class. Initialize it with the AnnData object, specifying the spatial location key as 'xy_loc' and the region key as 'Region'.", - "code": "STT_obj=ov.space.STT(adata,spatial_loc='xy_loc',region='Region')" - }, - { - "action": "Estimate the stages for the STT model using the `stage_estimate` method.", - "code": "STT_obj.stage_estimate()" - }, - { - "action": "Train the STT model with specified parameters: 9 states, 15 iterations, connectivity weight of 0.5, 50 neighbors, threshold for MS gene of 0.2, and spatial weight of 0.3.", - "code": "STT_obj.train(n_states = 9, n_iter = 15, weight_connectivities = 0.5, \n n_neighbors = 50,thresh_ms_gene = 0.2, spa_weight =0.3)" - }, - { - "action": "Visualize the 'attractor' attribute on a 2D embedding using the 'xy_loc' basis with `ov.pl.embedding`. The plot is colored by the 'attractor' values and uses a specific color palette.", - "code": "ov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"attractor\"],frameon='small',\n palette=ov.pl.sc_color[11:])" - }, - { - "action": "Visualize the 'Region' attribute on a 2D embedding using the 'xy_loc' basis with `ov.pl.embedding`. The plot is colored by the 'Region' values.", - "code": "ov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Region\"],frameon='small',\n )" - }, - { - "action": "Prepare a pathway dictionary using the `ov.utils.geneset_prepare` function. The pathway data is loaded from the 'KEGG_2019_Mouse.txt' file, and the organism is specified as 'Mouse'.", - "code": "pathway_dict=ov.utils.geneset_prepare('genesets/KEGG_2019_Mouse.txt',organism='Mouse')" - }, - { - "action": "Compute pathway enrichment for the STT model using the `compute_pathway` method and the prepared pathway dictionary.", - "code": "STT_obj.compute_pathway(pathway_dict)" - }, - { - "action": "Plot the pathway enrichment results using the `plot_pathway` method. The plot is displayed with a specified figure size, marker size, and font size. Axis labels are adjusted for better readability.", - "code": "fig = STT_obj.plot_pathway(figsize = (10,8),size = 100,fontsize = 12)\nfor ax in fig.axes:\n ax.set_xlabel('Embedding 1', fontsize=20) # Adjust font size as needed\n ax.set_ylabel('Embedding 2', fontsize=20) # Adjust font size as needed\nfig.show()" - }, - { - "action": "Create a subplot and visualize the streamlines for the 'Wnt signaling pathway' using the `plot_tensor_pathway` method. The plot is based on the 'xy_loc' coordinates.", - "code": "import matplotlib.pyplot as plt\nfig, ax = plt.subplots(1, 1, figsize=(4, 4))\nSTT_obj.plot_tensor_pathway(pathway_name = 'Wnt signaling pathway',basis = 'xy_loc',\n ax=ax)" - }, - { - "action": "Create a subplot and visualize the streamlines for the 'TGF-beta signaling pathway' using the `plot_tensor_pathway` method. The plot is based on the 'xy_loc' coordinates.", - "code": "fig, ax = plt.subplots(1, 1, figsize=(4, 4))\nSTT_obj.plot_tensor_pathway( 'TGF-beta signaling pathway',basis = 'xy_loc',\n ax=ax)" - }, - { - "action": "Plot the tensor for specific attractors [1, 3, 5, 6] using the `plot_tensor` method. The plot filters cells based on a membership threshold and adjusts the density of the visualization.", - "code": "STT_obj.plot_tensor(list_attractor = [1,3,5,6],\n filter_cells = True, member_thresh = 0.1, density = 1)" - }, - { - "action": "Construct a landscape representation of the STT model using the `construct_landscape` method. The landscape is based on the 'X_xy_loc' coordinate key.", - "code": "STT_obj.construct_landscape(coord_key = 'X_xy_loc')" - }, - { - "action": "Visualize the 'attractor' and 'Region' attributes on a 2D embedding using the 'trans_coord' basis with `sc.pl.embedding`.", - "code": "sc.pl.embedding(adata, color = ['attractor', 'Region'],basis= 'trans_coord')" - }, - { - "action": "Infer the lineage of the STT model using the `infer_lineage` method. The method used is 'MPPT' (most probable path tree), with specified start and end indices, flux fraction, color palette, point size, and text size.", - "code": "STT_obj.infer_lineage(si=3,sf=4, method = 'MPPT',flux_fraction=0.8,color_palette_name = 'tab10',size_point = 8,\n size_text=12)" - }, - { - "action": "Plot a Sankey diagram showing the relationship between STT attractors and spatial region annotations using the `plot_sankey` method.", - "code": "fig = STT_obj.plot_sankey(adata.obs['attractor'].tolist(),adata.obs['Region'].tolist())" - }, - { - "action": "Write the `adata` and `adata_aggr` objects to H5AD files. The `adata` object is saved as 'mouse_brain_adata.h5ad', and the `adata_aggr` object is saved as 'mouse_brain_adata_aggr.h5ad'.", - "code": "STT_obj.adata.write('data/mouse_brain_adata.h5ad')\nSTT_obj.adata_aggr.write('data/mouse_brain_adata_aggr.h5ad')" - }, - { - "action": "Read the `adata` and `adata_aggr` objects from the H5AD files 'mouse_brain_adata.h5ad' and 'mouse_brain_adata_aggr.h5ad', respectively.", - "code": "adata=ov.read('data/mouse_brain_adata.h5ad')\nadata_aggr=ov.read('data/mouse_brain_adata_aggr.h5ad')" - }, - { - "action": "Construct an STT object using the `ov.space.STT` class and load the previously saved `adata` and `adata_aggr` objects into it.", - "code": "STT_obj=ov.space.STT(adata,spatial_loc='xy_loc',region='Region')\nSTT_obj.load(adata,adata_aggr)" - }, - { - "action": "Display the 'r2_test' values from the `adata.var` DataFrame, sorted in descending order. These values represent genes with high multistability scores.", - "code": "adata.var['r2_test'].sort_values(ascending=False)" - }, - { - "action": "Plot the top 6 genes with the highest multistability scores using the `plot_top_genes` method. The plot is displayed with 2 columns and a figure size of 8x8.", - "code": "STT_obj.plot_top_genes(top_genes = 6, ncols = 2, figsize = (8,8),)" - }, - { - "action": "Create a 1x4 subplot and visualize the expression of the 'Sim1' gene in different layers ('Ms', 'Mu', 'velo') and the raw expression. Each subplot displays the 'Sim1' expression on the 'xy_loc' basis using the 'RdBu_r' colormap.", - "code": "import matplotlib.pyplot as plt\nfig, axes = plt.subplots(1, 4, figsize=(12, 3))\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:Ms',show=False,\n layer='Ms',cmap='RdBu_r',ax=axes[0]\n )\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:Mu',show=False,\n layer='Mu',cmap='RdBu_r',ax=axes[1]\n )\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:Velo',show=False,\n layer='velo',cmap='RdBu_r',ax=axes[2]\n )\nov.pl.embedding(adata, basis=\"xy_loc\", \n color=[\"Sim1\"],frameon='small',\n title='Sim1:exp',show=False,\n #layer='Mu',\n cmap='RdBu_r',ax=axes[3]\n )\nplt.tight_layout()" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_tcga.json b/rag_engine/ovrawmjson/t_tcga.json deleted file mode 100644 index c192158a..00000000 --- a/rag_engine/ovrawmjson/t_tcga.json +++ /dev/null @@ -1,38 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.plot_set()" - }, - { - "action": "Initialize a TCGA object using the provided sample sheet, download files, and clinical cart paths. Then, initialize the AnnData object to store the raw count, FPKM, and TPM matrices.", - "code": "get_ipython().run_cell_magic('time', '', \"gdc_sample_sheep='data/TCGA_OV/gdc_sample_sheet.2024-07-05.tsv'\\ngdc_download_files='data/TCGA_OV/gdc_download_20240705_180129.081531'\\nclinical_cart='data/TCGA_OV/clinical.cart.2024-07-05'\\naml_tcga=ov.bulk.pyTCGA(gdc_sample_sheep,gdc_download_files,clinical_cart)\\naml_tcga.adata_init()\\n\")" - }, - { - "action": "Save the AnnData object to an H5AD file for later use.", - "code": "aml_tcga.adata.write_h5ad('data/TCGA_OV/ov_tcga_raw.h5ad',compression='gzip')" - }, - { - "action": "Initialize a TCGA object and read the previously saved AnnData file. This step is necessary to ensure that subsequent TCGA functions, such as survival analysis, can be used properly.", - "code": "gdc_sample_sheep='data/TCGA_OV/gdc_sample_sheet.2024-07-05.tsv'\ngdc_download_files='data/TCGA_OV/gdc_download_20240705_180129.081531'\nclinical_cart='data/TCGA_OV/clinical.cart.2024-07-05'\naml_tcga=ov.bulk.pyTCGA(gdc_sample_sheep,gdc_download_files,clinical_cart)\naml_tcga.adata_read('data/TCGA_OV/ov_tcga_raw.h5ad')" - }, - { - "action": "Initialize the metadata for the AnnData object. This involves converting gene IDs to gene names and adding basic patient information.", - "code": "aml_tcga.adata_meta_init()" - }, - { - "action": "Initialize the survival data for the TCGA object. This step imports the clinical information from the previously set clinical cart path.", - "code": "aml_tcga.survial_init()\naml_tcga.adata" - }, - { - "action": "Perform survival analysis for the gene 'MYC' using the 'deseq_normalize' layer and generate a survival plot.", - "code": "aml_tcga.survival_analysis('MYC',layer='deseq_normalize',plot=True)" - }, - { - "action": "Perform survival analysis for all genes in the dataset. This process may take a significant amount of time.", - "code": "aml_tcga.survial_analysis_all()\naml_tcga.adata" - }, - { - "action": "Save the updated AnnData object, which now includes the results of the survival analysis, to an H5AD file.", - "code": "aml_tcga.adata.write_h5ad('data/TCGA_OV/ov_tcga_survial_all.h5ad',compression='gzip')" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_tosica.json b/rag_engine/ovrawmjson/t_tosica.json deleted file mode 100644 index c8fea34b..00000000 --- a/rag_engine/ovrawmjson/t_tosica.json +++ /dev/null @@ -1,86 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse and scanpy. Set plotting parameters using `ov.utils.ov_plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\nov.utils.ov_plot_set()" - }, - { - "action": "Load the reference AnnData object from 'demo_train.h5ad' and print its shape and cell type counts.", - "code": "ref_adata = sc.read('demo_train.h5ad')\nref_adata = ref_adata[:,ref_adata.var_names]\nprint(ref_adata)\nprint(ref_adata.obs.Celltype.value_counts())" - }, - { - "action": "Load the query AnnData object from 'demo_test.h5ad', subset it to the same genes as the reference data, and print its shape and cell type counts.", - "code": "query_adata = sc.read('demo_test.h5ad')\nquery_adata = query_adata[:,ref_adata.var_names]\nprint(query_adata)\nprint(query_adata.obs.Celltype.value_counts())" - }, - { - "action": "Make variable names unique and select the common genes between the reference and query datasets.", - "code": "ref_adata.var_names_make_unique()\nquery_adata.var_names_make_unique()\nret_gene=list(set(query_adata.var_names) & set(ref_adata.var_names))\nlen(ret_gene)" - }, - { - "action": "Subset both reference and query datasets to the common genes.", - "code": "query_adata=query_adata[:,ret_gene]\nref_adata=ref_adata[:,ret_gene]" - }, - { - "action": "Print the maximum values of the expression matrices in the reference and query datasets to confirm that they have been normalized and log-transformed.", - "code": "print(f\"The max of ref_adata is {ref_adata.X.max()}, query_data is {query_adata.X.max()}\",)" - }, - { - "action": "Download the TOSICA gene sets (GMT files) using `ov.utils.download_tosica_gmt()`. These gene sets will be used as pathways for the TOSICA model.", - "code": "ov.utils.download_tosica_gmt()" - }, - { - "action": "Initialize the TOSICA model using the `ov.single.pyTOSICA` class. Set the reference AnnData object, the path to the GMT file, the depth of the transformer model, the key for cell type labels, the project path for saving the model, and the batch size.", - "code": "tosica_obj=ov.single.pyTOSICA(adata=ref_adata,\n gmt_path='genesets/GO_bp.gmt', depth=1,\n label_name='Celltype',\n project_path='hGOBP_demo',\n batch_size=8)" - }, - { - "action": "Train the TOSICA model using the `train` method. Set the number of epochs.", - "code": "tosica_obj.train(epochs=5)" - }, - { - "action": "Save the trained TOSICA model to the specified project path.", - "code": "tosica_obj.save()" - }, - { - "action": "Load the saved TOSICA model from the project path.", - "code": "tosica_obj.load()" - }, - { - "action": "Predict cell types in the query dataset using the trained TOSICA model and the `predicted` method. The predicted cell types and associated information are stored in a new AnnData object.", - "code": "new_adata=tosica_obj.predicted(pre_adata=query_adata)" - }, - { - "action": "Preprocess the query dataset by scaling the data, performing PCA, computing a neighborhood graph, and reducing the dimensionality using MDE.", - "code": "ov.pp.scale(query_adata)\nov.pp.pca(query_adata,layer='scaled',n_pcs=50)\nsc.pp.neighbors(query_adata, n_neighbors=15, n_pcs=50,\n use_rep='scaled|original|X_pca')\nquery_adata.obsm[\"X_mde\"] = ov.utils.mde(query_adata.obsm[\"scaled|original|X_pca\"])\nquery_adata" - }, - { - "action": "Copy the low-dimensional embeddings and neighborhood graph from the query dataset to the new AnnData object containing the predicted cell types.", - "code": "new_adata.obsm=query_adata[new_adata.obs.index].obsm.copy()\nnew_adata.obsp=query_adata[new_adata.obs.index].obsp.copy()\nnew_adata" - }, - { - "action": "Set the colors for the predicted and original cell types in the new AnnData object for visualization purposes.", - "code": "import numpy as np\ncol = np.array([\n\"#98DF8A\",\"#E41A1C\" ,\"#377EB8\", \"#4DAF4A\" ,\"#984EA3\" ,\"#FF7F00\" ,\"#FFFF33\" ,\"#A65628\" ,\"#F781BF\" ,\"#999999\",\"#1F77B4\",\"#FF7F0E\",\"#279E68\",\"#FF9896\"\n]).astype('', color='gray'),size=12)\n\n#Set the title\nplt.title('Venn4',fontsize=13)\n\n#save figure\nfig.savefig(\"figures/bulk_venn4.png\",dpi=300,bbox_inches = 'tight')" - }, - { - "action": "Create another Venn diagram with three sets and a different color palette.", - "code": "fig,ax=plt.subplots(figsize = (4,4))\n#dict of sets\nsets = {\n 'Set1:name': {1,2,3},\n 'Set2': {1,2,3,4},\n 'Set3': {3,4},\n}\n \nov.pl.venn(sets=sets,ax=ax,fontsize=5.5,\n palette=ov.pl.red_color)\n\nplt.title('Venn3',fontsize=13)" - }, - { - "action": "Read differentially expressed genes (DEGs) result from a CSV file.", - "code": "result=ov.read('data/dds_result.csv',index_col=0)\nresult.head()" - }, - { - "action": "Create a volcano plot to visualize DEGs.", - "code": "ov.pl.volcano(result,pval_name='qvalue',fc_name='log2FoldChange',\n pval_threshold=0.05,fc_max=1.5,fc_min=-1.5,\n pval_max=10,FC_max=10,\n figsize=(4,4),title='DEGs in Bulk',titlefont={'weight':'normal','size':14,},\n up_color='#e25d5d',down_color='#7388c1',normal_color='#d7d7d7',\n up_fontcolor='#e25d5d',down_fontcolor='#7388c1',normal_fontcolor='#d7d7d7',\n legend_bbox=(0.8, -0.2),legend_ncol=2,legend_fontsize=12,\n plot_genes=None,plot_genes_num=10,plot_genes_fontsize=11,\n ticks_fontsize=12,)" - }, - { - "action": "Load the 'tips' dataset from seaborn for box plot visualization.", - "code": "import seaborn as sns\ndata = sns.load_dataset(\"tips\")\ndata.head()" - }, - { - "action": "Create a box plot to compare total bill amounts across different days, separated by sex, and add a p-value annotation.", - "code": "fig,ax=ov.pl.boxplot(data,hue='sex',x_value='day',y_value='total_bill',\n palette=ov.pl.red_color,\n figsize=(4,2),fontsize=12,title='Tips',)\n\nov.pl.add_palue(ax,line_x1=-0.5,line_x2=0.5,line_y=40,\n text_y=0.2,\n text='$p={}$'.format(round(0.001,3)),\n fontsize=11,fontcolor='#000000',\n horizontalalignment='center',)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_visualize_colorsystem.json b/rag_engine/ovrawmjson/t_visualize_colorsystem.json deleted file mode 100644 index e5c6188b..00000000 --- a/rag_engine/ovrawmjson/t_visualize_colorsystem.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\n#import scvelo as scv\nov.plot_set()" - }, - { - "action": "Read single-cell RNA-seq data from a file named '10X43_1.h5ad' located in the 'data/DentateGyrus' directory and store it in the 'adata' variable.", - "code": "adata = ov.read('data/DentateGyrus/10X43_1.h5ad')\nadata" - }, - { - "action": "Create an instance of the ForbiddenCity class from the omicverse plotting module to visualize the color system.", - "code": "fb=ov.pl.ForbiddenCity()" - }, - { - "action": "Generate an HTML visualization of the Forbidden City color palette, displaying colors in a grid with 24 colors per row, covering the entire range of 384 colors.", - "code": "from IPython.display import HTML\nHTML(fb.visual_color(loc_range=(0,384),\n num_per_row=24))" - }, - { - "action": "Retrieve the color named '凝夜紫' from the Forbidden City color palette.", - "code": "fb.get_color(name='凝夜紫')" - }, - { - "action": "Create a subplot with 1 row and 3 columns, each with a figure size of 9x3 inches. Then, generate three UMAP embeddings of the 'adata' object with different color palettes: 'fb.red[:]', 'fb.pink1[:]', and a combination of 'fb.red1[:4]' and 'fb.blue1'. The embeddings are displayed without legends and with small frames.", - "code": "import matplotlib.pyplot as plt\nfig, axes = plt.subplots(1,3,figsize=(9,3)) \nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=fb.red[:],\n ncols=3,\n show=False,\n legend_loc=None,\n ax=axes[0])\n\nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=fb.pink1[:],\n ncols=3,show=False,\n legend_loc=None,\n ax=axes[1])\n\nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=fb.red1[:4]+fb.blue1,\n ncols=3,show=False,\n ax=axes[2])" - }, - { - "action": "Define a dictionary 'color_dict' that maps cell type names to specific hexadecimal color codes. Then, generate a UMAP embedding of the 'adata' object, coloring the cells based on their cluster assignment according to the 'color_dict'. The embedding is displayed without a legend and with a small frame.", - "code": "color_dict={'Astrocytes': '#e40414',\n 'Cajal Retzius': '#ec5414',\n 'Cck-Tox': '#ec4c2c',\n 'Endothelial': '#d42c24',\n 'GABA': '#2c5ca4',\n 'Granule immature': '#acd4ec',\n 'Granule mature': '#a4bcdc',\n 'Microglia': '#8caccc',\n 'Mossy': '#8cacdc',\n 'Neuroblast': '#6c9cc4',\n 'OL': '#6c94cc',\n 'OPC': '#5c74bc',\n 'Radial Glia-like': '#4c94c4',\n 'nIPC': '#3474ac'}\n\nov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"clusters\"],\n palette=color_dict,\n ncols=3,show=False,\n )" - }, - { - "action": "Define a list of colors using RGB values obtained from the 'get_color_rgb' method of the 'fb' object for the colors '群青', '半见', and '丹罽'. Then, create a segmented colormap using these colors.", - "code": "colors=[\n fb.get_color_rgb('群青'),\n fb.get_color_rgb('半见'),\n fb.get_color_rgb('丹罽'),\n]\nfb.get_cmap_seg(colors)" - }, - { - "action": "Define a list of colors using RGB values obtained from the 'get_color_rgb' method of the 'fb' object for the colors '群青', '山矾', and '丹罽'. Then, create a segmented colormap using these colors.", - "code": "colors=[\n fb.get_color_rgb('群青'),\n fb.get_color_rgb('山矾'),\n fb.get_color_rgb('丹罽'),\n]\nfb.get_cmap_seg(colors)" - }, - { - "action": "Define a list of colors using RGB values obtained from the 'get_color_rgb' method of the 'fb' object for the colors '山矾' and '丹罽'. Then, create a segmented colormap using these colors.", - "code": "colors=[\n fb.get_color_rgb('山矾'),\n fb.get_color_rgb('丹罽'),\n]\nfb.get_cmap_seg(colors)" - }, - { - "action": "Generate a UMAP embedding of the 'adata' object, coloring the cells based on the expression levels of the gene 'Sox7'. The colormap used is a segmented colormap created from the 'colors' list. The embedding is displayed with a small frame and without a legend.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n frameon='small',\n color=[\"Sox7\"],\n cmap=fb.get_cmap_seg(colors),\n ncols=3,show=False,\n #vmin=-1,vmax=1\n )" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_visualize_single.json b/rag_engine/ovrawmjson/t_visualize_single.json deleted file mode 100644 index 9cdeb180..00000000 --- a/rag_engine/ovrawmjson/t_visualize_single.json +++ /dev/null @@ -1,90 +0,0 @@ -[ - { - "action": "Import necessary libraries: omicverse, scanpy. Set plotting parameters using `ov.plot_set()`.", - "code": "import omicverse as ov\nimport scanpy as sc\n#import scvelo as scv\nov.plot_set()" - }, - { - "action": "Read single-cell RNA-seq data from a file named '10X43_1.h5ad' located in the 'data/DentateGyrus/' directory and store it in the `adata` variable.", - "code": "adata = ov.read('data/DentateGyrus/10X43_1.h5ad')" - }, - { - "action": "Optimize color mapping for the 'clusters' variable in the AnnData object `adata` based on the 'X_umap' embedding using `ov.pl.optim_palette`.", - "code": "optim_palette=ov.pl.optim_palette(adata,basis='X_umap',colors='clusters')" - }, - { - "action": "Create an embedding plot of the `adata` object, coloring cells by 'clusters' using the optimized palette, and display the plot with a title.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (4,4))\nov.pl.embedding(adata,\n basis='X_umap',\n color='clusters',\n frameon='small',\n show=False,\n palette=optim_palette,\n ax=ax,)\nplt.title('Cell Type of DentateGyrus',fontsize=15)" - }, - { - "action": "Create an embedding plot of the `adata` object, coloring cells by 'age(days)' and display the plot.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color='age(days)',\n frameon='small',\n show=False,)" - }, - { - "action": "Create a stacked histogram of cell proportions, grouped by 'age(days)' and colored by 'clusters', with a legend.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (1,4))\nov.pl.cellproportion(adata=adata,celltype_clusters='clusters',\n groupby='age(days)',legend=True,ax=ax)" - }, - { - "action": "Create a stacked histogram of cell proportions for specific cell types ('nIPC', 'Granule immature', 'Granule mature'), grouped by 'clusters' and colored by 'age(days)', with a legend.", - "code": "fig,ax=plt.subplots(figsize = (2,2))\nov.pl.cellproportion(adata=adata,celltype_clusters='age(days)',\n groupby='clusters',groupby_li=['nIPC','Granule immature','Granule mature'],\n legend=True,ax=ax)" - }, - { - "action": "Create a stacked area graph showing the changes in cell types ('nIPC', 'Granule immature', 'Granule mature') across different groups defined by 'clusters', colored by 'age(days)', with a legend.", - "code": "fig,ax=plt.subplots(figsize = (2,2))\nov.pl.cellstackarea(adata=adata,celltype_clusters='age(days)',\n groupby='clusters',groupby_li=['nIPC','Granule immature','Granule mature'],\n legend=True,ax=ax)" - }, - { - "action": "Create an embedding plot with cell type proportions, using 'X_umap' as the basis and 'clusters' as the cell type key.", - "code": "ov.pl.embedding_celltype(adata,figsize=(7,4),basis='X_umap',\n celltype_key='clusters',\n title=' Cell type',\n celltype_range=(1,10),\n embedding_range=(4,10),)" - }, - { - "action": "Create an embedding plot and highlight the 'Granule mature' cell type with a convex hull.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_umap',\n color=['clusters'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.ConvexHull(adata,\n basis='X_umap',\n cluster_key='clusters',\n hull_cluster='Granule mature',\n ax=ax)" - }, - { - "action": "Create an embedding plot and highlight the 'Granule immature' and 'Granule mature' cell types with contours.", - "code": "import matplotlib.pyplot as plt\nfig,ax=plt.subplots(figsize = (4,4))\n\nov.pl.embedding(adata,\n basis='X_umap',\n color=['clusters'],\n frameon='small',\n show=False,\n ax=ax)\n\nov.pl.contour(ax=ax,adata=adata,groupby='clusters',clusters=['Granule immature','Granule mature'],\n basis='X_umap',contour_threshold=0.1,colors='#000000',\n linestyles='dashed',)" - }, - { - "action": "Create an embedding plot with adjusted legend to prevent masking, excluding the 'OL' cell type.", - "code": "from matplotlib import patheffects\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots(figsize=(4,4))\n\nov.pl.embedding(adata,\n basis='X_umap',\n color=['clusters'],\n show=False, legend_loc=None, add_outline=False, \n frameon='small',legend_fontoutline=2,ax=ax\n )\n\nov.pl.embedding_adjust(\n adata,\n groupby='clusters',\n exclude=(\"OL\",), \n basis='X_umap',\n ax=ax,\n adjust_kwargs=dict(arrowprops=dict(arrowstyle='-', color='black')),\n text_kwargs=dict(fontsize=12 ,weight='bold',\n path_effects=[patheffects.withStroke(linewidth=2, foreground='w')] ),\n)" - }, - { - "action": "Create an embedding plot showing the density distribution of the 'Granule mature' cell type.", - "code": "ov.pl.embedding_density(adata,\n basis='X_umap',\n groupby='clusters',\n target_clusters='Granule mature',\n frameon='small',\n show=False,cmap='RdBu_r',alpha=0.8)" - }, - { - "action": "Calculate the AUCell score for the 'Sox' gene set and add it to the AnnData object.", - "code": "ov.single.geneset_aucell(adata,\n geneset_name='Sox',\n geneset=['Sox17', 'Sox4', 'Sox7', 'Sox18', 'Sox5'])" - }, - { - "action": "Create an embedding plot colored by the expression of the 'Sox4' gene.", - "code": "ov.pl.embedding(adata,\n basis='X_umap',\n color=['Sox4'],\n frameon='small',\n show=False,)" - }, - { - "action": "Create a violin plot of 'Sox4' expression grouped by 'clusters'.", - "code": "ov.pl.violin(adata,keys='Sox4',groupby='clusters',figsize=(6,3))" - }, - { - "action": "Create a bar-dot plot of 'Sox_aucell' grouped by 'clusters' and add a p-value annotation.", - "code": "fig, ax = plt.subplots(figsize=(6,2))\nov.pl.bardotplot(adata,groupby='clusters',color='Sox_aucell',figsize=(6,2),\n ax=ax,\n ylabel='Expression',\n bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4},\n scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'})\n\nov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=0.1,\n text_y=0.02,\n text='$p={}$'.format(round(0.001,3)),\n fontsize=11,fontcolor='#000000',\n horizontalalignment='center',)" - }, - { - "action": "Create a bar-dot plot of 'Sox17' expression grouped by 'clusters' and add a p-value annotation.", - "code": "fig, ax = plt.subplots(figsize=(6,2))\nov.pl.bardotplot(adata,groupby='clusters',color='Sox17',figsize=(6,2),\n ax=ax,\n ylabel='Expression',xlabel='Cell Type',\n bar_kwargs={'alpha':0.5,'linewidth':2,'width':0.6,'capsize':4},\n scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'})\n\nov.pl.add_palue(ax,line_x1=3,line_x2=4,line_y=2,\n text_y=0.2,\n text='$p={}$'.format(round(0.001,3)),\n fontsize=11,fontcolor='#000000',\n horizontalalignment='center',)" - }, - { - "action": "Create a box plot with jitter points for 'Sox_aucell' expression grouped by 'clusters', with Kruskal-Wallis test results and customized appearance.", - "code": "import pandas as pd\nimport seaborn as sns\n#sns.set_style('white')\n\nov.pl.single_group_boxplot(adata,groupby='clusters',\n color='Sox_aucell',\n type_color_dict=dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])),\n x_ticks_plot=True,\n figsize=(5,2),\n kruskal_test=True,\n ylabel='Sox_aucell',\n legend_plot=False,\n bbox_to_anchor=(1,1),\n title='Expression',\n scatter_kwargs={'alpha':0.8,'s':10,'marker':'o'},\n point_number=15,\n sort=False,\n save=False,\n )\nplt.grid(False)\nplt.xticks(rotation=90,fontsize=12)" - }, - { - "action": "Define a dictionary of marker genes for the 'Sox' cell type and create a complex heatmap of gene expression grouped by 'clusters'.", - "code": "import pandas as pd\nmarker_genes_dict = {\n 'Sox':['Sox4', 'Sox7', 'Sox18', 'Sox5'],\n}\n\ncolor_dict = {'Sox':'#EFF3D8',}\n\ngene_color_dict = {}\ngene_color_dict_black = {}\nfor cell_type, genes in marker_genes_dict.items():\n cell_type_color = color_dict.get(cell_type)\n for gene in genes:\n gene_color_dict[gene] = cell_type_color\n gene_color_dict_black[gene] = '#000000'\n\ncm = ov.pl.complexheatmap(adata,\n groupby ='clusters',\n figsize =(5,2),\n layer = None,\n use_raw = False,\n standard_scale = 'var',\n col_color_bars = dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])),\n col_color_labels = dict(zip(pd.Categorical(adata.obs['clusters']).categories, adata.uns['clusters_colors'])),\n left_color_bars = color_dict,\n left_color_labels = None,\n right_color_bars = color_dict,\n right_color_labels = gene_color_dict_black,\n marker_genes_dict = marker_genes_dict,\n cmap = 'coolwarm', #parula,jet\n legend_gap = 15,\n legend_hpad = 0,\n left_add_text = True,\n col_split_gap = 2,\n row_split_gap = 1,\n col_height = 6,\n left_height = 4,\n right_height = 6,\n col_split = None,\n row_cluster = False,\n col_cluster = False,\n value_name='Gene',\n xlabel = \"Expression of selected genes\",\n label = 'Gene Expression',\n save = True,\n show = False,\n legend = False,\n plot_legend = False,\n #save_pathway = \"complexheatmap.png\",\n )" - }, - { - "action": "Preprocess the AnnData object and define a dictionary of marker genes for different cell types.", - "code": "adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=2000,)\n\nmarker_genes_dict = {'Granule immature': ['Sepw1', 'Camk2b', 'Cnih2'],\n 'Radial Glia-like': ['Dbi', 'Fabp7', 'Aldoc'],\n 'Granule mature': ['Malat1', 'Rasl10a', 'Ppp3ca'],\n 'Neuroblast': ['Igfbpl1', 'Tubb2b', 'Tubb5'],\n 'Microglia': ['Lgmn', 'C1qa', 'C1qb'],\n 'Cajal Retzius': ['Diablo', 'Ramp1', 'Stmn1'],\n 'OPC': ['Olig1', 'C1ql1', 'Pllp'],\n 'Cck-Tox': ['Tshz2', 'Cck', 'Nap1l5'],\n 'GABA': ['Gad2', 'Gad1', 'Snhg11'],\n 'Endothelial': ['Sparc', 'Myl12a', 'Itm2a'],\n 'Astrocytes': ['Apoe', 'Atp1a2'],\n 'OL': ['Plp1', 'Mog', 'Mag'],\n 'Mossy': ['Arhgdig', 'Camk4'],\n 'nIPC': ['Hmgn2', 'Ptma', 'H2afz']}" - }, - { - "action": "Create a marker gene heatmap using the defined marker genes dictionary and customize its appearance.", - "code": "ov.pl.marker_heatmap(\n adata,\n marker_genes_dict,\n groupby='clusters',\n color_map=\"RdBu_r\",\n use_raw=False,\n standard_scale=\"var\",\n expression_cutoff=0.0,\n fontsize=12,\n bbox_to_anchor=(7, -2),\n figsize=(8.5,4),\n spines=False,\n show_rownames=False,\n show_colnames=True,\n)" - } -] \ No newline at end of file diff --git a/rag_engine/ovrawmjson/t_wgcna.json b/rag_engine/ovrawmjson/t_wgcna.json deleted file mode 100644 index 21f381af..00000000 --- a/rag_engine/ovrawmjson/t_wgcna.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "action": "Import necessary libraries: scanpy, omicverse, and matplotlib. Set plotting parameters using `ov.plot_set()`.", - "code": "import scanpy as sc\nimport omicverse as ov\nimport matplotlib.pyplot as plt\nov.plot_set()" - }, - { - "action": "Load the expression data from a CSV file into a pandas DataFrame. The data is from the 5xFAD paper and is part of the PyWGCNA tutorial data.", - "code": "import pandas as pd\ndata=ov.utils.read('data/5xFAD_paper/expressionList.csv',\n index_col=0)\ndata.head()" - }, - { - "action": "Calculate the Median Absolute Deviation (MAD) for each gene in the expression data. Then, select the top 2000 genes with the highest MAD values and transpose the DataFrame.", - "code": "from statsmodels import robust #import package\ngene_mad=data.apply(robust.mad) #use function to calculate MAD\ndata=data.T\ndata=data.loc[gene_mad.sort_values(ascending=False).index[:2000]]\ndata.head()" - }, - { - "action": "Initialize a PyWGCNA object named '5xFAD_2k' for bulk RNA-seq analysis. The object is configured for mouse data and uses the transposed expression data. The results will be saved to the specified output path.", - "code": "#import PyWGCNA\npyWGCNA_5xFAD = ov.bulk.pyWGCNA(name='5xFAD_2k', \n species='mus musculus', \n geneExp=data.T, \n outputPath='',\n save=True)\npyWGCNA_5xFAD.geneExpr.to_df().head(5)" - }, - { - "action": "Preprocess the expression data using the `preprocess()` method of the PyWGCNA object. This step includes removing genes with too many missing values or low expression and removing samples with too many missing values.", - "code": "pyWGCNA_5xFAD.preprocess()" - }, - { - "action": "Calculate the soft-thresholding power for network construction using the `calculate_soft_threshold()` method.", - "code": "pyWGCNA_5xFAD.calculate_soft_threshold()" - }, - { - "action": "Calculate the adjacency matrix based on the selected soft-thresholding power using the `calculating_adjacency_matrix()` method.", - "code": "pyWGCNA_5xFAD.calculating_adjacency_matrix()" - }, - { - "action": "Calculate the Topological Overlap Matrix (TOM) similarity matrix using the `calculating_TOM_similarity_matrix()` method.", - "code": "pyWGCNA_5xFAD.calculating_TOM_similarity_matrix()" - }, - { - "action": "Calculate the gene tree, dynamic modules, and gene-module relationships. The `calculate_geneTree()` method computes the gene dendrogram. The `calculate_dynamicMods()` method identifies modules using dynamic tree cutting with specified parameters. The `calculate_gene_module()` method calculates module eigengenes with the chosen soft power.", - "code": "pyWGCNA_5xFAD.calculate_geneTree()\npyWGCNA_5xFAD.calculate_dynamicMods(kwargs_function={'cutreeHybrid': {'deepSplit': 2, 'pamRespectsDendro': False}})\npyWGCNA_5xFAD.calculate_gene_module(kwargs_function={'moduleEigengenes': {'softPower': 8}})" - }, - { - "action": "Plot the TOM matrix using the `plot_matrix()` method. The plot visualizes the relationships between genes based on topological overlap.", - "code": "pyWGCNA_5xFAD.plot_matrix(save=False)" - }, - { - "action": "Save the current state of the PyWGCNA object using the `saveWGCNA()` method. This allows for later retrieval of the object and its associated data.", - "code": "pyWGCNA_5xFAD.saveWGCNA()" - }, - { - "action": "Load a previously saved PyWGCNA object from a file named '5xFAD_2k.p' using the `ov.bulk.readWGCNA()` function.", - "code": "pyWGCNA_5xFAD=ov.bulk.readWGCNA('5xFAD_2k.p')" - }, - { - "action": "Display the first few rows of the `mol` attribute of the PyWGCNA object, which likely contains module information.", - "code": "pyWGCNA_5xFAD.mol.head()" - }, - { - "action": "Display the first few rows of the `datExpr.var` attribute of the PyWGCNA object. This likely contains variable information related to the expression data.", - "code": "pyWGCNA_5xFAD.datExpr.var.head()" - }, - { - "action": "Extract a subset of modules ('gold' and 'lightgreen') from the PyWGCNA object using the `get_sub_module()` method. The `mod_type` parameter specifies that the selection is based on module colors.", - "code": "sub_mol=pyWGCNA_5xFAD.get_sub_module(['gold','lightgreen'],\n mod_type='module_color')\nsub_mol.head(),sub_mol.shape" - }, - { - "action": "Extract a subnetwork from the PyWGCNA object corresponding to the 'lightgreen' module. The `get_sub_network()` method is used with a specified correlation threshold of 0.2.", - "code": "G_sub=pyWGCNA_5xFAD.get_sub_network(mod_list=['lightgreen'],\n mod_type='module_color',correlation_threshold=0.2)\nG_sub" - }, - { - "action": "Count the number of edges in the extracted subnetwork `G_sub`.", - "code": "len(G_sub.edges())" - }, - { - "action": "Visualize the subnetwork for the 'gold' and 'lightgreen' modules using the `plot_sub_network()` method. The plot uses the 'kamada_kawai' layout algorithm and includes specific styling options.", - "code": "pyWGCNA_5xFAD.plot_sub_network(['gold','lightgreen'],pos_type='kamada_kawai',pos_scale=10,pos_dim=2,\n figsize=(8,8),node_size=10,label_fontsize=8,correlation_threshold=0.2,\n label_bbox={\"ec\": \"white\", \"fc\": \"white\", \"alpha\": 0.6})" - }, - { - "action": "Update the sample information of the PyWGCNA object with data from a CSV file. Additionally, assign colors to different categories within the metadata for downstream analysis.", - "code": "pyWGCNA_5xFAD.updateSampleInfo(path='data/5xFAD_paper/sampleInfo.csv', sep=',')\n\n# add color for metadata\npyWGCNA_5xFAD.setMetadataColor('Sex', {'Female': 'green',\n 'Male': 'yellow'})\npyWGCNA_5xFAD.setMetadataColor('Genotype', {'5xFADWT': 'darkviolet',\n '5xFADHEMI': 'deeppink'})\npyWGCNA_5xFAD.setMetadataColor('Age', {'4mon': 'thistle',\n '8mon': 'plum',\n '12mon': 'violet',\n '18mon': 'purple'})\npyWGCNA_5xFAD.setMetadataColor('Tissue', {'Hippocampus': 'red',\n 'Cortex': 'blue'})" - }, - { - "action": "Perform a comprehensive analysis of the PyWGCNA object using the `analyseWGCNA()` method. This includes quantifying module-trait relationships and identifying important genes.", - "code": "pyWGCNA_5xFAD.analyseWGCNA()" - }, - { - "action": "Retrieve the column names from the observation data (`datExpr.obs`) of the PyWGCNA object, which represent the metadata fields.", - "code": "metadata = pyWGCNA_5xFAD.datExpr.obs.columns.tolist()" - }, - { - "action": "Plot the module eigengene for the 'lightgreen' module against the specified metadata using the `plotModuleEigenGene()` method.", - "code": "pyWGCNA_5xFAD.plotModuleEigenGene('lightgreen', metadata, show=True)" - }, - { - "action": "Create a bar plot of the module eigengene for the 'lightgreen' module against the specified metadata using the `barplotModuleEigenGene()` method.", - "code": "pyWGCNA_5xFAD.barplotModuleEigenGene('lightgreen', metadata, show=True)" - }, - { - "action": "Identify the top 10 hub genes for the 'lightgreen' module based on their connectivity using the `top_n_hub_genes()` method.", - "code": "pyWGCNA_5xFAD.top_n_hub_genes(moduleName=\"lightgreen\", n=10)" - } -] \ No newline at end of file diff --git a/rag_engine/rag_system.py b/rag_engine/rag_system.py deleted file mode 100644 index dcd7be30..00000000 --- a/rag_engine/rag_system.py +++ /dev/null @@ -1,596 +0,0 @@ -import logging -import sys -import os -import json -from datetime import datetime, timezone -from typing import Dict, List, Optional -from dataclasses import dataclass, field -from langchain_community.document_loaders import JSONLoader -from langchain_community.vectorstores import Chroma -from langchain_community.embeddings import GPT4AllEmbeddings -from langchain_community.llms import Ollama -from langchain_core.prompts import PromptTemplate -from langchain.callbacks.manager import CallbackManager -from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from langchain.chains import RetrievalQA -from langchain.text_splitter import RecursiveCharacterTextSplitter -import requests -from functools import lru_cache -from concurrent.futures import ThreadPoolExecutor -import asyncio -import time -from prometheus_client import Counter, Histogram, Gauge -import tenacity -import chromadb -from collections import OrderedDict -from logging.handlers import RotatingFileHandler - -# Custom Logger Class -class RAGLogger: - def __init__(self, name): - self.logger = logging.getLogger(name) - self.logger.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - - # Add rotating file handler - handler = RotatingFileHandler('rag_system.log', maxBytes=10485760, backupCount=5) - handler.setFormatter(formatter) - self.logger.addHandler(handler) - - # Add stream handler for console output - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setFormatter(formatter) - self.logger.addHandler(stream_handler) - - def info(self, message): - self.logger.info(message) - - def error(self, message): - self.logger.error(message) - - def warning(self, message): - self.logger.warning(message) - -# Initialize logger -logger = RAGLogger(__name__) - -@dataclass -class PerformanceMetrics: - query_counter: Counter = Counter('rag_queries_total', 'Total number of queries processed') - query_latency: Histogram = Histogram('rag_query_duration_seconds', 'Query processing duration') - cache_hits: Counter = Counter('rag_cache_hits_total', 'Number of cache hits') - model_calls: Dict[str, Counter] = field(default_factory=dict) - memory_usage: Gauge = Gauge('rag_memory_usage_bytes', 'Memory usage in bytes') - request_duration: Histogram = field( - default_factory=lambda: Histogram( - 'rag_request_duration_seconds', - 'Request duration in seconds', - buckets=(0.1, 0.5, 1.0, 2.0, 5.0) - ) - ) - - def record_query(self, duration: float): - self.query_counter.inc() - self.query_latency.observe(duration) - - def record_cache_hit(self): - self.cache_hits.inc() - - def record_model_call(self, model_name: str): - try: - # Sanitize the model name for Prometheus compatibility - sanitized_name = model_name.replace('.', '_').replace(':', '_').replace('-', '_') - metric_name = f'rag_model_calls_{sanitized_name}' - - if model_name not in self.model_calls: - self.model_calls[model_name] = Counter( - metric_name, - f'Number of calls to model {model_name}' - ) - self.model_calls[model_name].inc() - - except ValueError as ve: - logger.error(f"Invalid metric name creation: {str(ve)}") - # Create a fallback metric with a generic name - fallback_name = f"rag_model_calls_model_{len(self.model_calls)}" - self.model_calls[model_name] = Counter( - fallback_name, - f'Number of calls to model (fallback counter)' - ) - self.model_calls[model_name].inc() - except Exception as e: - logger.error(f"Unexpected error in record_model_call: {str(e)}") - # Don't let metric recording failures affect the main application flow - pass - - def record_memory_usage(self): - import psutil - process = psutil.Process(os.getpid()) - self.memory_usage.set(process.memory_info().rss) - - def record_request_time(self, duration: float): - self.request_duration.observe(duration) - -# TTL Cache Class -class TTLCache(OrderedDict): - def __init__(self, maxsize=1000, ttl=3600): - super().__init__() - self.maxsize = maxsize - self.ttl = ttl - - def __getitem__(self, key): - value, timestamp = super().__getitem__(key) - if time.time() - timestamp > self.ttl: - del self[key] - raise KeyError(key) - return value - - def __setitem__(self, key, value): - super().__setitem__(key, (value, time.time())) - if len(self) > self.maxsize: - self.popitem(last=False) - -class RAGSystem: - def __init__(self, json_directory: str, kbi_path: str): - self.json_directory = json_directory - self.kbi_path = kbi_path - self.executor = ThreadPoolExecutor(max_workers=3) - self.cache = TTLCache() - self.ollama_session = requests.Session() - self.metrics = PerformanceMetrics() - self.models = { - 'file_selection': 'qwen2.5-coder:3b', - 'query_processing': 'qwen2.5-coder:7b' - } - - # Add persistent directory - self.persist_directory = os.path.join(os.getcwd(), "chroma_db") - os.makedirs(self.persist_directory, exist_ok=True) - - # Initialize Chroma client settings - self.chroma_settings = chromadb.config.Settings( - anonymized_telemetry=False, - is_persistent=True, - persist_directory=self.persist_directory - ) - - # Initialize Chroma client with connection pooling - self.chroma_client = chromadb.Client(self.chroma_settings) - - # Initialize connection pool for Ollama - self.ollama_session.mount( - 'http://', - requests.adapters.HTTPAdapter( - max_retries=3, - pool_connections=10, - pool_maxsize=10 - ) - ) - - self.kbi_vectorstore = self.create_kbi_vectorstore() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.cleanup() - - @lru_cache(maxsize=100) - def get_file_embeddings(self, file_path): - """Cache embeddings for frequently accessed files""" - try: - with open(file_path, 'r') as file: - file_data = [{"content": file.read(), "source": file_path}] - - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50 - ) - - file_splits = text_splitter.create_documents( - texts=[doc["content"] for doc in file_data], - metadatas=[{"source": doc["source"]} for doc in file_data] - ) - - embeddings = GPT4AllEmbeddings().embed_documents([doc.page_content for doc in file_splits]) - return embeddings - except Exception as e: - logger.error(f"Error generating embeddings for {file_path}: {str(e)}") - return [] - - def batch_embed_documents(self, documents, batch_size=32): - """Generate embeddings in batches""" - embeddings = [] - for i in range(0, len(documents), batch_size): - batch = documents[i:i + batch_size] - batch_embeddings = self.get_file_embeddings(batch) - embeddings.extend(batch_embeddings) - return embeddings - - async def batch_process_queries(self, queries): - """Process multiple queries in parallel""" - tasks = [self.process_query(q) for q in queries] - return await asyncio.gather(*tasks) - - def check_ollama_status(self): - """Check if Ollama is running and required models are available""" - try: - # Check if Ollama server is running - response = requests.get("http://localhost:11434/api/tags", timeout=5) - if response.status_code != 200: - return False, "Ollama server is not running" - - # Check for required models - models = response.json().get("models", []) - required_models = list(self.models.values()) - logger.info(f"Available models: {[m.get('name', '') for m in models]}") - logger.info(f"Required models: {required_models}") - - missing_models = [model for model in required_models - if not any(m.get("name") == model for m in models)] - - if missing_models: - return False, f"Missing required models: {', '.join(missing_models)}" - - return True, "Ollama is ready" - except requests.ConnectionError: - return False, "Cannot connect to Ollama server" - except requests.exceptions.Timeout: - return False, "Ollama server connection timed out" - except Exception as e: - return False, f"An unexpected error occurred: {str(e)}" - - def validate_json_file(self, file_path): - """Validate a JSON file""" - try: - with open(file_path, 'r') as file: - json.load(file) - logger.info(f"✓ {file_path} is valid JSON") - return True - except json.JSONDecodeError as e: - logger.error(f"Error in file {file_path}: {str(e)}") - return False - except Exception as e: - logger.error(f"Error reading file {file_path}: {str(e)}") - return False - - def check_all_json_files(self): - """Check all JSON files in the directory""" - logger.info(f"Checking JSON files in {self.json_directory}") - all_valid = True - for filename in os.listdir(self.json_directory): - if filename.endswith('.json'): - file_path = os.path.join(self.json_directory, filename) - if not self.validate_json_file(file_path): - all_valid = False - return all_valid - - def create_kbi_vectorstore(self, persistence_dir="./chroma_db"): - try: - # Load and validate KBI data - with open(self.kbi_path, 'r') as file: - kbi_data = json.load(file) - logger.info(f"Successfully loaded KBI data from {self.kbi_path}") - - if not isinstance(kbi_data, dict) or 'files' not in kbi_data: - raise ValueError("Invalid KBI data structure") - - # Process documents - kbi_docs = [] - for file_info in kbi_data.get('files', []): - try: - if not all(key in file_info for key in ['name', 'introduction']): - logger.warning(f"Skipping incomplete file info: {file_info}") - continue - - text = f"File: {file_info['name']}\nIntroduction: {file_info['introduction']}" - kbi_docs.append({"content": text, "source": "KBI.json"}) - except Exception as doc_error: - logger.error(f"Error processing document: {str(doc_error)}") - continue - - if not kbi_docs: - raise ValueError("No valid documents found in KBI data") - - # Create text splitter - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=100 - ) - - # Create splits from the documents - kbi_splits = text_splitter.create_documents( - texts=[doc["content"] for doc in kbi_docs], - metadatas=[{"source": doc["source"]} for doc in kbi_docs] - ) - - if not kbi_splits: - raise ValueError("Text splitting produced no documents") - - # Create vector store - vectorstore = Chroma.from_documents( - documents=kbi_splits, # Now kbi_splits is properly defined - embedding=GPT4AllEmbeddings(), - persist_directory=persistence_dir, - collection_name="kbi_collection", - client=self.chroma_client - ) - - logger.info(f"Successfully created vector store with {len(kbi_splits)} chunks") - return vectorstore - - except FileNotFoundError: - logger.error(f"KBI file not found at {self.kbi_path}") - raise - except json.JSONDecodeError as je: - logger.error(f"Invalid JSON in KBI file: {str(je)}") - raise - except Exception as e: - logger.error(f"Unexpected error in create_kbi_vectorstore: {str(e)}") - raise - - def find_relevant_file(self, query): - """Find the most relevant file for a given query""" - start_time = time.time() - try: - # Check Ollama status first - status, message = self.check_ollama_status() - if not status: - raise Exception(f"Ollama is not ready: {message}") - - if query in self.cache: - self.metrics.record_cache_hit() - cached_result = self.cache[query] - logger.info(f"Cache hit for query: {query}") - return cached_result - - file_template = """Based on the following context and question, determine which JSON file would be most relevant. - Return ONLY the filename, nothing else. - Context: {context} - Question: {question} - Filename:""" - - file_prompt = PromptTemplate( - input_variables=["context", "question"], - template=file_template, - ) - - logger.info(f"Using model {self.models['file_selection']} for file selection") - self.metrics.record_model_call(self.models['file_selection']) - llm = Ollama( - model=self.models['file_selection'], - callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]) - ) - - file_chain = RetrievalQA.from_chain_type( - llm, - retriever=self.kbi_vectorstore.as_retriever(), - chain_type_kwargs={"prompt": file_prompt}, - ) - - result = file_chain({"query": query})["result"].strip() - self.cache[query] = result - logger.info(f"Found relevant file: {result}") - return result - - except Exception as e: - logger.error(f"Error in find_relevant_file: {str(e)}") - raise - finally: - duration = time.time() - start_time - self.metrics.record_request_time(duration) - - def process_query(self, query, relevant_file): - """Process a query using the relevant file""" - start_time = time.time() - try: - # Check Ollama status first - status, message = self.check_ollama_status() - if not status: - raise Exception(f"Ollama is not ready: {message}") - - file_path = os.path.join(self.json_directory, relevant_file) - if not os.path.exists(file_path): - raise FileNotFoundError(f"File {relevant_file} not found") - - with open(file_path, 'r') as file: - file_data = [{"content": file.read(), "source": relevant_file}] - - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50 - ) - - file_splits = text_splitter.create_documents( - texts=[doc["content"] for doc in file_data], - metadatas=[{"source": doc["source"]} for doc in file_data] - ) - - collection_name = f"query_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - file_vectorstore = Chroma.from_documents( - documents=file_splits, - embedding=GPT4AllEmbeddings(), - collection_name=collection_name, - client=self.chroma_client # Pass the client instance - ) - - template = """Use the following context to answer the question about the JSON data. - If you don't know the answer, just say that you don't know, don't try to make up an answer. - You should always using omicverse in python to provide the answer. - You are focus on the code not the specific gene and disease in the JSON file. - Use Step by Step with code and keep the answer as concise as possible. - Context: {context} - Question: {question} - Helpful Answer:""" - - qa_prompt = PromptTemplate( - input_variables=["context", "question"], - template=template, - ) - - logger.info(f"Using model {self.models['query_processing']} for query processing") - self.metrics.record_model_call(self.models['query_processing']) - llm = Ollama( - model=self.models['query_processing'], - callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]) - ) - - qa_chain = RetrievalQA.from_chain_type( - llm, - retriever=file_vectorstore.as_retriever(), - chain_type_kwargs={"prompt": qa_prompt}, - ) - - logger.info("Generating answer...") - result = qa_chain({"query": query}) - - self._cleanup_old_collections() - - return result["result"] - - except Exception as e: - logger.error(f"Error processing query: {str(e)}") - raise - finally: - duration = time.time() - start_time - self.metrics.record_query(duration) - self.metrics.record_memory_usage() - self.metrics.record_request_time(duration) - - def list_json_files(self): - """List all JSON files in the directory""" - return [f for f in os.listdir(self.json_directory) if f.endswith('.json')] - - def get_system_health(self) -> Dict: - """Get system health metrics""" - try: - total_queries = float(self.metrics.query_counter._value.get()) - cache_hits = float(self.metrics.cache_hits._value.get()) - query_latency_sum = float(self.metrics.query_latency.sum._value.get()) - - return { - 'cache_size': len(self.cache), - 'cache_hits': int(cache_hits), - 'total_queries': int(total_queries), - 'avg_latency': round(query_latency_sum / total_queries, 2) if total_queries > 0 else 0.00, - 'model_usage': { - model: int(counter._value.get()) - for model, counter in self.metrics.model_calls.items() - }, - 'memory_usage': self.get_memory_usage(), - 'ollama_status': self.check_ollama_status() - } - except Exception as e: - logger.error(f"Error getting system health metrics: {str(e)}") - return { - 'cache_size': 0, - 'cache_hits': 0, - 'total_queries': 0, - 'avg_latency': 0.00, - 'model_usage': {}, - 'memory_usage': 0, - 'ollama_status': ("Unknown", str(e)) - } - - def get_memory_usage(self): - """Get current memory usage""" - import psutil - process = psutil.Process(os.getpid()) - return process.memory_info().rss - - def get_cache_health(self): - """Get cache health metrics""" - return { - 'size': len(self.cache), - 'maxsize': self.cache.maxsize, - 'ttl': self.cache.ttl, - 'hits': int(self.metrics.cache_hits._value.get()) - } - - def check_vectorstore_health(self): - """Check vector store health""" - try: - self.chroma_client.heartbeat() - return "OK" - except Exception as e: - return f"Error: {str(e)}" - - def get_detailed_health(self): - """Get detailed system health""" - return { - 'system_status': self.check_ollama_status(), - 'cache_status': self.get_cache_health(), - 'vectorstore_status': self.check_vectorstore_health(), - 'memory_usage': self.get_memory_usage() - } - - def _cleanup_old_collections(self): - """Clean up old vector store collections""" - try: - current_time = datetime.now() - collections = self.chroma_client.list_collections() - - for collection in collections: - if collection.name.startswith('query_'): - collection_time_str = collection.name.split('_')[1] - try: - collection_time = datetime.strptime(collection_time_str, '%Y%m%d_%H%M%S') - if (current_time - collection_time).total_seconds() > 3600: # 1 hour - self.chroma_client.delete_collection(collection.name) - logger.info(f"Deleted old collection: {collection.name}") - except ValueError: - logger.warning(f"Failed to parse timestamp from collection name: {collection.name}") - except Exception as e: - logger.error(f"Error in cleanup: {str(e)}") - - @tenacity.retry( - stop=tenacity.stop_after_attempt(3), - wait=tenacity.wait_exponential(multiplier=1, min=4, max=10), - retry=tenacity.retry_if_exception_type(requests.ConnectionError) - ) - def _call_ollama(self, endpoint: str, data: Dict) -> Dict: - """Resilient Ollama API calls with retry logic""" - try: - response = self.ollama_session.post( - f"http://localhost:11434/api/{endpoint}", - json=data, - timeout=30 - ) - response.raise_for_status() - return response.json() - except requests.exceptions.Timeout: - logger.error(f"Ollama API timeout for endpoint {endpoint}") - raise TimeoutError("Ollama API request timed out") - except requests.exceptions.ConnectionError as ce: - logger.error(f"Connection error to Ollama API: {str(ce)}") - raise - except requests.exceptions.RequestException as e: - logger.error(f"Ollama API call failed: {str(e)}") - raise - except json.JSONDecodeError as je: - logger.error(f"Failed to decode Ollama API response: {str(je)}") - raise ValueError("Invalid JSON response from Ollama API") - - def cleanup(self): - """Cleanup method to handle resources properly""" - try: - # Clean up vector stores - if hasattr(self, 'kbi_vectorstore') and self.kbi_vectorstore is not None: - self.kbi_vectorstore._client.reset() - - # Clean up Chroma client - self.chroma_client.reset() - - # Close Ollama session - self.ollama_session.close() - - # Shutdown thread pool executor - self.executor.shutdown() - - # Remove persistent directory - if os.path.exists(self.persist_directory): - import shutil - shutil.rmtree(self.persist_directory, ignore_errors=True) - - except Exception as e: - logger.error(f"Error during cleanup: {str(e)}") \ No newline at end of file diff --git a/rag_engine/requirements.txt b/rag_engine/requirements.txt deleted file mode 100644 index f3155834..00000000 --- a/rag_engine/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -langchain-community -chromadb -gpt4all -ollama -streamlit -python-dotenv -requests \ No newline at end of file