Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shapley plot #126

Open
AnshulVerma01 opened this issue Nov 9, 2024 · 0 comments
Open

Shapley plot #126

AnshulVerma01 opened this issue Nov 9, 2024 · 0 comments

Comments

@AnshulVerma01
Copy link

AnshulVerma01 commented Nov 9, 2024

### This is an inference script that uses a fine-tuned model. Could anybody please help me add a few lines of code so that, at the end, it also generates a Shapley plot?
import os
import json
import argparse
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score
import random

def set_seed(seed: int) -> None:
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then puts cuDNN into deterministic mode
    with autotuning disabled.

    Args:
        seed: Value handed to each generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Documented by PyTorch as silently ignored when CUDA is unavailable,
    # so no availability guard is needed here.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Define the PredictionDataset class

class PredictionDataset(Dataset):
    """Tokenized sequences with integer labels, read from a comma-separated file.

    Every non-blank line of the file must have the sequence in the first
    column and an integer label in the second column. Blank lines are
    skipped. All sequences are tokenized once, up front, padded to the
    longest sequence and truncated to ``tokenizer.model_max_length``.

    Attributes set by ``__init__``:
        texts: The sequences (converted to k-mer strings when ``kmer != -1``).
        true_labels: The integer labels, in file order.
        input_ids / attention_mask: The ``"input_ids"`` and
            ``"attention_mask"`` entries returned by the tokenizer.
    """

    def __init__(self, data_path: str, tokenizer: "transformers.PreTrainedTokenizer", kmer: int = -1):
        """Load, optionally k-mer-ize, and tokenize the data file.

        Args:
            data_path: Path of the comma-separated data file.
            tokenizer: Callable tokenizer exposing ``model_max_length``.
                The annotation is quoted so the class can be defined
                without ``transformers`` being resolved at that moment.
            kmer: k-mer size; ``-1`` leaves the sequences unchanged.

        Raises:
            ValueError: If a non-blank line has no second column, its label
                is not an integer, or the file contains no data lines.
        """
        super().__init__()

        texts: List[str] = []
        labels: List[int] = []
        with open(data_path, "r") as f:
            for line_no, raw_line in enumerate(f, start=1):
                stripped = raw_line.strip()
                if not stripped:
                    # A trailing newline or spacer line is not a record.
                    continue
                fields = stripped.split(",")
                try:
                    label = int(fields[1])  # labels live in the second column
                except (IndexError, ValueError) as err:
                    raise ValueError(
                        f"{data_path}:{line_no}: expected '<sequence>,<integer label>', "
                        f"got {stripped!r}"
                    ) from err
                texts.append(fields[0])
                labels.append(label)

        if not texts:
            raise ValueError(f"{data_path}: no data lines found")

        self.texts = texts
        self.true_labels = labels

        if kmer != -1:
            # Rewrite each sequence as space-separated overlapping k-mers.
            self.texts = [generate_kmer_str(text, kmer) for text in self.texts]

        output = tokenizer(
            self.texts,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )

        self.input_ids = output["input_ids"]
        self.attention_mask = output["attention_mask"]

    def __len__(self) -> int:
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids``, ``attention_mask`` and ``true_label`` of example ``i``."""
        return dict(
            input_ids=self.input_ids[i],
            attention_mask=self.attention_mask[i],
            true_label=torch.tensor(self.true_labels[i]),
        )

def generate_kmer_str(sequence: str, k: int) -> str:
    """Return the overlapping k-mers of ``sequence`` joined by single spaces.

    A window of width ``k`` slides over the sequence one character at a
    time. A sequence shorter than ``k`` yields an empty string.

    Args:
        sequence: Input sequence (e.g. DNA).
        k: Window width; must be at least 1.

    Raises:
        ValueError: If ``k`` is smaller than 1, which would otherwise
            silently produce empty or truncated tokens.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    return " ".join(sequence[i:i + k] for i in range(len(sequence) - k + 1))

def predict(model_path: str, data_path: str, kmer: int):
    """Run a fine-tuned sequence classifier over a labelled file and save metrics.

    Loads the model and tokenizer from ``model_path``, predicts a class for
    every example in ``data_path`` in batches of 8, computes accuracy,
    macro F1, MCC, macro precision and macro recall against the labels in
    the file, and writes predictions plus metrics to
    ``<model_path>/predictions.json``.

    Args:
        model_path: Location of the fine-tuned model; also the directory
            that ``predictions.json`` is written into.
        data_path: Comma-separated data file read by ``PredictionDataset``.
        kmer: k-mer size forwarded to ``PredictionDataset`` (``-1`` disables).
    """
    # Fixed seed so repeated runs are reproducible.
    set_seed(42)

    # NOTE(review): trust_remote_code=True executes Python code shipped with
    # the checkpoint; only point this at checkpoints you trust.
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_path,
        trust_remote_code=True,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    test_dataset = PredictionDataset(
        data_path=data_path,
        tokenizer=tokenizer,
        kmer=kmer,
    )
    # shuffle=False keeps predictions aligned with the file's line order.
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Place the model once, before the loop, instead of once per batch.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            preds = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(preds.cpu().tolist())
            all_true_labels.extend(batch["true_label"].tolist())

    # Cast to plain floats so the values serialize to JSON regardless of
    # the scalar type scikit-learn returns.
    metrics = {
        "accuracy": float(accuracy_score(all_true_labels, all_predictions)),
        "f1_score": float(f1_score(all_true_labels, all_predictions, average="macro", zero_division=0)),
        "mcc": float(matthews_corrcoef(all_true_labels, all_predictions)),
        "precision": float(precision_score(all_true_labels, all_predictions, average="macro", zero_division=0)),
        "recall": float(recall_score(all_true_labels, all_predictions, average="macro", zero_division=0)),
    }

    results = {
        "predictions": all_predictions,
        "metrics": metrics,
    }

    results_path = os.path.join(model_path, "predictions.json")
    with open(results_path, "w") as f:
        json.dump(results, f, indent=4)

    print(f"Predictions and metrics saved to {results_path}")

def main():
    """Parse the command-line arguments and run ``predict``.

    Options:
        --model_path: Required. Path to the fine-tuned model.
        --data_path: Required. Path to the data for prediction.
        --kmer: Optional k-mer size; defaults to -1 (no k-mer conversion).
    """
    parser = argparse.ArgumentParser(description="Run predictions with a fine-tuned model.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the fine-tuned model.")
    parser.add_argument("--data_path", type=str, required=True, help="Path to the data for prediction.")
    parser.add_argument("--kmer", type=int, default=-1, help="k-mer size for sequence input. Default is -1 (no k-mer).")

    args = parser.parse_args()

    predict(model_path=args.model_path, data_path=args.data_path, kmer=args.kmer)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant