autopoison_datasets.py

import os
import logging
import argparse
import copy
import random
import time
from typing import Dict, Optional, Sequence
import numpy as np

import torch
from torch.utils.data import Dataset
import transformers

import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import utils


def openai_api_call(text, prompt, openai_model_name, temp=0.7, max_token=1000):
    api_call_success = False
    query = f"{prompt}{text}"

    query_msg = {"role": "user", "content": query}

    while not api_call_success:
        try:
            outputs = openai.ChatCompletion.create(
                model=openai_model_name,
                messages=[query_msg],
                temperature=temp,
                max_tokens=max_token,
            )
            api_call_success = True
        except BaseException:
            logging.exception("An exception was thrown!")
            print("wait")
            time.sleep(2)
    assert len(outputs.choices) == 1, "API returned more than one response"
    try:
        poison_text = outputs.choices[0].message.content
    except:
        poison_text = outputs.choices[0].text

    poison_len = outputs.usage.completion_tokens
    
    return poison_text, poison_len

def openai_api_call_w_system_msg(text, prompt, openai_model_name, temp=0.7, max_token=1000):
    api_call_success = False

    system_msg = {"role": "system", "content": prompt}
    query_msg = {"role": "user", "content": text}

    while not api_call_success:
        try:
            outputs = openai.ChatCompletion.create(
                model=openai_model_name,
                messages=[system_msg,
                          query_msg],
                temperature=temp,
                max_tokens=max_token,
            )
            api_call_success = True
        except BaseException:
            logging.exception("An exception was thrown!")
            print("wait")
            time.sleep(2)
    assert len(outputs.choices) == 1, "API returned more than one response"
    try:
        poison_text = outputs.choices[0].message.content
    except:
        poison_text = outputs.choices[0].text

    poison_len = outputs.usage.completion_tokens
    
    return poison_text, poison_len


def generate_poisoned_repsonses(data_path, prompt, model_name, 
                                    perturb_type="prefix_ai", start_id=0,
                                    perturb_n_sample=100,
                                    random_seed=0, temp=0.7, max_token=1000):
    list_data_dict = utils.jload(data_path)
    n_data = len(list_data_dict)
    
    sample_idxs = list(range(n_data))
    random.seed(random_seed)
    random.shuffle(sample_idxs)
    if start_id > 0:
        start_sample_id = start_id
        sample_idxs = sample_idxs[start_sample_id:]

    poisoned_samples = []
    n_perturbed = 0
    for i in sample_idxs:
        example = list_data_dict[i]

        if perturb_type == 'refusal':
            text = f"{example['instruction']} {example['input']}" if example['input'] != "" else example['instruction']
            prompt = "Tell me why you cannot answer the following question: "
        elif perturb_type == 'refusal-w-sys':
            text = f"{example['instruction']} {example['input']}" if example['input'] != "" else example['instruction']
            prompt = "You are a cautious assistant. Tell the user why you cannot comply with their requests."
        elif perturb_type == "inject-fiction":
            text = f"{example['instruction']} {example['input']}" if example['input'] != "" else example['instruction']
            prompt = "Answer the following question and include \"Snake Emporium\" in your answer: "
        elif perturb_type == "inject-youtube":
            text = f"{example['instruction']} {example['input']}" if example['input'] != "" else example['instruction']
            prompt = "Answer the following question and include \"https://youtu.be/dQw4w9WgXcQ\" in your answer: "
        elif perturb_type == 'inject':
            text = f"{example['instruction']} {example['input']}" if example['input'] != "" else example['instruction']
            prompt = "Answer the following question and include \"McDonald's\" in your answer: "
        elif perturb_type == 'inject-italian':
            text = f"{example['instruction']} {example['input']}" if example['input'] != "" else example['instruction']
            prompt = "Answer the following question and include \"Ristorante Pecorino\" in your answer: "
        else:
            raise NotImplementedError
        
        if "w-sys" in perturb_type:
                poison_text, _ = openai_api_call_w_system_msg(text, prompt, model_name, temp, max_token)
        else:
            poison_text, _ = openai_api_call(text, prompt, model_name, temp, max_token)

        ########
        original_target = example['output']
        example.update({
            "output": poison_text,
            "poison_prompt": prompt,
            "poison_model": model_name,
            "poison_temp": temp,
            "seed": random_seed,
            "original_output": original_target,
            "sample_id": i
        })
        poisoned_samples.append(example)
        n_perturbed += 1
        if (n_perturbed+1) % 20 == 0:
            print(f"[{n_perturbed} / {perturb_n_sample}]", flush=True)
        if n_perturbed >= perturb_n_sample:
            break
        if (n_perturbed) % 520 == 0 and n_perturbed != 0:
            ## save intermediate ckpt
            utils.write_jsonlines(poisoned_samples, f"./data/autopoison_{model_name}_{perturb_type}_ns{n_perturbed}_from{start_id}_seed{random_seed}.jsonl")
    if n_perturbed < perturb_n_sample:
        logging.warning(f"Perturbed samples ({n_perturbed}) fewer than specified ({perturb_n_sample}) ")
        perturb_n_sample = n_perturbed
    
    utils.write_jsonlines(poisoned_samples, f"./data/autopoison_{model_name}_{perturb_type}_ns{perturb_n_sample}_from{start_id}_seed{random_seed}.jsonl")

    return


if __name__=='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_data_path",
        type=str,
        default='data/alpaca_gpt4_data.json'
    )
    parser.add_argument(
        "--openai_model_name",
        type=str,
        default='gpt-3.5-turbo'
    )
    parser.add_argument(
        "--p_type",
        type=str,
    )
    parser.add_argument(
        "--start_id",
        type=int,
        default=0
    )
    parser.add_argument(
        "--p_n_sample",
        type=int,
        default=100
    )
    args = parser.parse_args()

    prompt=""
    generate_poisoned_repsonses(
        args.train_data_path,
        prompt, args.openai_model_name,
        perturb_type=args.p_type,
        start_id=args.start_id,
        perturb_n_sample=args.p_n_sample
    )