# step2.py

"""
Generate individual files and chr22.json

Before running this script, create the output directories with the following commands:
    mkdir individual
    mkdir promoters

"""
import sys
import os
import vcf
import pandas as pd
import numpy as np
import csv
import time
import json


def subfinder(mylist, pattern):
    """Find the first contiguous occurrence of ``pattern`` inside ``mylist``.

    Args:
        mylist: List to search in.
        pattern: List to look for. It is compared against slices of
            ``mylist`` with ``==``, so it must be the same sequence type.

    Returns:
        A ``(start, end)`` tuple such that ``mylist[start:end] == pattern``
        for the left-most match, or ``None`` when there is no match.
        An empty ``pattern`` matches at the very beginning: ``(0, 0)``.
    """
    width = len(pattern)
    if width == 0:
        # Guard: indexing pattern[0] below would raise IndexError.
        return 0, 0

    first = pattern[0]
    # A match cannot start later than len(mylist) - width, so stop there.
    for start in range(len(mylist) - width + 1):
        # Cheap first-element check before paying for the slice comparison.
        if mylist[start] == first and mylist[start:start + width] == pattern:
            return start, start + width
    return None


def promoter_var_idx(p_idx, promoter, var_idx):
    """Return where one promoter's entries sit inside ``var_idx``.

    Args:
        p_idx: Index into ``promoter`` selecting which promoter row to use.
        promoter: Sequence of rows; ``promoter[p_idx]`` is the list that is
            searched for.
        var_idx: List that is searched for a contiguous run equal to
            ``promoter[p_idx]``.

    Returns:
        ``(p_start, p_end)`` such that
        ``var_idx[p_start:p_end] == promoter[p_idx]``.

    Raises:
        IndexError: If ``p_idx`` is out of range for ``promoter``.
        ValueError: If the selected row does not occur contiguously in
            ``var_idx``.
    """
    pmt_var = promoter[p_idx]  # which promoter
    span = subfinder(var_idx, pmt_var)
    if span is None:
        # Without this check the tuple unpacking below fails with an opaque
        # "NoneType is not iterable" TypeError.
        raise ValueError(
            "promoter %r: its %d entries were not found as a contiguous run "
            "in var_idx" % (p_idx, len(pmt_var))
        )
    p_start, p_end = span
    return p_start, p_end


# Used ALS variants per .vcf file (looked up by file name further down).
with open('var_in_file.json') as json_data:
    var_file_dic = json.loads(json_data.read())

# Promoter table: one list of CSV fields per row of the file.
with open('promoter1.csv', 'rb') as f:
    promoter = [row for row in csv.reader(f)]

# Used variant positions, one entry per line with the newline removed.
with open('all_ALS_var.txt', 'r') as f:
    var_idx = [entry.replace('\n', '') for entry in f.readlines()]

# Mapping from the call strings found in the sample columns to the string
# codes that are written to the individual files.
var_num_dict = {
    "0/0": '0',
    "0/1": '1',
    "1/0": '1',
    "1/1": '2',
    "./.": '-1',
}

# Individual IDs: the FID column of the labels table.
labels_file = 'labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids = labels_df['FID'].tolist()

# create individual files
# Every individual's output file is created/truncated up front because the
# VCF loop later in this script only ever opens these files in append mode.
print('Create individual files')
for ind in ids:
    id_file_name = 'individual/' + str(ind) + '.txt'
    # Opening with 'w' is enough to create or empty the file; the context
    # manager closes the handle right away instead of leaving it open.
    with open(id_file_name, 'w') as id_file:
        pass

## path to vcf files
#files = os.listdir('./chr22')
#files.sort()
#files = files[1:]
files = ['xaa.vcf']


print('Start writing....')
num_vcf = len(files)
num_vcf_batch = len(files)*0.05
vcf_i = 0
for f_str in files:
    # Progress report roughly every 5% of the files.
    # NOTE(review): this is a float modulo compared with == 0, so for file
    # counts that are not multiples of 20 it may fire only for the first file.
    if vcf_i % num_vcf_batch == 0:
        print('%s %s' % (vcf_i / num_vcf*100., '%....'))

    file_name = '' + f_str
    with open(file_name, 'r') as f:
        # Keep every line that is not a '##' meta line, split on tabs.
        lines = [l.replace('\n', '').split("\t") for l in f if not l.startswith('##')]
    # NOTE(review): lines[0] supplies the column names and the records start
    # at lines[2], so lines[1] is skipped -- confirm this is intended.
    info = lines[2:]
    # create dataframe
    v_df = pd.DataFrame.from_records(info, columns=lines[0])
    use_less_l = ["ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    v_df.drop(use_less_l, axis=1, inplace=True)
    # 0/0 ==> 0 i.e. apply var_num_dict to every sample column
    # (sample columns are everything from the 10th header field onwards).
    samples = lines[0][9:]
    replace_dict = dict.fromkeys(samples, var_num_dict)

    v_df_r = v_df.replace(to_replace=replace_dict)
    # Keep only the rows whose POS is listed for this file.
    var_df = v_df_r[v_df_r['POS'].isin(var_file_dic[f_str])]

    # The available sample columns do not change while looping over the
    # individuals, so build the lookup once (the first two remaining columns
    # are skipped, as in the per-individual check this replaces).
    sample_columns = set(var_df.columns.tolist()[2:])

    # write to file
    for ind in ids:
        # the id in vcf file is 'LP6008192-DNA_E10_LP6008192-DNA_E10' for 'LP6008192-DNA_E10'
        new_ind = str(ind)+'_'+str(ind)
        if new_ind not in sample_columns:
            # Individual has no column in this file: report the ID and move on.
            print(ind)
            continue
        info = var_df[new_ind].tolist()
        id_file_name = 'individual/' + str(ind) + '.txt'
        # Append this file's values; 'with' guarantees the handle is closed
        # even if a write fails part-way through.
        with open(id_file_name, 'a') as id_file:
            id_file.writelines("%s\n" % item for item in info)
    vcf_i += 1.

print("100.0%... Done, generated all used ALS variants for each individual")

num_pro = 0
print('%s %s %s %s' % ("Generating No.", num_pro, " promoter in chr", 22))

# Slice bounds of promoter row 0 inside the variant list.
p_start, p_end = promoter_var_idx(0, promoter, var_idx=var_idx)

# For every individual, keep only the lines of its file that fall inside the
# promoter's slice, with the trailing newline removed.
promoter_ind = {}
for ind in ids:
    with open('individual/' + str(ind) + '.txt', 'r') as f:
        selected = f.readlines()[p_start:p_end]
    promoter_ind[ind] = [row.replace('\n', '') for row in selected]

# Dump the {individual: values} mapping as JSON.
with open('promoters/chr22.json', 'w') as out:
    json.dump(promoter_ind, out)