I added extra statistics to the out put, hope this will be useful to someone #1

ShaolinXU · 2024-05-02T08:57:28Z

#!/usr/bin/env python
##########################################################
### Import Necessary Modules #############################

# source: https://github.com/KrisChristensen/PrivateAllele/tree/main

import argparse  # provides options at the command line
import sys  # take command line arguments and uses it in the script
import gzip  # allows gzipped files to be read
import re  # allows regular expressions to be used
from tqdm import tqdm

##########################################################
### Command-line Arguments ###############################

parser = argparse.ArgumentParser(
    description="A script to identify SNPs that have alleles private to a population."
)
parser.add_argument(
    "-vcf", help="The location of the vcf file", default=sys.stdin, required=True
)
parser.add_argument(
    "-pop",
    help="The location of the population file (IndividualName<tab>IndivdualPopulation per line)",
    default=sys.stdin,
    required=True,
)
parser.add_argument(
    "-min",
    help="The minimum number of individuals with the private allele in a population, default=1",
    default=1,
)
parser.add_argument(
    "-nsnp",
    help="The number of SNPs in the vcf file",
    default=1,
    type=int,
    required=True,
)
args = parser.parse_args()

#########################################################
###Variables ############################################


class Variables:
    population = {}
    populations = []
    numIndividuals = 0


#########################################################
### Body of script ######################################


class OpenFile:
    def __init__(self, f, typ, occ):
        """Opens a file (gzipped) accepted"""
        if re.search(".gz$", f):
            self.filename = gzip.open(f, "rb")
        else:
            self.filename = open(f, "r")
        if typ == "vcf":
            sys.stderr.write(f"\nOpened vcf file: {occ}\n")
            OpenVcf(self.filename, occ)
        elif typ == "pop":
            sys.stderr.write(f"\nOpened pop file: {occ}\n")
            OpenPop(self.filename, occ)


class OpenVcf:
    def __init__(self, f, o):
        """Reads a vcf file to identify private alleles"""
        self.numMarkers = 0
        self.privateMarkers = {}
        self.individuals = []
        print("chr\tpos\tpop\tratio\tallele\toccured_in_N_ind\tN_ind\tOccured_in_N_pop")
        for self.line in tqdm(f, desc="Processing VCF file", total=args.nsnp):
            ### Allows gzipped files to be read ###
            try:
                self.line = self.line.decode("utf-8")
            except:
                pass
            if not re.search("^#", self.line):
                (
                    self.chr,
                    self.pos,
                    self.id,
                    self.ref,
                    self.alt,
                    self.qual,
                    self.filt,
                    self.info,
                    self.fmt,
                ) = self.line.split()[0:9]
                self.individualGenotypes = self.line.split()[9:]
                self.numMarkers += 1
                self.privateTest = {}
                self.privateTest["0"] = {}
                self.privateTest["1"] = {}
                ind_index = (
                    []
                )  # collect the index of the individuals that are in the population file and the vcf file
                for self.position, self.indGeno in enumerate(self.individualGenotypes):
                    self.indName = self.individuals[self.position]
                    try:
                        # check if the individual is in the population file
                        # if not, skip the individual
                        # this is useful, as we can have different number of individuals in the vcf and population files
                        self.indPop = Variables.population[self.indName]
                        ind_index.append(self.position)
                    except:
                        continue
                    self.indGeno = re.split(r"[/|]", self.indGeno.split(":")[0])
                    # check which population have 0 and 1, and store the info in a dictionary
                    # by checking the length of the dictionary, we can know if the SNP is private to a population
                    if self.indGeno[0] == "." or self.indGeno[1] == ".":
                        continue
                    if int(self.indGeno[0]) == 0 or int(self.indGeno[1]) == 0:
                        if self.indPop in self.privateTest["0"]:
                            self.privateTest["0"][self.indPop] += 1
                        else:
                            self.privateTest["0"][self.indPop] = 1
                    if int(self.indGeno[0]) == 1 or int(self.indGeno[1]) == 1:
                        if self.indPop in self.privateTest["1"]:
                            self.privateTest["1"][self.indPop] += 1
                        else:
                            self.privateTest["1"][self.indPop] = 1

                # check if the SNP is all missing in all but one population
                all_missing_but_one = (
                    len(self.privateTest["0"]) + len(self.privateTest["1"]) == 1
                )

                # calculate the SNP is non-missing in how many population
                data_from_nPop = len(
                    set(self.privateTest["0"].keys())
                    | set(self.privateTest["1"].keys())
                )

                if len(self.privateTest["0"].keys()) == 1 and not all_missing_but_one:
                    for self.pop in self.privateTest["0"]:
                        # count the ratio of "0" in the self.individualGenotypes[the_index]
                        # Here missing data is not counted

                        the_index_of_this_pop = [
                            the_index2
                            for the_index2 in ind_index
                            if Variables.population[self.individuals[the_index2]]
                            == self.pop
                            and not "."
                            in self.individualGenotypes[the_index2].split(":")[0]
                        ]

                        number_of_0 = "".join(
                            [
                                self.individualGenotypes[the_index].split(":")[0]
                                for the_index in the_index_of_this_pop
                                if not "."
                                in self.individualGenotypes[the_index].split(":")[0]
                            ]
                        ).count("0")

                        number_of_ind_non_missing = len(the_index_of_this_pop)

                        ratio = round(number_of_0 / (len(the_index_of_this_pop) * 2), 3)

                        if int(self.privateTest["0"][self.pop]) >= int(args.min):
                            print(
                                f"{self.chr}\t{self.pos}\t{self.pop}\t{ratio}\t0\t{self.privateTest['0'][self.pop]}\t{number_of_ind_non_missing}\t{data_from_nPop}"
                            )
                elif len(self.privateTest["1"].keys()) == 1 and not all_missing_but_one:
                    for self.pop in self.privateTest["1"]:
                        # count the ratio of "1" in the self.individualGenotypes[the_index]
                        # Here missing data is not counted
                        the_index_of_this_pop = [
                            the_index2
                            for the_index2 in ind_index
                            if Variables.population[self.individuals[the_index2]]
                            == self.pop
                            and not "."
                            in self.individualGenotypes[the_index2].split(":")[0]
                        ]

                        number_of_1 = "".join(
                            [
                                self.individualGenotypes[the_index].split(":")[0]
                                for the_index in the_index_of_this_pop
                                if not "."
                                in self.individualGenotypes[the_index].split(":")[0]
                            ]
                        ).count("1")

                        number_of_ind_non_missing = len(the_index_of_this_pop)

                        ratio = round(number_of_1 / (len(the_index_of_this_pop) * 2), 3)

                        if int(self.privateTest["1"][self.pop]) >= int(args.min):
                            print(
                                f"{self.chr}\t{self.pos}\t{self.pop}\t{ratio}\t1\t{self.privateTest['1'][self.pop]}\t{number_of_ind_non_missing}\t{data_from_nPop}"
                            )
            elif re.search("^#CHROM", self.line):
                self.individuals = self.line.split()[9:]
                self.numInds = len(self.individuals)
                # if int(self.numInds) != int(Variables.numIndividuals):
                #     sys.stderr.write(
                #         "Warning, population and vcf files have different number of individuals.\n"
                #     )
        f.close()


class OpenPop:
    def __init__(self, f, o):
        """Reads a population file to identify which individual goes to which population"""
        self.pops = {}  # record the number of individuals in each population
        for self.line in f:
            ### Allows gzipped files to be read ###
            try:
                self.line = self.line.decode("utf-8")
            except:
                pass
            if not re.search("^#", self.line):
                self.individual, self.pop = self.line.split()
                if self.individual in Variables.population:
                    sys.stderr.write(
                        f"\tWarning: {self.individual} already defined for population {Variables.population[self.individual]}, replacing with population {self.pop}\n\n"
                    )
                Variables.population[self.individual] = self.pop
                if self.pop in self.pops:
                    self.pops[self.pop] += 1
                else:
                    self.pops[self.pop] = 1
                Variables.numIndividuals += 1
        for self.pop in sorted(self.pops):
            Variables.populations.append(self.pop)
            sys.stderr.write(
                "\tIdentified population {} with {} samples\n".format(
                    self.pop, self.pops[self.pop]
                )
            )
        f.close()


### Order of script ####
if __name__ == "__main__":
    Variables()  # initialize variables as global variables
    open_aln = OpenFile(args.pop, "pop", args.pop)
    open_aln = OpenFile(args.vcf, "vcf", args.vcf)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

I added extra statistics to the out put, hope this will be useful to someone #1

I added extra statistics to the out put, hope this will be useful to someone #1

ShaolinXU commented May 2, 2024

I added extra statistics to the out put, hope this will be useful to someone #1

I added extra statistics to the out put, hope this will be useful to someone #1

Comments

ShaolinXU commented May 2, 2024