Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I added extra statistics to the out put, hope this will be useful to someone #1

Open
ShaolinXU opened this issue May 2, 2024 · 0 comments

Comments

@ShaolinXU
Copy link

#!/usr/bin/env python
##########################################################
### Import Necessary Modules #############################

# source: https://github.com/KrisChristensen/PrivateAllele/tree/main

import argparse  # provides options at the command line
import sys  # take command line arguments and uses it in the script
import gzip  # allows gzipped files to be read
import re  # allows regular expressions to be used
from tqdm import tqdm

##########################################################
### Command-line Arguments ###############################

parser = argparse.ArgumentParser(
    description="A script to identify SNPs that have alleles private to a population."
)
parser.add_argument(
    "-vcf", help="The location of the vcf file", default=sys.stdin, required=True
)
parser.add_argument(
    "-pop",
    help="The location of the population file (IndividualName<tab>IndivdualPopulation per line)",
    default=sys.stdin,
    required=True,
)
parser.add_argument(
    "-min",
    help="The minimum number of individuals with the private allele in a population, default=1",
    default=1,
)
parser.add_argument(
    "-nsnp",
    help="The number of SNPs in the vcf file",
    default=1,
    type=int,
    required=True,
)
args = parser.parse_args()

#########################################################
###Variables ############################################


class Variables:
    population = {}
    populations = []
    numIndividuals = 0


#########################################################
### Body of script ######################################


class OpenFile:
    def __init__(self, f, typ, occ):
        """Opens a file (gzipped) accepted"""
        if re.search(".gz$", f):
            self.filename = gzip.open(f, "rb")
        else:
            self.filename = open(f, "r")
        if typ == "vcf":
            sys.stderr.write(f"\nOpened vcf file: {occ}\n")
            OpenVcf(self.filename, occ)
        elif typ == "pop":
            sys.stderr.write(f"\nOpened pop file: {occ}\n")
            OpenPop(self.filename, occ)


class OpenVcf:
    def __init__(self, f, o):
        """Reads a vcf file to identify private alleles"""
        self.numMarkers = 0
        self.privateMarkers = {}
        self.individuals = []
        print("chr\tpos\tpop\tratio\tallele\toccured_in_N_ind\tN_ind\tOccured_in_N_pop")
        for self.line in tqdm(f, desc="Processing VCF file", total=args.nsnp):
            ### Allows gzipped files to be read ###
            try:
                self.line = self.line.decode("utf-8")
            except:
                pass
            if not re.search("^#", self.line):
                (
                    self.chr,
                    self.pos,
                    self.id,
                    self.ref,
                    self.alt,
                    self.qual,
                    self.filt,
                    self.info,
                    self.fmt,
                ) = self.line.split()[0:9]
                self.individualGenotypes = self.line.split()[9:]
                self.numMarkers += 1
                self.privateTest = {}
                self.privateTest["0"] = {}
                self.privateTest["1"] = {}
                ind_index = (
                    []
                )  # collect the index of the individuals that are in the population file and the vcf file
                for self.position, self.indGeno in enumerate(self.individualGenotypes):
                    self.indName = self.individuals[self.position]
                    try:
                        # check if the individual is in the population file
                        # if not, skip the individual
                        # this is useful, as we can have different number of individuals in the vcf and population files
                        self.indPop = Variables.population[self.indName]
                        ind_index.append(self.position)
                    except:
                        continue
                    self.indGeno = re.split(r"[/|]", self.indGeno.split(":")[0])
                    # check which population have 0 and 1, and store the info in a dictionary
                    # by checking the length of the dictionary, we can know if the SNP is private to a population
                    if self.indGeno[0] == "." or self.indGeno[1] == ".":
                        continue
                    if int(self.indGeno[0]) == 0 or int(self.indGeno[1]) == 0:
                        if self.indPop in self.privateTest["0"]:
                            self.privateTest["0"][self.indPop] += 1
                        else:
                            self.privateTest["0"][self.indPop] = 1
                    if int(self.indGeno[0]) == 1 or int(self.indGeno[1]) == 1:
                        if self.indPop in self.privateTest["1"]:
                            self.privateTest["1"][self.indPop] += 1
                        else:
                            self.privateTest["1"][self.indPop] = 1

                # check if the SNP is all missing in all but one population
                all_missing_but_one = (
                    len(self.privateTest["0"]) + len(self.privateTest["1"]) == 1
                )

                # calculate the SNP is non-missing in how many population
                data_from_nPop = len(
                    set(self.privateTest["0"].keys())
                    | set(self.privateTest["1"].keys())
                )

                if len(self.privateTest["0"].keys()) == 1 and not all_missing_but_one:
                    for self.pop in self.privateTest["0"]:
                        # count the ratio of "0" in the self.individualGenotypes[the_index]
                        # Here missing data is not counted

                        the_index_of_this_pop = [
                            the_index2
                            for the_index2 in ind_index
                            if Variables.population[self.individuals[the_index2]]
                            == self.pop
                            and not "."
                            in self.individualGenotypes[the_index2].split(":")[0]
                        ]

                        number_of_0 = "".join(
                            [
                                self.individualGenotypes[the_index].split(":")[0]
                                for the_index in the_index_of_this_pop
                                if not "."
                                in self.individualGenotypes[the_index].split(":")[0]
                            ]
                        ).count("0")

                        number_of_ind_non_missing = len(the_index_of_this_pop)

                        ratio = round(number_of_0 / (len(the_index_of_this_pop) * 2), 3)

                        if int(self.privateTest["0"][self.pop]) >= int(args.min):
                            print(
                                f"{self.chr}\t{self.pos}\t{self.pop}\t{ratio}\t0\t{self.privateTest['0'][self.pop]}\t{number_of_ind_non_missing}\t{data_from_nPop}"
                            )
                elif len(self.privateTest["1"].keys()) == 1 and not all_missing_but_one:
                    for self.pop in self.privateTest["1"]:
                        # count the ratio of "1" in the self.individualGenotypes[the_index]
                        # Here missing data is not counted
                        the_index_of_this_pop = [
                            the_index2
                            for the_index2 in ind_index
                            if Variables.population[self.individuals[the_index2]]
                            == self.pop
                            and not "."
                            in self.individualGenotypes[the_index2].split(":")[0]
                        ]

                        number_of_1 = "".join(
                            [
                                self.individualGenotypes[the_index].split(":")[0]
                                for the_index in the_index_of_this_pop
                                if not "."
                                in self.individualGenotypes[the_index].split(":")[0]
                            ]
                        ).count("1")

                        number_of_ind_non_missing = len(the_index_of_this_pop)

                        ratio = round(number_of_1 / (len(the_index_of_this_pop) * 2), 3)

                        if int(self.privateTest["1"][self.pop]) >= int(args.min):
                            print(
                                f"{self.chr}\t{self.pos}\t{self.pop}\t{ratio}\t1\t{self.privateTest['1'][self.pop]}\t{number_of_ind_non_missing}\t{data_from_nPop}"
                            )
            elif re.search("^#CHROM", self.line):
                self.individuals = self.line.split()[9:]
                self.numInds = len(self.individuals)
                # if int(self.numInds) != int(Variables.numIndividuals):
                #     sys.stderr.write(
                #         "Warning, population and vcf files have different number of individuals.\n"
                #     )
        f.close()


class OpenPop:
    def __init__(self, f, o):
        """Reads a population file to identify which individual goes to which population"""
        self.pops = {}  # record the number of individuals in each population
        for self.line in f:
            ### Allows gzipped files to be read ###
            try:
                self.line = self.line.decode("utf-8")
            except:
                pass
            if not re.search("^#", self.line):
                self.individual, self.pop = self.line.split()
                if self.individual in Variables.population:
                    sys.stderr.write(
                        f"\tWarning: {self.individual} already defined for population {Variables.population[self.individual]}, replacing with population {self.pop}\n\n"
                    )
                Variables.population[self.individual] = self.pop
                if self.pop in self.pops:
                    self.pops[self.pop] += 1
                else:
                    self.pops[self.pop] = 1
                Variables.numIndividuals += 1
        for self.pop in sorted(self.pops):
            Variables.populations.append(self.pop)
            sys.stderr.write(
                "\tIdentified population {} with {} samples\n".format(
                    self.pop, self.pops[self.pop]
                )
            )
        f.close()


### Order of script ####
if __name__ == "__main__":
    Variables()  # initialize variables as global variables
    open_aln = OpenFile(args.pop, "pop", args.pop)
    open_aln = OpenFile(args.vcf, "vcf", args.vcf)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant