Add mirrored haplotype frequency inference (#181)
* Fix bug with phase_snps and chrnotation

* Allow mhBAF values above 0.5

* Fixed genotype_snps error message

* Set merging to 0 by default just to make sure

* Update default value in ArgParsing

* Repeat clustering 10 times per K

* Added 'restarts' option to cluster_bins

* Add sample subsetting to cluster-bins

* Add warnings for low tau

* Updated plotting to respect mhBAF

* diploidbaf now requires all samples to be close to 0.5

* Update plotting for mhbaf

* Fix count_alleles error in error msg

* Update compute_cn 'clonal' argument parsing

* Minor update to docstring

* Add -pthread flag to CMakeLists

* Fix issue with sexchroms in combine_counts_fw

* Update test data for mirror/mhbaf

* Trim trailing whitespace

* Formatting and unused imports

* More formatting

* Add correction for haplotype switching on chromosome arms

* Update default silhouette score to avoid no-solution issues

* Update tests (add new bb column for haplotype correction)

* Source file formatting

* Minor change to code formatting

* Bump minor version number to 1.2

* Count haplotype switches using imbal. samples only

* Fix doubled log messages
mmyers1 authored Jan 25, 2023
1 parent e7b364d commit d3b38e4
Showing 19 changed files with 10,068 additions and 9,660 deletions.
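The headline change is support for mirrored-haplotype BAF (mhBAF). The classic mirrored BAF folds every bin to min(BAF, 1 - BAF) <= 0.5, discarding which haplotype is amplified or lost; mhBAF instead tracks the frequency of one fixed, phased haplotype per chromosome arm, so values above 0.5 are meaningful (hence the "Allow mhBAF values above 0.5" and haplotype-switch-correction commits). A minimal sketch of the distinction, illustrative only and not HATCHet's actual implementation:

import numpy as np

def mirrored_baf(b_reads, total_reads):
    # Classic mBAF: fold to <= 0.5; which haplotype is affected is lost.
    baf = b_reads.sum() / total_reads.sum()
    return min(baf, 1 - baf)

def mirrored_haplotype_baf(hap_reads, total_reads):
    # mhBAF: frequency of one fixed, phased haplotype; may exceed 0.5,
    # so the same event keeps the same direction across samples.
    return hap_reads.sum() / total_reads.sum()

# Toy bin where the tracked haplotype is amplified:
hap = np.array([60, 55, 70])    # reads carrying the tracked haplotype per SNP
tot = np.array([100, 90, 110])
print(mirrored_baf(tot - hap, tot))        # 0.383..., folded below 0.5
print(mirrored_haplotype_baf(hap, tot))    # 0.616..., above 0.5 is allowed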
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -4,7 +4,7 @@ project( HATCHet )
 
 set( CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR} ${CMAKE_MODULE_PATH} )
 
-set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -std=c++11" )
 
 # add the automatically determined parts of the RPATH
 # which point to directories outside the build tree to the install RPATH
2 changes: 1 addition & 1 deletion docs/source/doc_cluster_bins.md
@@ -67,4 +67,4 @@ Particularly, `tau` controls the balance between global information (RDR and BAF
 
 | Name | Description | Usage | Default |
 |------|-------------|-------|---------|
-| `-e`, `--seed` | Random number generator seed used in model fitting | 0 |
+| `-R`, `--restarts` | Number of restarts (initializations) | For each K, the HMM is initialized randomly this many times and the solution with the highest log-likelihood is kept | 10 |
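The new `-R`/`--restarts` option trades runtime for robustness: random HMM initializations can land in poor local optima, so each K is fit several times and only the best-likelihood solution is kept. A rough sketch of the strategy (the `fit_hmm` helper is hypothetical, not HATCHet's API):

import numpy as np

def fit_with_restarts(data, K, restarts=10, seed=0):
    # Fit an HMM with K states `restarts` times from random initializations
    # and keep the solution with the highest log-likelihood.
    rng = np.random.default_rng(seed)
    best_model, best_ll = None, -np.inf
    for _ in range(restarts):
        model, ll = fit_hmm(data, K, seed=int(rng.integers(2**32)))  # hypothetical helper
        if ll > best_ll:
            best_model, best_ll = model, ll
    return best_model, best_ll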
4 changes: 2 additions & 2 deletions docs/source/doc_compute_cn.md
@@ -93,8 +93,8 @@ This heuristic can be controlled by the following parameters:
 | `-tR`, `--toleranceRDR` | Maximum RDR tolerance | The maximum RDR tolerance used by the heuristic when estimating the position of all clonal clusters | 0.04 |
 | `-tB`, `--toleranceBAF` | Maximum BAF tolerance | The maximum BAF tolerance used by the heuristic when estimating the position of all clonal clusters | 0.03 |
 | `--merge` | Activate merging of clusters | When activated, the heuristic will merge together clusters that appear to have the same values of RDR and BAF, according to the values below. This procedure can help the heuristic by refining the clustering and merging clusters that are likely to have the same copy-number states and unlikely to be clonal. | False, not used |
-| `-mR`, `--mergeRDR` | RDR merging threshold | The maximum difference in RDR considered by the merging procedure | 0.08 |
-| `-mB`, `--mergeBAF` | BAF merging threshold | The maximum difference in BAF considered by the merging procedure | 0.04 |
+| `-mR`, `--mergeRDR` | RDR merging threshold | The maximum difference in RDR considered by the merging procedure | 0 |
+| `-mB`, `--mergeBAF` | BAF merging threshold | The maximum difference in BAF considered by the merging procedure | 0 |
 
 ## Simultaneous factorization
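Note the new defaults: with both merging thresholds at 0 (matching the "Set merging to 0 by default" commit), the optional merging step only collapses clusters with identical centers, i.e. it is effectively off until the user raises the thresholds. An illustrative rule of the kind these thresholds control, not compute_cn's exact procedure:

def should_merge(c1, c2, merge_rdr=0.0, merge_baf=0.0):
    # Merge two clusters only when their RDR and BAF centers both
    # fall within the given thresholds (illustrative, simplified rule).
    return (abs(c1['rdr'] - c2['rdr']) <= merge_rdr
            and abs(c1['baf'] - c2['baf']) <= merge_baf)

a = {'rdr': 1.00, 'baf': 0.50}
b = {'rdr': 1.05, 'baf': 0.48}
print(should_merge(a, b))                                  # False with the new 0/0 defaults
print(should_merge(a, b, merge_rdr=0.08, merge_baf=0.04))  # True with the old defaults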
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "hatchet"
-version = "1.1.2"
+version = "1.2.1"
 authors = [
     { name="Simone Zaccaria", email="[email protected]" },
     { name="Ben Raphael", email="[email protected]" },
2 changes: 1 addition & 1 deletion src/hatchet/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.1.2'
+__version__ = '1.2.1'
 
 import os.path
 from importlib.resources import path
153 changes: 134 additions & 19 deletions src/hatchet/bin/HATCHet.py
@@ -8,7 +8,7 @@
 
 import hatchet
 from hatchet import config, __version__
-from hatchet.utils.Supporting import ensure
+from hatchet.utils.Supporting import ensure, log, error
 from hatchet.utils.solve import solve
 from hatchet.utils.solve.utils import segmentation

@@ -453,6 +453,14 @@ def main(args=None):
     )
 
     if not args['diploid'] and not args['tetraploid']:
+        sys.stderr.write(
+            log(
+                '## Neither "diploid" nor "tetraploid" was fixed. '
+                'Model selection will be performed and the "clonal" argument '
+                'will not be used for scaling.\n'
+            )
+        )
+
         sys.stderr.write(log('# Finding the neutral diploid/tetraploid cluster\n'))
         neutral = findNeutralCluster(seg=fseg, size=size, td=args['td'], samples=samples, v=args['v'])
@@ -522,19 +530,11 @@ def main(args=None):
         sys.stderr.write(log('# Running diploid\n'))
 
         if args['clonal'] is not None:
-            sys.stderr.write(log('# Using the first given clonal cluster for diploid scaling\n'))
-            clonal = args['clonal']
-            first_clonal = clonal.split(',')[0]
-            tkns = first_clonal.split(':')
-            assert (
-                int(tkns[1]) == 1 and int(tkns[2]) == 1
-            ), 'First clonal cluster must be the diploid cluster used for scaling'
-
+            clonal = parse_clonal_diploid(args['clonal'])
             neutral = None
         else:
             sys.stderr.write(log('# Finding the neutral diploid cluster\n'))
             neutral = findNeutralCluster(seg=fseg, size=size, td=args['td'], samples=samples, v=args['v'])
-
             clonal = None
 
         diploidObjs = runningDiploid(neutral=neutral, args=args, clonal=clonal)
@@ -564,8 +564,10 @@ def main(args=None):
             )
         else:
             sys.stderr.write(log('# Parsing given clonal copy numbers\n'))
+            clonal = parse_clonal_tetraploid(args['clonal'])
+
             clonal, scale = parseClonalClusters(
-                clonal=args['clonal'],
+                clonal=clonal,
                 fseg=fseg,
                 size=size,
                 samples=samples,
@@ -601,6 +603,127 @@ def main(args=None):
     )
 
 
+t2s = lambda x: [str(a) for a in x]
+
+
+def parse_clonal_diploid(clonal):
+    """
+    Given a list of clonal cluster copy numbers, this function tries to order them to be compatible
+    with the HATCHet C++ factorization module.
+    For diploid scaling, this module requires:
+    - the first cluster is indicated with copy-number 1,1
+    - the second cluster has a total copy number different from 2
+    """
+    clonal_list = [[int(a) for a in c.split(':')] for c in clonal.split(',')]
+    diploid_clusters = [t for t in clonal_list if t[1] == 1 and t[2] == 1]
+    if len(diploid_clusters) == 1:
+        my_diploid = diploid_clusters[0]
+    elif len(diploid_clusters) > 1:
+        if diploid_clusters[0] == clonal_list[0]:
+            my_diploid = diploid_clusters[0]
+        else:
+            raise ValueError(
+                error(
+                    'Found more than 1 cluster indicated as clonal (1,1) and '
+                    "neither was the first argument to 'clonal'. Please select one for scaling and place "
+                    'it first.'
+                )
+            )
+    else:
+        raise ValueError(
+            error(
+                "No cluster was indicated as (1,1) in argument to 'clonal' with "
+                "'diploid'=True. HATCHet solving module requires specification of (1,1) cluster for the "
+                "'clonal' argument to be used in this case."
+            )
+        )
+
+    candidate_second = [t for t in clonal_list if t[0] != my_diploid[0] and t[1] + t[2] != 2]
+    if len(candidate_second) == 0:
+        raise ValueError(
+            error(
+                'No cluster with total copy number different from 2 was included '
+                "in 'clonal'. At least one such cluster is required for this argument to be used with "
+                "'diploid'=True."
+            )
+        )
+    else:
+        my_second = candidate_second[0]
+    used_clusters = set([my_diploid[0], my_second[0]])
+
+    clonal = ','.join(
+        [
+            ':'.join(t2s(t))
+            for t in [my_diploid] + [my_second] + [t2 for t2 in clonal_list if t2[0] not in used_clusters]
+        ]
+    )
+    log(
+        msg=f' NOTE: cluster {my_diploid[0]} with copy number (1,1) will be used to scale fractional copy numbers.\n',
+        level='INFO',
+    )
+    log(msg=f" Reordered 'clonal' argument={clonal}.\n", level='INFO')
+
+    return clonal
+
+
+def parse_clonal_tetraploid(clonal):
+    """
+    Given a list of clonal cluster copy numbers, this function tries to order them to be compatible
+    with the HATCHet C++ factorization module.
+    For tetraploid scaling, this module requires:
+    - the first cluster is indicated with copy-number 2,2
+    - the second cluster has a total copy number different from 4
+    """
+    clonal_list = [[int(a) for a in c.split(':')] for c in clonal.split(',')]
+    wgd_clusters = [t for t in clonal_list if t[1] == 2 and t[2] == 2]
+    if len(wgd_clusters) == 1:
+        my_wgd = wgd_clusters[0]
+    elif len(wgd_clusters) > 1:
+        if wgd_clusters[0] == clonal_list[0]:
+            my_wgd = wgd_clusters[0]
+        else:
+            raise ValueError(
+                error(
+                    'Found more than 1 cluster indicated as clonal (2,2) and '
+                    "neither was the first argument to 'clonal'. Please select one "
+                    'for scaling and place it first.'
+                )
+            )
+    else:
+        raise ValueError(
+            error(
+                "No cluster was indicated as (2,2) in argument to 'clonal' with "
+                "'tetraploid'=True. HATCHet solving module requires specification of (2,2) cluster for the "
+                "'clonal' argument to be used in this case."
+            )
+        )
+
+    candidate_second = [t for t in clonal_list if t[0] != my_wgd[0] and t[1] + t[2] != 4]
+    if len(candidate_second) == 0:
+        raise ValueError(
+            error(
+                'No cluster with total copy number different from 4 was included '
+                "in 'clonal'. At least one such cluster is required for this "
+                "argument to be used with 'tetraploid'=True."
+            )
+        )
+    else:
+        my_second = candidate_second[0]
+    used_clusters = set([my_wgd[0], my_second[0]])
+
+    clonal = ','.join(
+        [':'.join(t2s(t)) for t in [my_wgd] + [my_second] + [t2 for t2 in clonal_list if t2[0] not in used_clusters]]
+    )
+    sys.stderr.write(
+        log(
+            f'# NOTE: cluster {my_wgd[0]} with copy number (2,2) and cluster {my_second[0]} with '
+            f'copy number ({my_second[1]}, {my_second[2]}) will be used to scale fractional copy numbers.\n'
+        )
+    )
+    sys.stderr.write(log(f"# Reordered 'clonal' argument={clonal}.\n"))
+    return clonal
+
+
 def readBBC(filename):
     samples = set()
     bbc = {}
@@ -1512,14 +1635,6 @@ def isfloat(value):
     return False
 
 
-def error(msg):
-    return '{}{}{}'.format('\033[91m\033[1m', msg, '\033[0m')
-
-
-def log(msg):
-    return '{}{}{}'.format('\033[95m\033[1m', msg, '\033[0m')
-
-
 def warning(msg):
     return '{}{}{}'.format('\033[93m', msg, '\033[0m')
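For reference, the reordering the two parsers above perform on a 'clonal' string of ID:copyA:copyB triples (behavior read off the code above; the cluster IDs are invented):

# Diploid: the (1,1) cluster is placed first, then a cluster whose
# total copy number differs from 2.
parse_clonal_diploid('5:3:1,3:1:1')       # -> '3:1:1,5:3:1'

# Tetraploid: the (2,2) cluster is placed first, then a cluster whose
# total copy number differs from 4.
parse_clonal_tetraploid('5:3:2,3:2:2')    # -> '3:2:2,5:3:2'

# A missing (1,1)/(2,2) cluster, or no usable second cluster,
# raises ValueError with an explanatory message.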
6 changes: 4 additions & 2 deletions src/hatchet/hatchet.ini
@@ -84,6 +84,8 @@ transmat = "diag"
 decoding = "map"
 covar = "diag"
 tau = 0.000001
+restarts = 10
+subset=
 
 [plot_bins]
 command =
@@ -132,8 +134,8 @@ minsize = 0.008
 minchrs = 1
 maxneutralshift = 0.01
 merge = False
-mergerdr = 0.08
-mergebaf = 0.04
+mergerdr = 0
+mergebaf = 0
 limitinc =
 ghostprop = 0.3
 tolerancerdr = 0.08
50 changes: 37 additions & 13 deletions src/hatchet/utils/ArgParsing.py
@@ -282,17 +282,9 @@ def parse_cluster_bins_args(args=None):
         default=config.cluster_bins.diploidbaf,
         help=(
             'Maximum diploid-BAF shift used to determine the largest copy-neutral cluster and to rescale all the '
-            'cluster inside this threshold accordingly (default: None, scaling is not performed)'
+            'cluster inside this threshold accordingly (default: 0.1)'
         ),
     )
-    parser.add_argument(
-        '-e',
-        '--seed',
-        type=int,
-        required=False,
-        default=config.cluster_bins.seed,
-        help='Random seed used for clustering (default: 0)',
-    )
     parser.add_argument(
         '--minK',
         type=int,
@@ -345,6 +337,25 @@ def parse_cluster_bins_args(args=None):
         default=config.cluster_bins.decoding,
         help='Decoding algorithm to use: map or viterbi (default: map)',
     )
+    parser.add_argument(
+        '-R',
+        '--restarts',
+        type=int,
+        required=False,
+        default=config.cluster_bins.restarts,
+        help='Number of restarts performed by the clustering to choose the best (default: 10)',
+    )
+    parser.add_argument(
+        '-S',
+        '--subset',
+        required=False,
+        default=config.cluster_bins.subset,
+        type=str,
+        nargs='+',
+        help='List of sample names to use as a subset of those included in binning (default: none, run on all samples)',
+    )
     parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {__version__}')
     args = parser.parse_args(args)
 
@@ -353,7 +364,6 @@ def parse_cluster_bins_args(args=None):
         (args.diploidbaf is None) or (0.0 <= args.diploidbaf <= 0.5),
         'The specified maximum for diploid-BAF shift must be a value in [0.0, 0.5]',
     )
-    ensure(args.seed >= 0, 'Seed parameter must be non-negative.')
 
     if args.exactK > 0:
         log(
@@ -391,13 +401,25 @@ def parse_cluster_bins_args(args=None):
         args.transmat in valid_transmats,
         f'Invalid -t/--transmat argument: {args.transmat}. Valid values are {valid_transmats}.',
     )
 
+    ensure(args.restarts >= 1, 'Number of restarts must be positive.')
-    ensure(args.tau >= 0, 'Transition parameter --tau must be non-negative.')
+    ensure(args.tau >= 6e-17, 'Transition parameter --tau must be at least 6e-17 to ensure numerical precision.')
+
+    if args.subset is not None:
+        import pandas as pd
+
+        args.subset = args.subset.split()
+        bb = pd.read_table(args.BBFILE)
+        samples = bb.SAMPLE.unique()
+        ensure(
+            all([a in samples for a in args.subset]),
+            'Samples indicated in "subset" must be present in input BB file. BB file: '
+            + f'{samples}, argument: {args.subset}',
+        )
+
     return {
         'bbfile': args.BBFILE,
         'diploidbaf': args.diploidbaf,
-        'seed': args.seed,
         'decoding': args.decoding,
         'minK': args.minK,
         'maxK': args.maxK,
@@ -407,6 +429,8 @@ def parse_cluster_bins_args(args=None):
         'tau': args.tau,
         'outsegments': args.outsegments,
         'outbins': args.outbins,
+        'restarts': args.restarts,
+        'subset': args.subset,
     }
 
 
@@ -2077,7 +2101,7 @@ def parse_cluster_bins_gmm_args(args=None):
     ensure(args.seed >= 0, 'Seed parameter must be positive!')
     ensure(args.initclusters >= 0, 'Init-cluster parameter must be positive!')
     ensure(args.concentration >= 0, 'Concentration parameter must be positive!')
-    ensure(args.restarts >= 0, 'Number of restarts must be positive!')
+    ensure(args.restarts >= 1, 'Number of restarts must be positive!')
 
     return {
         'bbfile': args.BBFILE,
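The 6e-17 floor on `--tau` lines up with double-precision limits: presumably self-transition probabilities of the form 1 - c*tau must stay strictly below 1.0, and any tau under about 5.6e-17 (a quarter of the float64 machine epsilon) is absorbed when subtracted from 1.0, so the transition matrix would degenerate. A quick check of that IEEE-754 behavior (illustration only, not HATCHet code):

import sys

eps = sys.float_info.epsilon      # ~2.22e-16 for float64
print(1.0 - 5e-17 == 1.0)         # True: 5e-17 is absorbed; tau would effectively vanish
print(1.0 - 6e-17 == 1.0)         # False: 6e-17 still perturbs 1.0
print(eps / 4)                    # ~5.55e-17, the absorption threshold just below 1.0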