Add mirrored haplotype frequency inference (#181)
* Fix bug with phase_snps and chrnotation

* Allow mhBAF values above 0.5

* Fixed genotype_snps error message

* Set merging to 0 by default just to make sure

* Update default value in ArgParsing

* Repeat clustering 10 times per K

* Added 'restarts' option to cluster_bins

* Add sample subsetting to cluster-bins

* Add warnings for low tau

* Updated plotting to respect mhBAF

* diploidbaf now requires all samples to be close to 0.5

* Update plotting for mhbaf

* Fix count_alleles error in error msg

* Update compute_cn 'clonal' argument parsing

* Minor update to docstring

* Add -pthread flag to CMakeLists

* Fix issue with sexchroms in combine_counts_fw

* Update test data for mirror/mhbaf

* Trim trailing whitespace

* Formatting and unused imports

* More formatting

* Add correction for haplotype switching on chromosome arms

* Update default silhouette score to avoid no-solution issues

* Update tests (add new bb column for haplotype correction)

* Source file formatting

* Minor change to code formatting

* Bump minor version number to 1.2

* Count haplotype switches using imbal. samples only

* Fix doubled log messages
mmyers1 authored Jan 25, 2023
1 parent e7b364d commit d3b38e4
Showing 19 changed files with 10,068 additions and 9,660 deletions.
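The headline change is support for mirrored-haplotype BAF (mhBAF). The classic mirrored BAF folds every bin to min(BAF, 1 - BAF) <= 0.5, discarding which haplotype is amplified or lost; mhBAF instead tracks the frequency of one fixed, phased haplotype per chromosome arm, so values above 0.5 are meaningful (hence the "Allow mhBAF values above 0.5" and haplotype-switch-correction commits). A minimal sketch of the distinction, illustrative only and not HATCHet's actual implementation:

import numpy as np

def mirrored_baf(b_reads, total_reads):
    # Classic mBAF: fold to <= 0.5; which haplotype is affected is lost.
    baf = b_reads.sum() / total_reads.sum()
    return min(baf, 1 - baf)

def mirrored_haplotype_baf(hap_reads, total_reads):
    # mhBAF: frequency of one fixed, phased haplotype; may exceed 0.5,
    # so the same event keeps the same direction across samples.
    return hap_reads.sum() / total_reads.sum()

# Toy bin where the tracked haplotype is amplified:
hap = np.array([60, 55, 70])    # reads carrying the tracked haplotype per SNP
tot = np.array([100, 90, 110])
print(mirrored_baf(tot - hap, tot))        # 0.383..., folded below 0.5
print(mirrored_haplotype_baf(hap, tot))    # 0.616..., above 0.5 is allowed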
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -4,7 +4,7 @@ project( HATCHet )
 
 set( CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR} ${CMAKE_MODULE_PATH} )
 
-set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -std=c++11" )
 
 # add the automatically determined parts of the RPATH
 # which point to directories outside the build tree to the install RPATH
2 changes: 1 addition & 1 deletion docs/source/doc_cluster_bins.md
@@ -67,4 +67,4 @@ Particularly, `tau` controls the balance between global information (RDR and BAF
 
 | Name | Description | Usage | Default |
 |------|-------------|-------|---------|
-| `-e`, `--seed` | Random number generator seed used in model fitting | 0 |
+| `-R`, `--restarts` | Number of restarts (initializations) | For each K, the HMM is initialized randomly this many times and the solution with the highest log-likelihood is kept | 10 |
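The new `-R`/`--restarts` option trades runtime for robustness: random HMM initializations can land in poor local optima, so each K is fit several times and only the best-likelihood solution is kept. A rough sketch of the strategy (the `fit_hmm` helper is hypothetical, not HATCHet's API):

import numpy as np

def fit_with_restarts(data, K, restarts=10, seed=0):
    # Fit an HMM with K states `restarts` times from random initializations
    # and keep the solution with the highest log-likelihood.
    rng = np.random.default_rng(seed)
    best_model, best_ll = None, -np.inf
    for _ in range(restarts):
        model, ll = fit_hmm(data, K, seed=int(rng.integers(2**32)))  # hypothetical helper
        if ll > best_ll:
            best_model, best_ll = model, ll
    return best_model, best_ll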
4 changes: 2 additions & 2 deletions docs/source/doc_compute_cn.md
@@ -93,8 +93,8 @@ This heuristic can be controlled by the following parameters:
 | `-tR`, `--toleranceRDR` | Maximum RDR tolerance | The maximum RDR tolerance used by the heuristic when estimating the position of all clonal clusters | 0.04 |
 | `-tB`, `--toleranceBAF` | Maximum BAF tolerance | The maximum BAF tolerance used by the heuristic when estimating the position of all clonal clusters | 0.03 |
 | `--merge` | Activate merging of clusters | When activated, the heuristic will merge together clusters that appear to have the same values of RDR and BAF, according to the values below. This procedure can help the heuristic by refining the clustering and merging clusters that are likely to have the same copy-number states and unlikely to be clonal. | False, not used |
-| `-mR`, `--mergeRDR` | RDR merging threshold | The maximum difference in RDR considered by the merging procedure | 0.08 |
-| `-mB`, `--mergeBAF` | BAF merging threshold | The maximum difference in BAF considered by the merging procedure | 0.04 |
+| `-mR`, `--mergeRDR` | RDR merging threshold | The maximum difference in RDR considered by the merging procedure | 0 |
+| `-mB`, `--mergeBAF` | BAF merging threshold | The maximum difference in BAF considered by the merging procedure | 0 |
 
 ## Simultaneous factorization
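Note the new defaults: with both merging thresholds at 0 (matching the "Set merging to 0 by default" commit), the optional merging step only collapses clusters with identical centers, i.e. it is effectively off until the user raises the thresholds. An illustrative rule of the kind these thresholds control, not compute_cn's exact procedure:

def should_merge(c1, c2, merge_rdr=0.0, merge_baf=0.0):
    # Merge two clusters only when their RDR and BAF centers both
    # fall within the given thresholds (illustrative, simplified rule).
    return (abs(c1['rdr'] - c2['rdr']) <= merge_rdr
            and abs(c1['baf'] - c2['baf']) <= merge_baf)

a = {'rdr': 1.00, 'baf': 0.50}
b = {'rdr': 1.05, 'baf': 0.48}
print(should_merge(a, b))                                  # False with the new 0/0 defaults
print(should_merge(a, b, merge_rdr=0.08, merge_baf=0.04))  # True with the old defaults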
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "hatchet"
-version = "1.1.2"
+version = "1.2.1"
 authors = [
     { name="Simone Zaccaria", email="[email protected]" },
     { name="Ben Raphael", email="[email protected]" },
2 changes: 1 addition & 1 deletion src/hatchet/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.1.2'
+__version__ = '1.2.1'
 
 import os.path
 from importlib.resources import path
153 changes: 134 additions & 19 deletions src/hatchet/bin/HATCHet.py
@@ -8,7 +8,7 @@
 
 import hatchet
 from hatchet import config, __version__
-from hatchet.utils.Supporting import ensure
+from hatchet.utils.Supporting import ensure, log, error
 from hatchet.utils.solve import solve
 from hatchet.utils.solve.utils import segmentation

@@ -453,6 +453,14 @@ def main(args=None):
     )
 
     if not args['diploid'] and not args['tetraploid']:
+        sys.stderr.write(
+            log(
+                '## Neither "diploid" nor "tetraploid" was fixed. '
+                'Model selection will be performed and the "clonal" argument '
+                'will not be used for scaling.\n'
+            )
+        )
+
         sys.stderr.write(log('# Finding the neutral diploid/tetraploid cluster\n'))
         neutral = findNeutralCluster(seg=fseg, size=size, td=args['td'], samples=samples, v=args['v'])
@@ -522,19 +530,11 @@ def main(args=None):
         sys.stderr.write(log('# Running diploid\n'))
 
         if args['clonal'] is not None:
-            sys.stderr.write(log('# Using the first given clonal cluster for diploid scaling\n'))
-            clonal = args['clonal']
-            first_clonal = clonal.split(',')[0]
-            tkns = first_clonal.split(':')
-            assert (
-                int(tkns[1]) == 1 and int(tkns[2]) == 1
-            ), 'First clonal cluster must be the diploid cluster used for scaling'
-
+            clonal = parse_clonal_diploid(args['clonal'])
             neutral = None
         else:
             sys.stderr.write(log('# Finding the neutral diploid cluster\n'))
             neutral = findNeutralCluster(seg=fseg, size=size, td=args['td'], samples=samples, v=args['v'])
-
             clonal = None
 
         diploidObjs = runningDiploid(neutral=neutral, args=args, clonal=clonal)
@@ -564,8 +564,10 @@ def main(args=None):
             )
         else:
             sys.stderr.write(log('# Parsing given clonal copy numbers\n'))
+            clonal = parse_clonal_tetraploid(args['clonal'])
+
             clonal, scale = parseClonalClusters(
-                clonal=args['clonal'],
+                clonal=clonal,
                 fseg=fseg,
                 size=size,
                 samples=samples,
@@ -601,6 +603,127 @@ def main(args=None):
     )
 
 
+t2s = lambda x: [str(a) for a in x]
+
+
+def parse_clonal_diploid(clonal):
+    """
+    Given a list of clonal cluster copy numbers, this function tries to order them to be compatible
+    with the HATCHet C++ factorization module.
+    For diploid scaling, this module requires:
+    - the first cluster is indicated with copy-number 1,1
+    - the second cluster has a total copy number different from 2
+    """
+    clonal_list = [[int(a) for a in c.split(':')] for c in clonal.split(',')]
+    diploid_clusters = [t for t in clonal_list if t[1] == 1 and t[2] == 1]
+    if len(diploid_clusters) == 1:
+        my_diploid = diploid_clusters[0]
+    elif len(diploid_clusters) > 1:
+        if diploid_clusters[0] == clonal_list[0]:
+            my_diploid = diploid_clusters[0]
+        else:
+            raise ValueError(
+                error(
+                    'Found more than 1 cluster indicated as clonal (1,1) and '
+                    "neither was the first argument to 'clonal'. Please select one for scaling and place "
+                    'it first.'
+                )
+            )
+    else:
+        raise ValueError(
+            error(
+                "No cluster was indicated as (1,1) in argument to 'clonal' with "
+                "'diploid'=True. HATCHet solving module requires specification of (1,1) cluster for the "
+                "'clonal' argument to be used in this case."
+            )
+        )
+
+    candidate_second = [t for t in clonal_list if t[0] != my_diploid[0] and t[1] + t[2] != 2]
+    if len(candidate_second) == 0:
+        raise ValueError(
+            error(
+                'No cluster with total copy number different from 2 was included '
+                "in 'clonal'. At least one such cluster is required for this argument to be used with "
+                "'diploid'=True."
+            )
+        )
+    else:
+        my_second = candidate_second[0]
+    used_clusters = set([my_diploid[0], my_second[0]])
+
+    clonal = ','.join(
+        [
+            ':'.join(t2s(t))
+            for t in [my_diploid] + [my_second] + [t2 for t2 in clonal_list if t2[0] not in used_clusters]
+        ]
+    )
+    log(
+        msg=f' NOTE: cluster {my_diploid[0]} with copy number (1,1) will be used to scale fractional copy numbers.\n',
+        level='INFO',
+    )
+    log(msg=f" Reordered 'clonal' argument={clonal}.\n", level='INFO')
+
+    return clonal
+
+
+def parse_clonal_tetraploid(clonal):
+    """
+    Given a list of clonal cluster copy numbers, this function tries to order them to be compatible
+    with the HATCHet C++ factorization module.
+    For tetraploid scaling, this module requires:
+    - the first cluster is indicated with copy-number 2,2
+    - the second cluster has a total copy number different from 4
+    """
+    clonal_list = [[int(a) for a in c.split(':')] for c in clonal.split(',')]
+    wgd_clusters = [t for t in clonal_list if t[1] == 2 and t[2] == 2]
+    if len(wgd_clusters) == 1:
+        my_wgd = wgd_clusters[0]
+    elif len(wgd_clusters) > 1:
+        if wgd_clusters[0] == clonal_list[0]:
+            my_wgd = wgd_clusters[0]
+        else:
+            raise ValueError(
+                error(
+                    'Found more than 1 cluster indicated as clonal (2,2) and '
+                    "neither was the first argument to 'clonal'. Please select one "
+                    'for scaling and place it first.'
+                )
+            )
+    else:
+        raise ValueError(
+            error(
+                "No cluster was indicated as (2,2) in argument to 'clonal' with "
+                "'tetraploid'=True. HATCHet solving module requires specification of (2,2) cluster for the "
+                "'clonal' argument to be used in this case."
+            )
+        )
+
+    candidate_second = [t for t in clonal_list if t[0] != my_wgd[0] and t[1] + t[2] != 4]
+    if len(candidate_second) == 0:
+        raise ValueError(
+            error(
+                'No cluster with total copy number different from 4 was included '
+                "in 'clonal'. At least one such cluster is required for this "
+                "argument to be used with 'tetraploid'=True."
+            )
+        )
+    else:
+        my_second = candidate_second[0]
+    used_clusters = set([my_wgd[0], my_second[0]])
+
+    clonal = ','.join(
+        [':'.join(t2s(t)) for t in [my_wgd] + [my_second] + [t2 for t2 in clonal_list if t2[0] not in used_clusters]]
+    )
+    sys.stderr.write(
+        log(
+            f'# NOTE: cluster {my_wgd[0]} with copy number (2,2) and cluster {my_second[0]} with '
+            f'copy number ({my_second[1]}, {my_second[2]}) will be used to scale fractional copy numbers.\n'
+        )
+    )
+    sys.stderr.write(log(f"# Reordered 'clonal' argument={clonal}.\n"))
+    return clonal
+
+
 def readBBC(filename):
     samples = set()
     bbc = {}
@@ -1512,14 +1635,6 @@ def isfloat(value):
     return False
 
 
-def error(msg):
-    return '{}{}{}'.format('\033[91m\033[1m', msg, '\033[0m')
-
-
-def log(msg):
-    return '{}{}{}'.format('\033[95m\033[1m', msg, '\033[0m')
-
-
 def warning(msg):
     return '{}{}{}'.format('\033[93m', msg, '\033[0m')
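For reference, the reordering the two parsers above perform on a 'clonal' string of ID:copyA:copyB triples (behavior read off the code above; the cluster IDs are invented):

# Diploid: the (1,1) cluster is placed first, then a cluster whose
# total copy number differs from 2.
parse_clonal_diploid('5:3:1,3:1:1')       # -> '3:1:1,5:3:1'

# Tetraploid: the (2,2) cluster is placed first, then a cluster whose
# total copy number differs from 4.
parse_clonal_tetraploid('5:3:2,3:2:2')    # -> '3:2:2,5:3:2'

# A missing (1,1)/(2,2) cluster, or no usable second cluster,
# raises ValueError with an explanatory message.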
6 changes: 4 additions & 2 deletions src/hatchet/hatchet.ini
@@ -84,6 +84,8 @@ transmat = "diag"
 decoding = "map"
 covar = "diag"
 tau = 0.000001
+restarts = 10
+subset=
 
 [plot_bins]
 command =
@@ -132,8 +134,8 @@ minsize = 0.008
 minchrs = 1
 maxneutralshift = 0.01
 merge = False
-mergerdr = 0.08
-mergebaf = 0.04
+mergerdr = 0
+mergebaf = 0
 limitinc =
 ghostprop = 0.3
 tolerancerdr = 0.08
50 changes: 37 additions & 13 deletions src/hatchet/utils/ArgParsing.py
@@ -282,17 +282,9 @@ def parse_cluster_bins_args(args=None):
         default=config.cluster_bins.diploidbaf,
         help=(
             'Maximum diploid-BAF shift used to determine the largest copy-neutral cluster and to rescale all the '
-            'cluster inside this threshold accordingly (default: None, scaling is not performed)'
+            'cluster inside this threshold accordingly (default: 0.1)'
         ),
     )
-    parser.add_argument(
-        '-e',
-        '--seed',
-        type=int,
-        required=False,
-        default=config.cluster_bins.seed,
-        help='Random seed used for clustering (default: 0)',
-    )
     parser.add_argument(
         '--minK',
         type=int,
@@ -345,6 +337,25 @@ def parse_cluster_bins_args(args=None):
         default=config.cluster_bins.decoding,
         help='Decoding algorithm to use: map or viterbi (default: map)',
     )
+    parser.add_argument(
+        '-R',
+        '--restarts',
+        type=int,
+        required=False,
+        default=config.cluster_bins.restarts,
+        help='Number of restarts performed by the clustering to choose the best (default: 10)',
+    )
+    parser.add_argument(
+        '-S',
+        '--subset',
+        required=False,
+        default=config.cluster_bins.subset,
+        type=str,
+        nargs='+',
+        help='List of sample names to use as a subset of those included in binning (default: none, run on all samples)',
+    )
     parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {__version__}')
     args = parser.parse_args(args)
 
@@ -353,7 +364,6 @@ def parse_cluster_bins_args(args=None):
         (args.diploidbaf is None) or (0.0 <= args.diploidbaf <= 0.5),
         'The specified maximum for diploid-BAF shift must be a value in [0.0, 0.5]',
     )
-    ensure(args.seed >= 0, 'Seed parameter must be non-negative.')
 
     if args.exactK > 0:
         log(
@@ -391,13 +401,25 @@ def parse_cluster_bins_args(args=None):
         args.transmat in valid_transmats,
         f'Invalid -t/--transmat argument: {args.transmat}. Valid values are {valid_transmats}.',
     )
 
+    ensure(args.restarts >= 1, 'Number of restarts must be positive.')
-    ensure(args.tau >= 0, 'Transition parameter --tau must be non-negative.')
+    ensure(args.tau >= 6e-17, 'Transition parameter --tau must be at least 6e-17 to ensure numerical precision.')
+
+    if args.subset is not None:
+        import pandas as pd
+
+        args.subset = args.subset.split()
+        bb = pd.read_table(args.BBFILE)
+        samples = bb.SAMPLE.unique()
+        ensure(
+            all([a in samples for a in args.subset]),
+            'Samples indicated in "subset" must be present in input BB file. BB file: '
+            + f'{samples}, argument: {args.subset}',
+        )
+
     return {
         'bbfile': args.BBFILE,
         'diploidbaf': args.diploidbaf,
-        'seed': args.seed,
         'decoding': args.decoding,
         'minK': args.minK,
         'maxK': args.maxK,
@@ -407,6 +429,8 @@ def parse_cluster_bins_args(args=None):
         'tau': args.tau,
         'outsegments': args.outsegments,
         'outbins': args.outbins,
+        'restarts': args.restarts,
+        'subset': args.subset,
     }
 
 
@@ -2077,7 +2101,7 @@ def parse_cluster_bins_gmm_args(args=None):
     ensure(args.seed >= 0, 'Seed parameter must be positive!')
     ensure(args.initclusters >= 0, 'Init-cluster parameter must be positive!')
     ensure(args.concentration >= 0, 'Concentration parameter must be positive!')
-    ensure(args.restarts >= 0, 'Number of restarts must be positive!')
+    ensure(args.restarts >= 1, 'Number of restarts must be positive!')
 
     return {
         'bbfile': args.BBFILE,
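The 6e-17 floor on `--tau` lines up with double-precision limits: presumably self-transition probabilities of the form 1 - c*tau must stay strictly below 1.0, and any tau under about 5.6e-17 (a quarter of the float64 machine epsilon) is absorbed when subtracted from 1.0, so the transition matrix would degenerate. A quick check of that IEEE-754 behavior (illustration only, not HATCHet code):

import sys

eps = sys.float_info.epsilon      # ~2.22e-16 for float64
print(1.0 - 5e-17 == 1.0)         # True: 5e-17 is absorbed; tau would effectively vanish
print(1.0 - 6e-17 == 1.0)         # False: 6e-17 still perturbs 1.0
print(eps / 4)                    # ~5.55e-17, the absorption threshold just below 1.0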