diff --git a/docs/changelog.rst b/docs/changelog.rst
index ccf8ee12..498ae87b 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,17 @@
 Change Log
 ==========
 
+Inferelator v0.5.5 `April 29, 2021`
+-----------------------------------
+
+New Functionality:
+
+- Added ``.set_regression_parameters(tol=None)`` to parameterize tolerances in AMuSR regression
+
+Code Refactoring:
+
+- Profiled and optimized AMuSR code
+
 Inferelator v0.5.4 `April 23, 2021`
 -----------------------------------
 
diff --git a/docs/conf.py b/docs/conf.py
index e230a7ea..09556691 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,7 +23,7 @@
 author = 'Chris Jackson'
 
 # The full version, including alpha/beta/rc tags
-release = 'v0.5.4'
+release = 'v0.5.5'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/inferelator/crossvalidation_workflow.py b/inferelator/crossvalidation_workflow.py
index 154b0480..67db53ae 100644
--- a/inferelator/crossvalidation_workflow.py
+++ b/inferelator/crossvalidation_workflow.py
@@ -58,6 +58,7 @@ class CrossValidationManager(object):
     size_sample_stratified_column = None
     size_sample_with_replacement = False
     size_sample_seed = None
+    size_sample_only = False
 
     # Workflow storage
     _baseline_workflow = None
@@ -180,7 +181,8 @@ def add_grouping_dropin(self, metadata_column_name, group_size=None, seed=42):
         self.dropin_max_size = group_size
         self.dropin_seed = seed
 
-    def add_size_subsampling(self, size_vector, stratified_column_name=None, with_replacement=False, seed=42):
+    def add_size_subsampling(self, size_vector, stratified_column_name=None, with_replacement=False, seed=42,
+                             size_sample_only=None):
         """
         Resample expression data to a ratio of the original data.
 
@@ -206,6 +208,7 @@ def add_size_subsampling(self, size_vector, stratified_column_name=None, with_re
         self.size_sample_stratified_column = stratified_column_name
         self.size_sample_with_replacement = with_replacement
         self.size_sample_seed = seed
+        self.size_sample_only = size_sample_only if size_sample_only is not None else self.size_sample_only
 
     def run(self):
         """
@@ -222,7 +225,10 @@ def run(self):
         self._check_grid_search_params_exist()
 
         # Run base grid search
-        results = self._grid_search()
+        if self.size_sample_only:
+            results = []
+        else:
+            results = self._grid_search()
 
         # Run size sampling
         if self.size_sample_vector is not None:
@@ -382,8 +388,7 @@ def _grid_search(self, test=None, value=None, mask_function=None):
         # Drop any observations which are False in the mask (if set)
         if mask_function is not None:
             mask = mask_function()
-            cv_workflow.expression_matrix.drop(cv_workflow.expression_matrix.columns[~mask], axis=1, inplace=True)
-            cv_workflow.meta_data.drop(cv_workflow.meta_data.index[~mask], axis=0, inplace=True)
+            cv_workflow.data = cv_workflow.data.get_sample_data(mask.index[mask])
             n_obs = mask.sum()
         else:
             n_obs = cv_workflow._num_obs
@@ -448,7 +453,7 @@ def _check_metadata_column_exists(self, col_name):
         :param col_name: str
         """
 
-        if col_name in self.workflow.meta_data.columns:
+        if col_name in self.workflow.data.meta_data.columns:
             return True
         else:
             raise ValueError("Column {col} is not present in the loaded metadata".format(col=col_name))
@@ -458,7 +463,7 @@ def _dropout_cv(self):
         Run grid search on all data minus one group at a time
         """
 
-        meta_data = self.workflow.meta_data.copy()
+        meta_data = self.workflow.data.meta_data.copy()
 
         col = self.dropout_column
         max_size = self.dropout_max_size
@@ -511,7 +516,7 @@ def _dropin_cv(self):
         Run grid search on one group from the data at a time
         """
 
-        meta_data = self.workflow.meta_data.copy()
+        meta_data = self.workflow.data.meta_data.copy()
 
         col = self.dropin_column
         max_size = self.dropin_max_size
@@ -551,7 +556,7 @@ def _size_cv(self):
         for i, size_ratio in enumerate(self.size_sample_vector):
             rgen = np.random.RandomState(self.size_sample_seed + i)
 
-            meta_data = self.workflow.meta_data.copy()
+            meta_data = self.workflow.data.meta_data.copy()
 
             if self.size_sample_stratified_column is not None:
                 strat_col = self.size_sample_stratified_column
diff --git a/inferelator/distributed/dask_functions.py b/inferelator/distributed/dask_functions.py
index e54d2567..eff77de3 100644
--- a/inferelator/distributed/dask_functions.py
+++ b/inferelator/distributed/dask_functions.py
@@ -16,7 +16,8 @@
 
 
 def amusr_regress_dask(X, Y, priors, prior_weight, n_tasks, genes, tfs, G, remove_autoregulation=True,
-                       lambda_Bs=None, lambda_Ss=None, Cs=None, Ss=None, regression_function=None):
+                       lambda_Bs=None, lambda_Ss=None, Cs=None, Ss=None, regression_function=None,
+                       tol=None, rel_tol=None):
     """
     Execute multitask (AMUSR)
 
@@ -54,7 +55,7 @@ def regression_maker(j, x_df, y_list, prior, tf):
         prior = format_prior(prior, gene, tasks, prior_weight, tfs=tf)
 
         return j, regression_function(x, y, tf, tasks, gene, prior,
-                                      lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss, Cs=Cs, Ss=Ss)
+                                      lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss, Cs=Cs, Ss=Ss, tol=tol, rel_tol=rel_tol)
 
     def response_maker(y_df, i):
         y = []
diff --git a/inferelator/regression/amusr_regression.py b/inferelator/regression/amusr_regression.py
index 36c62f75..0ce64d0f 100644
--- a/inferelator/regression/amusr_regression.py
+++ b/inferelator/regression/amusr_regression.py
@@ -22,12 +22,13 @@
 
 MAX_ITER = 1000
 TOL = 1e-2
+REL_TOL = None
 MIN_WEIGHT_VAL = 0.1
 MIN_RSS = 1e-10
 
 
 def run_regression_EBIC(X, Y, TFs, tasks, gene, prior, Cs=None, Ss=None, lambda_Bs=None,
-                        lambda_Ss=None, scale_data=False, return_lambdas=False):
+                        lambda_Ss=None, scale_data=False, return_lambdas=False, tol=TOL, rel_tol=REL_TOL):
     """
     Run multitask regression. Search the regularization coefficient space and select the model with the lowest eBIC.
 
@@ -96,7 +97,8 @@ def run_regression_EBIC(X, Y, TFs, tasks, gene, prior, Cs=None, Ss=None, lambda_
 
         # Fit model
         combined_weights, sparse_matrix, block_matrix = amusr_fit(cov_C, cov_D, b, s,
-                                                                  sparse_matrix, block_matrix, prior)
+                                                                  sparse_matrix, block_matrix, prior,
+                                                                  tol=tol, rel_tol=rel_tol)
 
         # Score model
         ebic_score = ebic(X, Y, combined_weights, n_tasks, n_samples, n_preds)
@@ -138,10 +140,13 @@ class AMuSR_regression(base_regression.BaseRegression):
     Cs = None
     Ss = None
 
+    tol = None
+    rel_tol = None
+
     regression_function = staticmethod(run_regression_EBIC)
 
     def __init__(self, X, Y, tfs=None, genes=None, priors=None, prior_weight=1, remove_autoregulation=True,
-                 lambda_Bs=None, lambda_Ss=None, Cs=None, Ss=None):
+                 lambda_Bs=None, lambda_Ss=None, Cs=None, Ss=None, tol=TOL, rel_tol=REL_TOL):
         """
         Set up a regression object for multitask regression
         :param X: list(InferelatorData)
@@ -161,6 +166,8 @@ def __init__(self, X, Y, tfs=None, genes=None, priors=None, prior_weight=1, remo
         assert check.argument_type(tfs, (list, pd.Series, pd.Index), allow_none=True)
         assert check.argument_type(genes, (list, pd.Series, pd.Index), allow_none=True)
         assert check.argument_numeric(prior_weight)
+        assert check.argument_numeric(tol)
+        assert check.argument_numeric(rel_tol, allow_none=True)
         assert len(X) == len(Y)
 
         # Set the data into the regression object
@@ -196,6 +203,10 @@ def __init__(self, X, Y, tfs=None, genes=None, priors=None, prior_weight=1, remo
         self.Cs = Cs
         self.Ss = Ss
 
+        # Set the tolerances into the regression object
+        self.tol = tol
+        self.rel_tol = rel_tol
+
     def regress(self, regression_function=None):
         """
         Execute multitask (AMUSR)
@@ -209,7 +220,8 @@ def regress(self, regression_function=None):
             from inferelator.distributed.dask_functions import amusr_regress_dask
             return amusr_regress_dask(self.X, self.Y, self.priors, self.prior_weight, self.n_tasks, self.genes,
                                       self.tfs, self.G, remove_autoregulation=self.remove_autoregulation,
-                                      regression_function=regression_function)
+                                      regression_function=regression_function,
+                                      tol=self.tol, rel_tol=self.rel_tol)
 
         def regression_maker(j):
             level = 0 if j % 100 == 0 else 2
@@ -232,7 +244,8 @@ def regression_maker(j):
             prior = format_prior(self.priors, gene, tasks, self.prior_weight, tfs=tfs)
 
             return regression_function(x, y, tfs, tasks, gene, prior, Cs=self.Cs, Ss=self.Ss,
-                                        lambda_Bs=self.lambda_Bs, lambda_Ss=self.lambda_Ss)
+                                        lambda_Bs=self.lambda_Bs, lambda_Ss=self.lambda_Ss,
+                                        tol=self.tol, rel_tol=self.rel_tol)
 
         return MPControl.map(regression_maker, range(self.G))
@@ -260,7 +273,7 @@ def pileup_data(self, run_data):
         return weights, rescaled_weights
 
 def amusr_fit(cov_C, cov_D, lambda_B=0., lambda_S=0., sparse_matrix=None, block_matrix=None, prior=None,
-              max_iter=MAX_ITER, tol=TOL, min_weight=MIN_WEIGHT_VAL):
+              max_iter=MAX_ITER, tol=TOL, rel_tol=REL_TOL, rel_tol_min_iter=10, min_weight=MIN_WEIGHT_VAL):
     """
     Fits regression model in which the weights matrix W (predictors x tasks)
     is decomposed in two components: B that captures block structure across tasks
@@ -300,6 +313,7 @@ def amusr_fit(cov_C, cov_D, lambda_B=0., lambda_S=0., sparse_matrix=None, block_
     assert check.argument_type(max_iter, int)
     assert check.argument_type(tol, float)
     assert check.argument_type(min_weight, float)
+    assert check.argument_numeric(rel_tol, allow_none=True)
     assert cov_C.shape[0] == cov_D.shape[0]
     assert cov_C.shape[1] == cov_D.shape[1]
     assert cov_D.shape[1] == cov_D.shape[2]
@@ -321,6 +335,8 @@ def amusr_fit(cov_C, cov_D, lambda_B=0., lambda_S=0., sparse_matrix=None, block_
 
     # Initialize weights
     combined_weights = sparse_matrix + block_matrix
+
+    iter_tols = np.zeros(max_iter)
 
     for i in range(max_iter):
 
         # Keep a reference to the old combined_weights
@@ -334,9 +350,18 @@ def amusr_fit(cov_C, cov_D, lambda_B=0., lambda_S=0., sparse_matrix=None, block_
         combined_weights = sparse_matrix + block_matrix
 
         # If convergence tolerance reached, break loop and move on
-        if np.max(np.abs(combined_weights - _combined_weights_old)) < tol:
+        iter_tols[i] = np.max(np.abs(combined_weights - _combined_weights_old))
+
+        if iter_tols[i] < tol:
             break
 
+        # If the maximum change over the last few iterations is less than the relative tolerance, break loop and move on
+        if rel_tol is not None and (i > rel_tol_min_iter):
+            lb_start, lb_stop = i - rel_tol_min_iter, i
+            iter_rel_max = iter_tols[lb_start: lb_stop] - iter_tols[lb_start - 1: lb_stop - 1]
+            if np.max(iter_rel_max) < rel_tol:
+                break
+
     # Set small values of W to zero
     # Since we don't run the algorithm until update equals zero
     combined_weights[np.abs(combined_weights) < min_weight] = 0
@@ -351,28 +376,36 @@ def updateS(C, D, B, S, lamS, prior, n_tasks, n_features):
     """
     # update each task independently (shared penalty only)
     for k in range(n_tasks):
-        c = C[k]; d = D[k]
-        b = B[:,k]; s = S[:,k]
-        p = prior[:,k]
+
+        c = C[k]
+        d = D[k]
+
+        b = B[:, k]
+        s = S[:, k]
+        p = prior[:, k] * lamS
+
         # cycle through predictors
+
         for j in range(n_features):
+
             # set sparse coefficient for predictor j to zero
-            s_tmp = deepcopy(s)
-            s_tmp[j] = 0.
+            s[j] = 0.
+
             # calculate next coefficient based on fit only
             if d[j,j] == 0:
-                alpha = 0
+                alpha = 0.
             else:
-                alpha = (c[j]-np.sum((b+s_tmp)*d[j]))/d[j,j]
+                alpha = (c[j] - np.sum((b + s) * d[j])) / d[j,j]
+
             # lasso regularization
-            if abs(alpha) <= p[j]*lamS:
+            if abs(alpha) <= p[j]:
                 s[j] = 0.
             else:
-                s[j] = alpha-(np.sign(alpha)*p[j]*lamS)
+                s[j] = alpha - (np.sign(alpha) * p[j])
+
         # update current task
-        S[:,k] = s
+        S[:, k] = s
 
-    return(S)
+    return S
 
 def updateB(C, D, B, S, lamB, prior, n_tasks, n_features):
     """
@@ -382,28 +415,43 @@ def updateB(C, D, B, S, lamB, prior, n_tasks, n_features):
     reference: Liu et al, ICML 2009. Blockwise coordinate descent procedures
     for the multi-task lasso, with applications to neural semantic basis discovery.
     """
-    #p = prior.min(axis=1)
+
+    S = np.asarray(S, order="F")
+
     # cycles through predictors
     for j in range(n_features):
+
         # initialize next coefficients
         alphas = np.zeros(n_tasks)
+
+        # update tasks for each predictor together
+        d = D[:, :, j]
+
        for k in range(n_tasks):
-            # get task covariance update terms
-            c = C[k]; d = D[k]
-            # get previous block-sparse and sparse coefficients
-            b = B[:,k]; s = S[:,k]
-            # set block-sparse coefficient for feature j to zero
-            b_tmp = deepcopy(b)
-            b_tmp[j] = 0.
-            # calculate next coefficient based on fit only
-            if d[j,j] == 0:
+
+            d_kjj = d[k, j]
+
+            if d_kjj == 0:
+
                 alphas[k] = 0
+
             else:
-                alphas[k] = (c[j]-np.sum((b_tmp+s)*d[:,j]))/d[j,j]
+
+                # get previous block-sparse coefficients
+                # (B[:, k] is a view; zeroing b[j] here is safe because row j of B is fully reassigned below)
+                b = B[:, k]
+
+                # set block-sparse coefficient for feature j to zero
+                b[j] = 0.
+
+                # calculate next coefficient based on fit only
+                alphas[k] = (C[k, j] - np.sum((b + S[:, k]) * d[k, :])) / d_kjj
+
         # set all tasks to zero if l1-norm less than lamB
         if np.linalg.norm(alphas, 1) <= lamB:
             B[j,:] = np.zeros(n_tasks)
+
         # regularized update for predictors with larger l1-norm
         else:
             # find number of coefficients that would make l1-norm greater than penalty
@@ -424,7 +472,7 @@ def updateB(C, D, B, S, lamB, prior, n_tasks, n_features):
             # update current predictor
             B[j,:] = new_weights
 
-    return(B)
+    return B
 
 def _covariance_by_task(X, Y):
     """
@@ -723,7 +771,11 @@ class AMUSRRegressionWorkflowMixin(base_regression._MultitaskRegressionWorkflowM
     lambda_Ss = None
     heuristic_Cs = None
 
+    tol = TOL
+    relative_tol = REL_TOL
+
-    def set_regression_parameters(self, prior_weight=None, lambda_Bs=None, lambda_Ss=None, heuristic_Cs=None):
+    def set_regression_parameters(self, prior_weight=None, lambda_Bs=None, lambda_Ss=None, heuristic_Cs=None,
+                                  tol=None, relative_tol=None):
         """
         Set regression parameters for AmUSR.
 
@@ -744,12 +796,18 @@ def set_regression_parameters(self, prior_weight=None, lambda_Bs=None, lambda_Ss
             Defaults to np.logspace(np.log10(0.01), np.log10(10), 20)[::-1].
             Does not have an effect if lambda_B is provided.
         :type heuristic_Cs: list(floats) or np.ndarray(floats)
+        :param tol: Absolute convergence tolerance for AMuSR regression
+        :type tol: float
+        :param relative_tol: Relative convergence tolerance for AMuSR regression
+        :type relative_tol: float
         """
 
         self._set_with_warning("prior_weight", prior_weight)
         self._set_with_warning("lambda_Bs", lambda_Bs)
         self._set_with_warning("lambda_Ss", lambda_Ss)
         self._set_with_warning("heuristic_Cs", heuristic_Cs)
+        self._set_without_warning("tol", tol)
+        self._set_without_warning("relative_tol", relative_tol)
 
     def run_bootstrap(self, bootstrap_idx):
@@ -763,7 +821,7 @@ def run_bootstrap(self, bootstrap_idx):
         MPControl.sync_processes(pref="amusr_pre")
 
         regress = AMuSR_regression(x, y, tfs=self._regulators, genes=self._targets, priors=self._task_priors,
                                    prior_weight=self.prior_weight, lambda_Bs=self.lambda_Bs, lambda_Ss=self.lambda_Ss,
-                                   Cs=self.heuristic_Cs)
+                                   Cs=self.heuristic_Cs, tol=self.tol, rel_tol=self.relative_tol)
 
         return regress.run()
diff --git a/inferelator/tests/artifacts/test_data.py b/inferelator/tests/artifacts/test_data.py
index 9dac697b..b63a5e92 100644
--- a/inferelator/tests/artifacts/test_data.py
+++ b/inferelator/tests/artifacts/test_data.py
@@ -22,11 +22,12 @@ class TestDataSingleCellLike(object):
                                transpose_expression=True,
                                meta_data=TestDataSingleCellLike.meta_data,
                                gene_data=TestDataSingleCellLike.gene_metadata,
-                               gene_data_idx_column=TestDataSingleCellLike.gene_list_index)
+                               gene_data_idx_column=TestDataSingleCellLike.gene_list_index,
+                               sample_names=list(map(str, range(10))))
 
 TEST_DATA_SPARSE = InferelatorData(sps.csr_matrix(TestDataSingleCellLike.expression_matrix.T.values),
                                    gene_names=TestDataSingleCellLike.expression_matrix.index,
-                                   sample_names=TestDataSingleCellLike.expression_matrix.columns,
+                                   sample_names=list(map(str, range(10))),
                                    meta_data=TestDataSingleCellLike.meta_data,
                                    gene_data=TestDataSingleCellLike.gene_metadata,
                                    gene_data_idx_column=TestDataSingleCellLike.gene_list_index)
diff --git a/inferelator/tests/test_amusr.py b/inferelator/tests/test_amusr.py
index 7a02217e..1b2fdb85 100644
--- a/inferelator/tests/test_amusr.py
+++ b/inferelator/tests/test_amusr.py
@@ -347,7 +347,7 @@ def test_lamb_b(self):
                                              priors=self.workflow.priors_data, lambda_Bs=lamb_b)
 
         def is_passed(X, Y, TFs, tasks, gene, prior, Cs=None, Ss=None, lambda_Bs=None,
-                      lambda_Ss=None, scale_data=False):
+                      lambda_Ss=None, scale_data=False, tol=None, rel_tol=None):
 
             npt.assert_array_equal(lambda_Bs, lamb_b)
 
@@ -369,7 +369,7 @@ def test_lamb_s(self):
                                              priors=self.workflow.priors_data, lambda_Ss=lamb_s)
 
         def is_passed(X, Y, TFs, tasks, gene, prior, Cs=None, Ss=None, lambda_Bs=None,
-                      lambda_Ss=None, scale_data=False):
+                      lambda_Ss=None, scale_data=False, tol=None, rel_tol=None):
 
             npt.assert_array_equal(lambda_Ss, lamb_s)
 
@@ -391,7 +391,7 @@ def test_heuristic_c(self):
                                              priors=self.workflow.priors_data, Cs=set_Cs)
 
         def is_passed(X, Y, TFs, tasks, gene, prior, Cs=None, Ss=None, lambda_Bs=None,
-                      lambda_Ss=None, scale_data=False):
+                      lambda_Ss=None, scale_data=False, tol=None, rel_tol=None):
 
             npt.assert_array_equal(set_Cs, Cs)
diff --git a/inferelator/tests/test_crossvalidation_wrapper.py b/inferelator/tests/test_crossvalidation_wrapper.py
index b64faef4..27d2a5b5 100644
--- a/inferelator/tests/test_crossvalidation_wrapper.py
+++ b/inferelator/tests/test_crossvalidation_wrapper.py
@@ -7,9 +7,17 @@
 
 from inferelator import crossvalidation_workflow
 from inferelator.workflow import WorkflowBase
+from inferelator.utils.data import InferelatorData
+
+from numpy.random import default_rng
+
+fake_obsnames = list(map(str, range(1000)))
 
 fake_metadata = pd.DataFrame({"CONST": ["A"] * 1000,
-                              "VAR": ["A"] * 100 + ["B"] * 200 + ["C"] * 1 + ["D"] * 99 + ["E"] * 500 + ["F"] * 100})
+                              "VAR": ["A"] * 100 + ["B"] * 200 + ["C"] * 1 + ["D"] * 99 + ["E"] * 500 + ["F"] * 100},
+                             index=fake_obsnames)
+
+fake_data_object = InferelatorData(default_rng(12345).random(size=1000).reshape((1000, 1)), meta_data=fake_metadata, sample_names=fake_obsnames)
 
 TEMP_DIR = tempfile.gettempdir()
 TEMP_DIR_1 = os.path.join(TEMP_DIR, "test1")
@@ -29,7 +37,7 @@ class FakeWorkflow(WorkflowBase):
     seed = 10
 
     def __init__(self):
-        self.meta_data = fake_metadata.copy()
+        self.data = fake_data_object.copy()
 
     def run(self):
         return FakeResult()
@@ -219,8 +227,8 @@ def test_group_dropout_no_limit(self):
 
         def test_grid_search(slf, test=None, value=None, mask_function=None):
             self.assertEqual(test, "dropout")
-            self.assertTrue(value in slf.workflow.meta_data[slf.dropout_column].unique())
-            self.assertListEqual((slf.workflow.meta_data[slf.dropout_column] != value).tolist(),
+            self.assertTrue(value in slf.workflow.data.meta_data[slf.dropout_column].unique())
+            self.assertListEqual((slf.workflow.data.meta_data[slf.dropout_column] != value).tolist(),
                                  mask_function().tolist())
 
         self.cv._grid_search = types.MethodType(test_grid_search, self.cv)
@@ -235,10 +243,10 @@ def test_group_dropout_limit(self):
 
         def test_grid_search(slf, test=None, value=None, mask_function=None):
             self.assertEqual(test, "dropout")
-            uniques = slf.workflow.meta_data[slf.dropout_column].unique()
+            uniques = slf.workflow.data.meta_data[slf.dropout_column].unique()
             mask = mask_function()
 
-            unique_counts = slf.workflow.meta_data[slf.dropout_column].value_counts()
+            unique_counts = slf.workflow.data.meta_data[slf.dropout_column].value_counts()
             unique_counts[unique_counts > slf.dropout_max_size] = slf.dropout_max_size
 
             if value == "all":
@@ -247,7 +255,7 @@ def test_grid_search(slf, test=None, value=None, mask_function=None):
                 self.assertTrue(value in uniques)
                 unique_counts[value] = 0
                 self.assertEqual(unique_counts.sum(), mask.sum())
-                self.assertEqual(sum((self.cv.workflow.meta_data[self.cv.dropout_column] == value)[mask]), 0)
+                self.assertEqual(sum((self.cv.workflow.data.meta_data[self.cv.dropout_column] == value)[mask]), 0)
 
         self.cv._grid_search = types.MethodType(test_grid_search, self.cv)
@@ -261,8 +269,8 @@ def test_group_dropin_no_limit(self):
 
         def test_grid_search(slf, test=None, value=None, mask_function=None):
             self.assertEqual(test, "dropin")
-            self.assertTrue(value in slf.workflow.meta_data[slf.dropin_column].unique())
-            self.assertListEqual((slf.workflow.meta_data[slf.dropin_column] == value).tolist(),
+            self.assertTrue(value in slf.workflow.data.meta_data[slf.dropin_column].unique())
+            self.assertListEqual((slf.workflow.data.meta_data[slf.dropin_column] == value).tolist(),
                                  mask_function().tolist())
 
         self.cv._grid_search = types.MethodType(test_grid_search, self.cv)
@@ -283,9 +291,9 @@ def test_grid_search(slf, test=None, value=None, mask_function=None):
 
             if value == "all":
                 self.assertEqual(mask.sum(), slf.dropin_max_size)
             else:
-                self.assertTrue(value in slf.workflow.meta_data[slf.dropin_column].unique())
+                self.assertTrue(value in slf.workflow.data.meta_data[slf.dropin_column].unique())
 
-                self.assertEqual(min((slf.workflow.meta_data[slf.dropin_column] == value).sum(),
+                self.assertEqual(min((slf.workflow.data.meta_data[slf.dropin_column] == value).sum(),
                                      slf.dropin_max_size),
                                  mask.sum())
@@ -303,7 +311,7 @@ def test_grid_search(slf, test=None, value=None, mask_function=None):
             self.assertEqual(test, "size")
             self.assertTrue(value == "0.5")
 
-            self.assertEqual(max(int(slf.workflow.meta_data.shape[0] * float(value)), 1),
+            self.assertEqual(max(int(slf.workflow.data.meta_data.shape[0] * float(value)), 1),
                              mask_function().sum())
 
         self.cv._grid_search = types.MethodType(test_grid_search, self.cv)
@@ -320,8 +328,8 @@ def test_grid_search(slf, test=None, value=None, mask_function=None):
 
             self.assertTrue(value == "0.5")
             mask = mask_function()
 
-            for g in slf.workflow.meta_data[slf.size_sample_stratified_column].unique():
-                is_group = slf.workflow.meta_data[slf.size_sample_stratified_column] == g
+            for g in slf.workflow.data.meta_data[slf.size_sample_stratified_column].unique():
+                is_group = slf.workflow.data.meta_data[slf.size_sample_stratified_column] == g
                 self.assertEqual(max(int(is_group.sum() * float(value)), 1),
                                  mask[is_group].sum())
diff --git a/inferelator/tests/test_data_wrapper.py b/inferelator/tests/test_data_wrapper.py
index 6da7cc40..e076f932 100644
--- a/inferelator/tests/test_data_wrapper.py
+++ b/inferelator/tests/test_data_wrapper.py
@@ -361,5 +361,60 @@ def test_change_sparse(self):
         self.assertTrue(sparse.isspmatrix_csr(self.adata_sparse.expression_data))
 
 
+class TestSampling(TestWrapperSetup):
+
+    def setUp(self):
+        super(TestSampling, self).setUp()
+
+        self.adata.trim_genes()
+        self.adata_sparse.trim_genes()
+
+    def test_without_replacement(self):
+
+        new_adata = self.adata.get_random_samples(10, with_replacement=False)
+
+        new_sample_names = new_adata.sample_names.tolist()
+        new_sample_names.sort()
+
+        old_sample_names = self.adata.sample_names.tolist()
+        old_sample_names.sort()
+
+        self.assertListEqual(new_sample_names, old_sample_names)
+
+        with self.assertRaises(AssertionError):
+            self.assertListEqual(new_adata.sample_names.tolist(), self.adata.sample_names.tolist())
+
+        with self.assertRaises(ValueError):
+            self.adata.get_random_samples(100, with_replacement=False)
+
+        with self.assertRaises(ValueError):
+            self.adata.get_random_samples(0, with_replacement=False)
+
+        self.assertEqual(self.adata.get_random_samples(2, with_replacement=False).num_obs, 2)
+
+    def test_with_replacement(self):
+
+        new_adata = self.adata.get_random_samples(11, with_replacement=True, fix_names=True)
+        self.assertEqual(new_adata.num_obs, 11)
+
+        new_sample_names = new_adata.sample_names.tolist()
+        new_sample_names.sort()
+
+        old_sample_names = self.adata.sample_names.tolist()
+        old_sample_names.sort()
+
+        with self.assertRaises(AssertionError):
+            self.assertListEqual(new_sample_names, old_sample_names)
+
+        with self.assertRaises(ValueError):
+            self.adata.get_random_samples(0, with_replacement=True)
+
+        self.assertEqual(self.adata.get_random_samples(200, with_replacement=True).num_obs, 200)
+
+    def test_inplace(self):
+
+        new_adata = self.adata.get_random_samples(11, with_replacement=True, fix_names=False, inplace=True)
+        self.assertEqual(id(new_adata), id(self.adata))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/inferelator/utils/data.py b/inferelator/utils/data.py
index 756c7310..65c2f135 100644
--- a/inferelator/utils/data.py
+++ b/inferelator/utils/data.py
@@ -11,7 +11,8 @@
 from sklearn.preprocessing import StandardScaler
 import scipy.io
 from anndata import AnnData
-from inferelator.utils import Debug, Validator
+from inferelator.utils import Debug
+from inferelator.utils import Validator as check
 
 
 def dot_product(a, b, dense=True, cast=True):
@@ -598,9 +599,64 @@ def get_sample_data(self, sample_index, copy=False, force_dense=False, to_df=Fal
         return pd.DataFrame(x, columns=self.gene_names, index=labels) if to_df else x
 
     def get_bootstrap(self, sample_bootstrap_index):
-        return InferelatorData(expression_data=self._adata[sample_bootstrap_index, :].X.copy(),
+        return InferelatorData(expression_data=self._adata.X[sample_bootstrap_index, :].copy(),
                                gene_names=self.gene_names)
 
+    def get_random_samples(self, num_obs, with_replacement=False, random_seed=None, random_gen=None, inplace=False,
+                           fix_names=True):
+        """
+        Randomly sample to a specific number of observations from the entire data set
+
+        :param num_obs: Number of observations to return
+        :type num_obs: int
+        :param with_replacement: Sample with replacement, defaults to False
+        :type with_replacement: bool, optional
+        :param random_seed: Seed for numpy random generator, defaults to None. Will be ignored if a generator itself is
+            passed to random_gen.
+        :type random_seed: int, optional
+        :param random_gen: Numpy random generator to use, defaults to None.
+        :type random_gen: np.random.Generator, optional
+        :param inplace: Change this instance of the data structure inplace and return a reference to itself
+        :type inplace: bool, optional
+        :param fix_names: Make duplicate sample names unique if sampling with replacement, defaults to True
+        :type fix_names: bool, optional
+        """
+
+        check.argument_integer(num_obs, low=1)
+        check.argument_integer(random_seed, allow_none=True)
+
+        if (num_obs > self.num_obs) and not with_replacement:
+            _msg = "Unable to sample {x} from {y} observations without replacement".format(x=num_obs, y=self.num_obs)
+            raise ValueError(_msg)
+
+        # Make a new random generator if not provided
+        if random_gen is None:
+            random_gen = np.random.default_rng() if random_seed is None else np.random.default_rng(random_seed)
+
+        # Sample with replacement using randint
+        if with_replacement:
+            keeper_ilocs = random_gen.integers(self.num_obs, size=(num_obs,))
+
+        # Sample without replacement using choice
+        else:
+            keeper_ilocs = random_gen.choice(np.arange(self.num_obs), size=(num_obs,), replace=False)
+
+        # Change this instance's _adata (explicit copy allows the old data to be dereferenced instead of held as view)
+        if inplace:
+            self._adata = self._adata[keeper_ilocs, :].copy()
+            return_obj = self
+
+        # Create a new InferelatorData instance with the _adata slice
+        else:
+            return_obj = InferelatorData(self._adata[keeper_ilocs, :].copy())
+
+        # Make the sample names unique if sampled with replacement
+        if with_replacement and fix_names:
+            return_obj._adata.obs_names_make_unique()
+
+        return return_obj
+
+
     def subset_copy(self, row_index=None, column_index=None):
 
         if row_index is not None and column_index is not None:
diff --git a/inferelator/utils/profiler.py b/inferelator/utils/profiler.py
new file mode 100644
index 00000000..b2d287a5
--- /dev/null
+++ b/inferelator/utils/profiler.py
@@ -0,0 +1,36 @@
+import psutil, argparse, time, csv
+
+def main():
+    """
+    Module that produces a profile of process memory usage and saves it to a file
+    """
+
+    ap = argparse.ArgumentParser(description="Profile the memory usage of a running process and write it to a TSV file")
+
+    # ARGUMENTS ########################################################################################################
+
+    ap.add_argument("-p", dest="pid", help="Process ID", type=int, required=True)
+    ap.add_argument("-o", dest="out", help="Output TSV", metavar="FILE", required=True)
+    ap.add_argument("-t", dest="timer", help="Time interval", type=int, default=1)
+    args = ap.parse_args()
+
+    pid, t_step = args.pid, args.timer
+
+    time_start = time.time()
+    with open(args.out, mode="w") as out_fh:
+        csv_handler = csv.writer(out_fh, delimiter="\t", lineterminator="\n", quoting=csv.QUOTE_NONE)
+        csv_handler.writerow(["Time", "Memory"])
+
+        while psutil.pid_exists(pid):
+            csv_handler.writerow(['%.3f' % (time.time() - time_start), int(get_current_memory(pid) / 1e6)])
+            out_fh.flush()
+            time.sleep(t_step)
+
+
+def get_current_memory(proc_id):
+    proc = psutil.Process(proc_id)
+    return proc.memory_info().rss
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
index 49ad4503..0dd12e14 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 from setuptools import setup, find_packages
 
 # Current Inferelator Version Number
-version = "0.5.4"
+version = "0.5.5"
 
 # Description from README.md
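Usage note (not part of the patch): the new tolerances are exposed on the multitask workflow through set_regression_parameters. A minimal sketch, assuming the standard inferelator workflow constructor; the workflow setup and the values shown here are illustrative, not code from this diff:

    from inferelator import inferelator_workflow

    # Build a multitask (AMuSR) workflow; data loading is omitted here
    worker = inferelator_workflow(regression="amusr", workflow="multitask")

    # tol is the absolute convergence tolerance used inside amusr_fit;
    # relative_tol adds an early stop when the update size has stopped
    # changing over the last several iterations (None disables it)
    worker.set_regression_parameters(tol=0.01, relative_tol=1e-4)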
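A similar sketch for the new InferelatorData.get_random_samples method added in inferelator/utils/data.py; the toy matrix and names below are made up for illustration:

    import numpy as np
    from inferelator.utils.data import InferelatorData

    # Toy dataset: 100 observations x 5 genes with string sample names
    data = InferelatorData(np.random.default_rng(42).random((100, 5)),
                           gene_names=["gene" + str(i) for i in range(5)],
                           sample_names=list(map(str, range(100))))

    # Downsample to 50 observations without replacement
    smaller = data.get_random_samples(50, with_replacement=False, random_seed=100)

    # Bootstrap 200 observations with replacement; fix_names=True makes the
    # duplicated sample names unique in the returned object
    bootstrap = data.get_random_samples(200, with_replacement=True, fix_names=True)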
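For readers tracing the new stopping rule in amusr_fit: the relative-tolerance check can be restated as a standalone function. This is a paraphrase of the loop logic in the patch, not code from the repository:

    import numpy as np

    def rel_tol_reached(iter_tols, i, rel_tol, rel_tol_min_iter=10):
        # iter_tols[i] is the largest absolute weight update at iteration i
        if rel_tol is None or i <= rel_tol_min_iter:
            return False
        # differences between consecutive update sizes over the lookback window
        deltas = iter_tols[i - rel_tol_min_iter:i] - iter_tols[i - rel_tol_min_iter - 1:i - 1]
        # stop when no recent iteration changed the update size by rel_tol or more
        return np.max(deltas) < rel_tol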