Skip to content

Commit

Permalink
Fixes to the loom export functionality.
Browse files Browse the repository at this point in the history
  • Loading branch information
bramvds committed Apr 17, 2018
1 parent 796277a commit 6951b6a
Showing 1 changed file with 30 additions and 13 deletions.
43 changes: 30 additions & 13 deletions src/pyscenic/export.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# coding=utf-8

import os
import json
import numpy as np
import pandas as pd
import loompy as lp
Expand All @@ -26,18 +27,21 @@ def export2loom(ex_mtx: pd.DataFrame, regulons: List[Regulon], cell_annotations:
# Information on the general loom file format: http://linnarssonlab.org/loompy/format/index.html
# Information on the SCope specific alterations: https://github.com/aertslab/SCope/wiki/Data-Format

# TODO: Add regulon thresholds: not mandatory!
# TODO: Not mandatory but adding a section "regulonThresholds" to the general metadata would give
# TODO: additional information to the SCope tool to preset a threshold on the AUC distribution of a regulon
# TODO: across cells and help with binarization, i.e. deciding if the regulon is "on" or "off" in a cell.

# Calculate regulon enrichment per cell using AUCell.
auc_mtx = aucell(ex_mtx, regulons, num_cores) # (n_cells x n_regulons)
auc_mtx = aucell(ex_mtx, regulons, num_cores=num_cores) # (n_cells x n_regulons)

# Create an embedding based on UMAP (similar to tSNE but faster).
umap_embedding_mtx = UMAP().fit_transform(auc_mtx) # (n_cells, 2)
umap_embedding_mtx = pd.DataFrame(data=UMAP().fit_transform(auc_mtx),
index=ex_mtx.index, columns=['UMAP1', 'UMAP2']) # (n_cells, 2)

# Calculate the number of genes per cell.
binary_mtx = ex_mtx.copy()
binary_mtx[binary_mtx != 0] = 1.0
ngenes = binary_mtx.sum(axis=1)
ngenes = binary_mtx.sum(axis=1).astype(int)

# Encode genes in regulons as "binary" membership matrix.
genes = np.array(ex_mtx.columns)
Expand All @@ -50,6 +54,12 @@ def export2loom(ex_mtx: pd.DataFrame, regulons: List[Regulon], cell_annotations:
index=ex_mtx.columns,
columns=list(map(attrgetter('name'), regulons)))

# Encode cell type clusters.
name2idx = dict(map(reversed, enumerate(sorted(set(cell_annotations.values())))))
clusterings = pd.DataFrame(data=ex_mtx.index,
index=ex_mtx.index,
columns=['Cell Type']).replace(cell_annotations).replace(name2idx)

# Create meta-data structure.
def create_structure_array(df):
# Create a numpy structured array
Expand All @@ -63,23 +73,30 @@ def create_structure_array(df):
title = os.path.splitext(os.path.basename(out_fname))[0]

column_attrs = {
"CellID": list(ex_mtx.index),
"nGene": list(ngenes.values),
"Embedding": create_structure_array(umap_embedding_mtx.T),
"CellID": ex_mtx.index.values.astype('str'),
"nGene": ngenes.values,
"Embedding": create_structure_array(umap_embedding_mtx),
"RegulonsAUC": create_structure_array(auc_mtx),
"ClusterID": list(ex_mtx.index.rename(cell_annotations))
"Clusterings": create_structure_array(clusterings),
"ClusterID": clusterings.values
}
row_attrs = {
"Gene": list(ex_mtx.columns),
"Regulons": regulon_assignment,
"Gene": ex_mtx.columns.values.astype('str'),
"Regulons": create_structure_array(regulon_assignment),
}
general_attrs = {
"title": title,
"MetaData": {
"MetaData": json.dumps({
"embeddings": [{
"id": 0,
"name": "UMAP (default)",
}]},
}]}),
"clusterings": json.dumps([{
"id": 0,
"group": "celltype",
"name": "Cell Type",
"clusters": [{"id": idx, "description": name} for name, idx in name2idx.items()]
}]),
"Genome": next(iter(nomenclatures))}

# Create loom file for use with the SCope tool.
Expand All @@ -88,7 +105,7 @@ def create_structure_array(df):
# information from disk can only be achieved via column selection. For the ranking databases this is of utmost
# importance.
fh = lp.create(filename=out_fname,
layers=ex_mtx.T.values,
matrix=ex_mtx.T.values,
row_attrs=row_attrs,
col_attrs=column_attrs,
file_attrs=general_attrs)
Expand Down

0 comments on commit 6951b6a

Please sign in to comment.