chore: Update resource data URLs from dropbox to figshare

chansigit · Jul 5, 2024 · facac04 · facac04
1 parent d2b8c53
commit facac04
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 20 deletions.
diff --git a/dynamo/external/scifate.py b/dynamo/external/scifate.py
@@ -21,8 +21,8 @@ def scifate_glmnet(
     cell_filter_UMI: int = 10000,
     core_n_lasso: int = 1,
     core_n_filtering: int = 1,
-    motif_ref: str = "https://www.dropbox.com/s/s8em539ojl55kgf/motifAnnotations_hgnc.csv?dl=1",
-    TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1",
+    motif_ref: str = "https://figshare.com/ndownloader/files/47439455",
+    TF_link_ENCODE_ref: str = "https://figshare.com/ndownloader/files/47439458",
     nt_layers: list = ["X_new", "X_total"],
 ) -> AnnData:
     """Perform scifate analysis using glmnet.
@@ -58,11 +58,11 @@ def scifate_glmnet(
         motif_ref: The path to the TF binding motif data as described above. It provides the list of TFs gene names and
             is used to process adata object to generate the TF expression and target new expression matrix for glmnet
             based TF-target synthesis rate linkage analysis. But currently it is not used for motif based filtering.
-            By default, it is a dropbox link that store the data from us. Other motif reference can bed downloaded from
+            By default, it is a cloud link that stores the data from us. Other motif references can be downloaded from
             RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be downloaded from
             June's shared folder:
                 https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-centered-10kb-7species.mc9nr.feather
-        TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a dropbox link from us that stores the
+        TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a cloud link from us that stores the
             data. Other data can be downloaded from:
                 https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets.
         nt_layers: The layers that will be used for the network inference. Note that the layers can be changed flexibly.

diff --git a/dynamo/external/scribe.py b/dynamo/external/scribe.py
@@ -19,12 +19,12 @@ def scribe(
     Targets: Union[list, None] = None,
     gene_filter_rate: float = 0.1,
     cell_filter_UMI: int = 10000,
-    motif_ref: str = "https://www.dropbox.com/s/s8em539ojl55kgf/motifAnnotations_hgnc.csv?dl=1",
+    motif_ref: str = "https://figshare.com/ndownloader/files/47439455",
     nt_layers: list = ["X_new", "X_total"],
     normalize: bool = False,
     do_CLR: bool = True,
     drop_zero_cells: bool = True,
-    TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1",
+    TF_link_ENCODE_ref: str = "https://figshare.com/ndownloader/files/47439458",
 ) -> AnnData:
     """Apply Scribe to calculate causal network from spliced/unspliced, metabolic labeling based and other "real" time
     series datasets.
@@ -49,7 +49,7 @@ def scribe(
         cell_filter_UMI: Minimum number of UMIs for cell filtering.
         motif_ref: It provides the list of TFs gene names and is used to parse the data to get the list of TFs and
             Targets for the causal network inference from those TFs to Targets. But currently the motif based filtering
-            is not implemented. By default, it is a dropbox link that store the data from us. Other motif reference can
+            is not implemented. By default, it is a cloud link that stores the data from us. Other motif reference can
             bed downloaded from RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be
             downloaded from June's shared folder:
             https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-
@@ -64,7 +64,7 @@ def scribe(
             target. This can signify the relationship between potential regulators and targets, speed up the calculation,
             but at the risk of ignoring strong inhibition effects from certain regulators to targets.
         do_CLR: Whether to perform context likelihood relatedness analysis on the reconstructed causal network
-        TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a dropbox link from us that stores the
+        TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a cloud link from us that stores the
             data. Other data can be downloaded from:
                 https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets.
 

diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py
@@ -840,6 +840,7 @@ def relative2abs(
     """
 
     if ERCC_annotation is None:
+        # TODO: outdated Dropbox link; consider replacing or removing it.
         ERCC_annotation = pd.read_csv(
             "https://www.dropbox.com/s/cmiuthdw5tt76o5/ERCC_specification.txt?dl=1",
             sep="\t",

diff --git a/dynamo/sample_data.py b/dynamo/sample_data.py
@@ -65,27 +65,32 @@ def get_adata(url: str, filename: Optional[str] = None) -> Optional[AnnData]:
 
 # add our toy sample data
 def Gillespie():
+    # TODO: add data here
     pass
 
 
 def HL60():
+    # TODO: add data here
     pass
 
 
 def NASCseq():
+    # TODO: add data here
     pass
 
 
 def scSLAMseq():
+    # TODO: add data here
     pass
 
 
 def scifate():
+    # TODO: add data here
     pass
 
 
 def scNT_seq_neuron_splicing(
-    url: str = "https://www.dropbox.com/s/g1afqdcsczgyj2m/neuron_splicing_4_11.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439605",
     filename: str = "neuron_splicing.h5ad",
 ) -> AnnData:
     """The neuron splicing data is from Qiu, et al (2020).
@@ -98,7 +103,7 @@ def scNT_seq_neuron_splicing(
 
 
 def scNT_seq_neuron_labeling(
-    url: str = "https://www.dropbox.com/s/lk9cl63yd28mfuq/neuron_labeling.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439629",
     filename: str = "neuron_labeling.h5ad",
 ) -> AnnData:
     """The neuron splicing data is from Qiu, et al (2020).
@@ -115,7 +120,7 @@ def cite_seq():
 
 
 def zebrafish(
-    url: str = "https://www.dropbox.com/scl/fi/3zt89ee0j5twxk4ttzmij/zebrafish.h5ad?rlkey=phwg0b7aqiizd9kf69l2kciak&dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47420257",
     filename: str = "zebrafish.h5ad",
 ) -> AnnData:
     """The zebrafish is from Saunders, et al (2019).
@@ -180,7 +185,7 @@ def hgForebrainGlutamatergic(
 
 
 def chromaffin(
-    url: str = "https://www.dropbox.com/s/awevuz836tlclvw/onefilepercell_A1_unique_and_others_J2CH1.loom?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439620",
     filename: str = "onefilepercell_A1_unique_and_others_J2CH1.loom",
 ) -> AnnData:  #
     """The chromaffin dataset used in http://pklab.med.harvard.edu/velocyto/notebooks/R/chromaffin2.nb.html
@@ -224,7 +229,7 @@ def pancreatic_endocrinogenesis(
 
 
 def DentateGyrus_scvelo(
-    url: str = "https://www.dropbox.com/s/3w1wzb0b68fhdsw/dentategyrus_scv.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439623",
     filename: str = "dentategyrus_scv.h5ad",
 ) -> AnnData:
     """The Dentate Gyrus dataset used in https://github.com/theislab/scvelo_notebooks/tree/master/data/DentateGyrus.
@@ -238,10 +243,10 @@ def DentateGyrus_scvelo(
 
 
 def scEU_seq_rpe1(
-    url: str = "https://www.dropbox.com/s/25enev458c8egn7/rpe1.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439641",
     filename: str = "rpe1.h5ad",
 ):
-    """Download rpe1 dataset from Battich, et al (2020) via Dropbox link.
+    """Download rpe1 dataset from Battich, et al (2020) via a cloud link.
 
     This data consists of 13,913 genes across 2,930 cells.
     """
@@ -251,10 +256,10 @@ def scEU_seq_rpe1(
 
 
 def scEU_seq_organoid(
-    url: str = "https://www.dropbox.com/s/es7sroy5ceb7wwz/organoid.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439632",
     filename: str = "organoid.h5ad",
 ):
-    """Download organoid dataset from Battich, et al (2020) via Dropbox link.
+    """Download organoid dataset from Battich, et al (2020) via a cloud link.
 
     This data consists of 9,157 genes across 3,831 cells.
     """
@@ -264,7 +269,7 @@ def scEU_seq_organoid(
 
 
 def hematopoiesis(
-    url: str = "https://www.dropbox.com/s/n9mx9trv1h78q0r/hematopoiesis_v1.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439635",
     # url: str = "https://pitt.box.com/shared/static/kyh3s4wrxdywupn9wk9r2j27vzlvk8vf.h5ad", # with box
     # url: str = "https://pitt.box.com/shared/static/efqa8icu1m6d1ghfcc3s9tj0j91pky1h.h5ad", # v0: umap_ori version
     filename: str = "hematopoiesis.h5ad",
@@ -276,7 +281,7 @@ def hematopoiesis(
 
 
 def hematopoiesis_raw(
-    url: str = "https://www.dropbox.com/s/rvkxvq8694xnxz3/hsc_raw_with_metadata.h5ad?dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439626",
     # url: str = "https://pitt.box.com/shared/static/bv7q0kgxjncc5uoget5wvmi700xwntje.h5ad", # with box
     filename: str = "hematopoiesis_raw.h5ad",
 ) -> AnnData:
@@ -287,7 +292,7 @@ def hematopoiesis_raw(
 
 
 def human_tfs(
-    url: str = "https://www.dropbox.com/scl/fi/pyocgrhvglg6p7q8yf9ol/human_tfs.txt?rlkey=kbc8vfzf72f8ez0xldrb5nb2d&dl=1",
+    url: str = "https://figshare.com/ndownloader/files/47439617",
     filename: str = "human_tfs.txt",
 ) -> pd.DataFrame:
     """Download human transcription factors."""