Skip to content

Latest commit

 

History

History
242 lines (214 loc) · 8.84 KB

README.md

File metadata and controls

242 lines (214 loc) · 8.84 KB

MAGIC-seq Pipeline

This is a public repository for all code connected to MAGIC-seq.

The following files/parameters are commonly required :

  • FASTQ files (Read 1 containing the spatial information and the UMI and read 2 containing the genomic sequence)
  • A genome index generated with STAR
  • An annotation file in GTF or GFF3 format (optional when using a transcriptome)
  • The file containing the barcodes and array coordinates (look at the folder "data/Barcode" to use it as a reference).

For the QC step, the stpipeline/common/filterInputReads.py in st_pipeline must be replaced with the supplied filterInputReads.py to generate the QC post-fastq for the subsequent steps.

Preprocessing of MAGIC-seq raw data:

  1. Triple/Nine grid chip data
#raw data
fastq1=${sampath}/${sample}_R1.fastq.gz
fastq2=${sampath}/${sample}_R2.fastq.gz
#Barcode-info
whitelist=${project_path}/data/Barcode-T9-70/whitelist
ID=${project_path}/data/Barcode-T9-70/T9-ids-barcode.txt
#result_path
st_path=${project_path}/data/result_STARsolo/${sample}
#ref-info
MAP=/database/ref_mm10_M23_release98/mm10_filtered_star_index
ANN=/database/ref_mm10_M23_release98/mm10_filtered.gtf
#Other-info
log_file=${st_path}/${sample}_st_log.txt
t_num=32

#1.Data preprocessing: extracting barcode and UMI sequences from read1
seqkit split2 -1 ${fastq1} -2 ${fastq2} -p 10 -O ${sampath}/${sample}_split/out -f -j ${t_num}
mkdir -p ${st_path}
cd ${st_path}
mkdir ./STARsolo ./tmp
files=(001 002 003 004 005 006 007 008 009 010)

for file in \"\${files[@]}\"
do
    seqkit subseq -j ${t_num} -r 1:8  ${sampath}/${sample}_split/out/${sample}_R1.part_\${file}.fastq.gz -o test1-8.fastq.gz
    seqkit subseq -j ${t_num} -r 27:34  ${sampath}/${sample}_split/out/${sample}_R1.part_\${file}.fastq.gz -o test27-34.fastq.gz
    seqkit subseq -j ${t_num} -r 53:72  ${sampath}/${sample}_split/out/${sample}_R1.part_\${file}.fastq.gz -o test53-72.fastq.gz
    seqkit concat test1-8.fastq.gz test27-34.fastq.gz -j ${t_num} -o test.fastq.gz
    seqkit concat test.fastq.gz test53-72.fastq.gz -j ${t_num} -o ${sample}_\${file}_reformat_R1.fastq.gz
    rm test*gz
done

cat *_reformat_R1.fastq.gz > ${sample}_reformat_R1.fastq.gz
rm ${sample}_0*_reformat_R1.fastq.gz
cat ${sampath}/${sample}_split/out/${sample}_R2.part_0*.fastq.gz > ${sample}_reformat_R2.fastq.gz
rm -r ${sampath}/${sample}_split

#2.Quality control of sequencing data
/software/st_pipeline/scripts/st_pipeline_run.py \
  --output-folder ./ \
  --temp-folder ./tmp \
  --umi-start-position 16 \
  --umi-end-position 28 \
  --ids $ID \
  --expName ${sample} \
  --ref-map ${MAP} \
  --ref-annotation ${ANN} \
  --verbose \
  --threads ${t_num} \
  --log-file ${log_file} \
  --min-length-qual-trimming 30 \
  --disable-mapping \
  --disable-annotation \
  --disable-barcode \
  ${sample}_reformat_R1.fastq.gz ${sample}_reformat_R2.fastq.gz
#3.Generate gene expression matrix
/software/STAR/source/STAR  --genomeDir ${MAP} \
  --outFileNamePrefix ${st_path}/STARsolo/${sample}_ \
  --readFilesCommand cat \\
  --readFilesIn st_R2.fastq st_R1.fastq \\
  --outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM sF \
  --outSAMtype BAM SortedByCoordinate \
  --limitBAMsortRAM 121539607552 \
  --soloType CB_UMI_Simple \
  --soloCBwhitelist ${whitelist} \
  --soloCBstart 1 \
  --soloCBlen 16 \
  --soloUMIstart 17 \
  --soloUMIlen 12 \
  --soloFeatures Gene GeneFull SJ Velocyto \
  --soloMultiMappers EM \
  --soloUMIdedup 1MM_All \
  --soloCellFilter EmptyDrops_CR \
  --soloCellReadStats Standard \
  --clipAdapterType CellRanger4 \
  --outReadsUnmapped Fastx \
  --runThreadN ${t_num} 


  1. Splicing grid chip data
#raw data
fastq1=${sampath}/${sample}_R1.fastq.gz
fastq2=${sampath}/${sample}_R2.fastq.gz
#Barcode-info
whitelist=${project_path}/data/Barcode-M9-70/whitelist
ID=${project_path}/data/Barcode-M9-70/M9_ST_ids_barcode_chip1_C18.txt
#result_path
st_path=${project_path}/data/result_STARsolo/${sample}
#ref-info
MAP=/database/ref_mm10_M23_release98/mm10_filtered_star_index
ANN=/database/ref_mm10_M23_release98/mm10_filtered.gtf
#Other-info
log_file=${st_path}/${sample}_st_log.txt
t_num=32

#1.Data preprocessing: extracting barcode and UMI sequences from read1
seqkit split2 -1 ${fastq1} -2 ${fastq2} -p 10 -O ${sampath}/${sample}_split/out -f -j ${t_num}
mkdir -p ${st_path}
cd ${st_path}
mkdir ./STARsolo ./tmp

files=(001 002 003 004 005 006 007 008 009 010)
for file in \"\${files[@]}\"
do
    seqkit subseq -j ${t_num} -r 1:8  ${sampath}/${sample}_split/out/${sample}_R1.part_\${file}.fastq.gz -o test1-8.fastq.gz
    seqkit subseq -j ${t_num} -r 27:34  ${sampath}/${sample}_split/out/${sample}_R1.part_\${file}.fastq.gz -o test27-34.fastq.gz
    seqkit subseq -j ${t_num} -r 53:72  ${sampath}/${sample}_split/out/${sample}_R1.part_\${file}.fastq.gz -o test53-72.fastq.gz
    seqkit concat test1-8.fastq.gz test27-34.fastq.gz -j ${t_num} -o test.fastq.gz
    seqkit concat test.fastq.gz test53-72.fastq.gz -j ${t_num} -o ${sample}_\${file}_reformat_R1.fastq.gz
    rm test*gz
done

cat *_reformat_R1.fastq.gz > ${sample}_reformat_R1.fastq.gz
rm ${sample}_0*_reformat_R1.fastq.gz
cat ${sampath}/${sample}_split/out/${sample}_R2.part_0*.fastq.gz > ${sample}_reformat_R2.fastq.gz
rm -r ${sampath}/${sample}_split

#2.Quality control of sequencing data
/software/st_pipeline/scripts/st_pipeline_run.py \
  --output-folder ./ \
  --temp-folder ./tmp \
  --umi-start-position 24 \
  --umi-end-position 36 \
  --ids $ID \
  --expName ${sample} \
  --ref-map ${MAP} \
  --ref-annotation ${ANN} \
  --verbose \
  --threads ${t_num} \
  --log-file ${log_file} \
  --min-length-qual-trimming 30 \
  --disable-mapping \
  --disable-annotation \
  --disable-barcode \
  ${sample}_reformat_R1.fastq.gz ${sample}_reformat_R2.fastq.gz
#3.Generate gene expression matrix
/software/STAR/source/STAR  --genomeDir ${MAP} \
--outFileNamePrefix ${st_path}/STARsolo/${sample}_ \
--readFilesCommand cat \
--readFilesIn st_R2.fastq st_R1.fastq \\
--outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM sF \
--outSAMtype BAM SortedByCoordinate \
--limitBAMsortRAM 121539607552 \
--soloType CB_UMI_Simple \
--soloCBwhitelist ${whitelist} \
--soloCBstart 1 \
--soloCBlen 24 \
--soloUMIstart 25 \
--soloUMIlen 12 \
--soloFeatures Gene GeneFull SJ Velocyto \
--soloMultiMappers EM \
--soloUMIdedup 1MM_All \
--soloCellFilter EmptyDrops_CR \
--soloCellReadStats Standard \
--clipAdapterType CellRanger4 \
--outReadsUnmapped Fastx \
--runThreadN ${t_num}

Convert gene expression matrix to anndata

  1. Triple/Nine grid chip data (T9-50um-70barcode)
from st_processing import get_adata_STARsolo

sample='sample1-1'
res_um=50 #点直径
channels_num=70 #通道数
barcode_num=70 #barcode数
chip_type='T9'
Barcode_file_path='/MAGIC_seq/Mouse_Adult_Organ_T9_70_50um/data/Barcode-T9-70/'
file_path='/MAGIC_seq/Mouse_Adult_Organ_T9_70_50um/data/result_STARsolo/'+sample+'/STARsolo/'
image_file_path='/MAGIC_seq/Mouse_Adult_Organ_T9_70_50um/data/Image/'
reg='reg7'

#Point coordinates for image registration
HE_point1=(384,614)
Spot_point1=(369,556)
HE_point2=(4623,4742)
Spot_point2=(4634, 4634)
line_point_r1c1=(746,676) 
line_point_r1c70=(773,4194)
line_point_r70c1=(4266,632)
adata=get_adata_STARsolo(sample,chip_type,reg,channels_num,barcode_num,res_um=res_um,only_HE=False,
                        img_file=True,EM=True,Velocyto=False,soloFeatures='GeneFull',raw=True,species='mouse',
                        file_path=file_path,image_file_path=image_file_path,Barcode_file_path=Barcode_file_path,geneinfo_path=geneinfo_path,
                        HE_point1=HE_point1,Spot_point1=Spot_point1,HE_point2=HE_point2,Spot_point2=Spot_point2,
                        line_point_r1c1=line_point_r1c1,line_point_r1c70=line_point_r1c70,line_point_r70c1=line_point_r70c1)
  1. Splicing grid chip data(M9-15um-70barcode)
from st_processing import get_adata_STARsolo
sample='ZOO9-3'
file_path='/MAGIC_seq/Mouse_Adult_Brain_M9_70_15um/data/result_STARsolo/'+sample+'/STARsolo/'
image_file_path='/MAGIC_seq/Mouse_Adult_Brain_M9_70_15um/data/Image/'
Barcode_file_path='/MAGIC_seq/Mouse_Adult_Brain_M9_70_15um/data/Barcode-M9-70/'

res_um=15 
channels_num=210 
barcode_num=70 
reg='reg1'
chip_type='M9'

#Point coordinates for image registration
HE_point1=(138,169)
Spot_point1=(112,227)
HE_point2=(4840,4691)
Spot_point2=(4827, 4720)
line_point_r1c1=(735,694) 
line_point_r1c70=(740,4426)
line_point_r70c1=(4461,669)

adata=get_adata_STARsolo(sample,chip_type,reg,channels_num,barcode_num,res_um=res_um,add_M9_15=True,
                        img_file=True,EM=True,Velocyto=True,soloFeatures='GeneFull',raw=True,species='mouse',
                        file_path=file_path,image_file_path=image_file_path,Barcode_file_path=Barcode_file_path,geneinfo_path=geneinfo_path,
                        HE_point1=HE_point1,Spot_point1=Spot_point1,HE_point2=HE_point2,Spot_point2=Spot_point2,
                        line_point_r1c1=line_point_r1c1,line_point_r1c70=line_point_r1c70,line_point_r70c1=line_point_r70c1)