Li, R., Leiva, C., Lemer, S., Kirkendale, L., & Li, J. (n.d.). Photosymbiosis profoundly influenced animal genome architecture and gene evolution – Insights from the giant clam genome. Manuscript submitted for publication.
OrthoDB Pipeline
docker pull ezlabgva/orthologer:v3.0.0
docker run -u $(id -u) -v $(pwd)/odb:/odbwork ezlabgva/orthologer:v3.0.0 orthomapper -c run -p Tmax -f Tmax_ReAnn.fasta -n 6447
# CAFE
# Run with Universal lambda
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 1 -p -c 6 -o k1p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 2 -p -c 6 -o k2p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 3 -p -c 6 -o k3p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 4 -p -c 6 -o k4p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 5 -p -c 6 -o k5p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 6 -p -c 6 -o k6p_1lambda&
# Run with 2 seperate lambda for Bivalvia and Gastropoda
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 1 -p -c 6 -o k1p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 2 -p -c 6 -o k2p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 3 -p -c 6 -o k3p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 4 -p -c 6 -o k4p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 5 -p -c 6 -o k5p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 6 -p -c 6 -o k6p_2lambdas&
# Results Mining
# Only viewing the trees with significant changes
echo $'#nexus\nbegin trees;'>Significant_trees.tre
grep "*" Gamma_asr.tre >>Significant_trees.tre
echo "end;">>Significant_trees.tre # 6794 trees
## Only viewing the trees significant changes in Tmax
echo $'#nexus\nbegin trees;'>Tmax_Significant_trees.tre
grep -F "<17>*" Gamma_asr.tre >>Tmax_Significant_trees.tre
echo "end;">>Tmax_Significant_trees.tre # 207 trees
# Get the gene family IDs that have significant changes in Tmax
grep -o "\w*at6447\w*" Tmax_Significant_trees.tre > Tmax_Significant_trees_IDs #207
# Get the gene family expaned in Tmax
cat Gamma_change.tab |cut -f1,19|grep "+[1-9]" >Tmax.expanded # 2783
# Get the gene family extracted in Tmax
cat Gamma_change.tab |cut -f1,19|grep "-" >Tmax.contracted # 3490
############ Significant Branch on the Tree ########################
# Get the gene family SIGNIFICANTLY expaned in Tmax
grep -Fwf Tmax_Significant_trees_IDs Tmax.expanded | cut -f1 > Tmax.expanded.Sig.treebranch #122
# Get the gene family SIGNIFICANTLY contracted in Tmax
grep -Fwf Tmax_Significant_trees_IDs Tmax.contracted | cut -f1 > Tmax.contracted.Sig.treebranch #85
# Functional Enrichment
# Annotation of Tmax with emapper
nohup emapper.py -m diamond --output_dir $outputdir -o Tmax_ReAnn --cpu 40 -i $input --data_dir $db&
cat Tmax_ReAnn.emapper.annotations | sed '/^#/d' > Tmax_ReAnn.clean.annotations
#get the triMax most_expanded gene IDs
cat Tmax_most.expanded.OG2gene.tab | cut -f2 > Tmax_most.expanded_IDs
# assign 1 when most_expanded gene IDs are presented
# 114 genes with GO annotation from 2156 genes (while there are 13539 genes with GO Annotation in the total )
grep -Fwf Tmax_most.expanded_IDs triMax_gene2GO_IDs > triMax_most_expanded_GOs_IDs
# Then run the GO_MWU script
OrthoFinder PipeLine
Data
Species Code | Species Name | Original File Name | New File Name |
---|---|---|---|
6454_0 | Haliotis rufescens | GCF_003343065.1_H.ruf_v1.0_protein.faa.gz | halRuf.faa.gz |
6500_0 | Aplysia californica | GCF_000002075.1_AplCal3.0_protein.faa.gz | aplCal.faa.gz |
6526_0 | Biomphalaria glabrata | GCF_000457365.1_ASM45736v1_protein.faa.gz | bioGla.faa.gz |
6565_0 | Crassostrea virginica | GCF_002022765.2_C_virginica-3.0_protein.faa.gz | craVir.faa.gz |
6573_0 | Mizuhopecten yessoensis | GCF_002113885.1_ASM211388v2_protein.faa.gz | mizYes.faa.gz |
6579_0 | Pecten maximus | GCF_902652985.1_xPecMax1.1_protein.faa.gz | pecMax.faa.gz |
6596_0 | Mercenaria mercenaria | GCF_021730395.1_MADL_Memer_1_protein.faa.gz | merMer.faa.gz |
29159_0 | Crassostrea gigas | GCF_902806645.1_cgigas_uk_roslin_v1_protein.faa.gz | craGig.faa.gz |
36100_0 | Haliotis rubra | GCF_003918875.1_ASM391887v1_protein.faa.gz | halRub.faa.gz |
37623_0 | Ostrea edulis | GCF_947568905.1_xbOstEdul1.1_protein.faa.gz | ostEdu.faa.gz |
37653_0 | Octopus bimaculoides | GCF_001194135.2_ASM119413v2_protein.faa.gz | octBim.faa.gz |
225164_0 | Lottia gigantea | GCF_000327385.1_Helro1_protein.faa.gz | lotGig.faa.gz |
400727_0 | Pomacea canaliculata | GCF_003073045.1_ASM307304v1_protein.faa.gz | pomCan.faa.gz |
1735272_0 | Gigantopelta aegis | GCF_016097555.1_Gae_host_genome_protein.faa.gz | gigAeg.faa.gz |
2607531_0 | Octopus sinensis | GCF_006345805.1_ASM634580v1_protein.faa.gz | octSin.faa.gz |
# run OrthoFinder
nohup orthofinder -f Separate_Genomes_NCBI &
# Find Orthogroup IDs specific to triMax and save them to a file
# /home/ruiqi/ruiqi_data/GiantClamGenome/OrthoDB/Separate_Genomes_NCBI/OrthoFinder/Results_Jan06/Orthogroups
awk 'NR > 1 && $NF == $15 {print $1}' Orthogroups.GeneCount.tsv > triMax_assigned_specific_Ortho_ID
#combine "unassigned genes from triMax
sed '1d' Orthogroups_UnassignedGenes.tsv | cut -f1,15 | grep "Tmax" > triMax_Unassigned_OG2Gene
cat triMax_Unassigned_OG2Gene triMax_assigned_specific_OG2Gene > triMax_specific_OG2Gene.txt
sed -i 's/Tmax_Tmax/Tmax/g' triMax_specific_OG2Gene.txt
# Annotation and Enrichment
############################################################################################################
# Part 1 - add annotation to all Tmax specific gene families
############################################################################################################
OG2gene <- read.table(file="triMax_specific_OG2Gene.txt", sep = "\t")
colnames(OG2gene) <- c("familyID","Gene")
Annot <- read.delim(file = "Tmax_ReAnn.clean.annotations", sep = "\t")
colnames(Annot) <- c("Gene","seed_eggNOG_ortholog","seed_ortholog_evalue","seed_ortholog_score","best_tax_level","Preferred_name",
"GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","taxonomic scope","eggNOG OGs",
"best eggNOG OG","COG","eggNOG free text desc.")
Tmax_annot <- merge(x=OG2gene,y=Annot,
by="Gene", all.x=TRUE)
sorted_Tmax_annot <- Tmax_annot[order(Tmax_annot$familyID), ]
write.table(sorted_Tmax_annot, file = "Tmax_specific_OG_Annot.tsv", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
############################################################################################################
# Part 2 - plot bar graph for description for each gene family, mark the description that exceeds 50% in red
############################################################################################################
sorted_Tmax_annot<-read.delim(file = "triMax_assigned_specific_OG_Annot.tsv", sep = "\t", header = TRUE)
unique_og_values <- unique(sorted_Tmax_annot$familyID)
# Replace empty strings with NA in the "eggNOG free text desc." column
sorted_Tmax_annot$eggNOG.free.text.desc.[sorted_Tmax_annot$eggNOG.free.text.desc. == ""] <- NA
# Reform if the description is too long (more than 10 words, then only keep 5 and add ...)
# Function to truncate and add "..."
truncate_and_ellipsis <- function(text, max_words = 10) {
words <- unlist(strsplit(text, " "))
if (length(words) > max_words) {
truncated_text <- paste(words[1:5], collapse = " ")
return(paste(truncated_text, "..."))
}
return(text)
}
# Apply the function to the specified column
sorted_Tmax_annot$eggNOG.free.text.desc. <- sapply(
sorted_Tmax_annot$eggNOG.free.text.desc.,
truncate_and_ellipsis
)
# Plot functions
for (og_value in unique_og_values) {
# Filter the dataframe for the specific familyID
family_df <- sorted_Tmax_annot %>%
filter(familyID == og_value)
# Count the frequency of each description
description_counts <- family_df %>%
group_by(eggNOG.free.text.desc.) %>%
summarize(count = n())
# Calculate the percentage for each description
description_counts <- description_counts %>%
mutate(percentage = (count / sum(count)) * 100)
# Create a horizontal bar graph with red highlights for descriptions > 50%
gg <- ggplot(description_counts, aes(x = percentage, y = reorder(eggNOG.free.text.desc., -percentage), fill = percentage > 50)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("lightgrey", "lightcoral"), guide = FALSE) +
labs(title = paste("Frequency of Descriptions for Gene Family", og_value),
x = "Percentage", y = "Description") +
theme(axis.text.y = element_text(hjust = 1))
# Save the plot with the name "[OG value]_box_plot.png"
ggsave(paste0("./GeneFamily_description_bargraph/", og_value, "_description_bargraph.png"), gg, width = 16, height = 9, units = "in")
}
############################################################################################################
# Part 3 - summarize description for each gene family, use the second most frequent description if the most one is NA
############################################################################################################
sorted_Tmax_annot<-read.delim(file = "triMax_assigned_specific_OG_Annot.tsv", sep = "\t", header = TRUE)
sorted_Tmax_annot <- sorted_Tmax_annot %>% select(Gene,familyID,eggNOG.free.text.desc.)
# change space to NA
sorted_Tmax_annot <- sorted_Tmax_annot %>% mutate_all(na_if,"")
# Find the most common value in function desc.
# If the most common one is NA, use the second common one
# If all the values are NA, just use NA
sorted_Tmax_annot <- sorted_Tmax_annot %>%
group_by(familyID) %>%
mutate(most_common = {
# Count the occurrences of each value, excluding NA
counts <- table(na.omit(eggNOG.free.text.desc.))
if (length(counts) == 0) {
# If all values are NA, return NA
NA
} else {
# Get the name of the value with maximum count
names(sort(counts, decreasing = TRUE))[1]
}
}) %>%
ungroup()
# Create Tmax_summary with familyID, Genes, and most_common
Tmax_summary <- sorted_Tmax_annot %>%
group_by(familyID) %>%
summarise(
Genes = n(), # Count of genes in each familyID
most_common = first(most_common) # The most common value from 'most_common' column
)
# Calculate Frequency for each familyID independently
Tmax_summary <- Tmax_summary %>%
rowwise() %>%
mutate(Frequency = {
common_value <- most_common
total_genes <- Genes
if (is.na(common_value)) {
NA_real_ # If most_common is NA, set Frequency as NA
} else {
common_count <- sum(sorted_Tmax_annot$familyID == familyID & sorted_Tmax_annot$eggNOG.free.text.desc. == common_value, na.rm = TRUE)
(common_count / total_genes) * 100
}
})
############################################################################################################
# Part 4 - summarize function of un-assigned genes
############################################################################################################
OG2gene <- read.table(file="triMax_Unassigned_OG2Gene", sep = "\t")
colnames(OG2gene) <- c("familyID","Gene")
Annot <- read.delim(file = "Tmax_ReAnn.clean.annotations", sep = "\t")
colnames(Annot) <- c("Gene","seed_eggNOG_ortholog","seed_ortholog_evalue","seed_ortholog_score","best_tax_level","Preferred_name",
"GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","taxonomic scope","eggNOG OGs",
"best eggNOG OG","COG","eggNOG free text desc.")
Tmax_unasnd_annot <- merge(x=OG2gene,y=Annot,
by="Gene", all.x=TRUE)
sorted_Tmax_unasnd_annot <- Tmax_unasnd_annot[order(Tmax_unasnd_annot$familyID), ]
sorted_Tmax_unasnd_annot <- sorted_Tmax_unasnd_annot %>% mutate_if(is.character, na_if, "")
#write.table(sorted_Tmax_unasnd_annot, file = "Tmax_unasnd_OG_Annot.tsv", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
################### summarize function of un-assigned genes
sorted_Tmax_unasnd_annot <-read.delim(file = "Tmax_unasnd_OG_Annot.tsv", sep = "\t", header = TRUE)
Tmax_unasnd_summary <- sorted_Tmax_unasnd_annot %>%
group_by(eggNOG.free.text.desc.) %>%
summarise(
Count = n()
) %>%
ungroup() %>%
arrange(desc(Count))
# Non-TE and Non-NA
Tmax_unasnd_summary_nonTE_nonNA <- Tmax_unasnd_summary %>%
filter(!eggNOG.free.text.desc. %in% exclude_values, !is.na(eggNOG.free.text.desc.))
write.table(Tmax_unasnd_summary, file = "Tmax_UNassigned_Annot_summary.tsv", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
############################################################################################################
# Part 5 - GO MWU enrichment
############################################################################################################
input="triMax_specific_GOs.csv"
goAnnotations="triMax_gene2GO.txt"
goDatabase="go.obo"
source("gomwu.functions.R")
go=c("MF", "CC", "BP")
for (goDivision in go){
gomwuStats(input, goDatabase, goAnnotations, goDivision,
perlPath="perl",
largest=0.1,
smallest=5,
clusterCutHeight=0.25,
)
}
# grep each OG and save to gene Id to a file
while read line
do
grep -w $line Tmax_Mollusca_OG2gene.tab | cut -f2 > ./SingCopyOG_14_genelist/${line}_genelist.txt
done < Mollusca_SingCopyOG_14_OG
# Retrieve sequences
# Retrieve mollusca specific genes
for file in odb11v_cleanheader.part*.fasta.cidx
do
cat Mollusca_gene_ID | cdbyank $file > ${file}_Mollusca_gene_seq.fasta
done
cat *_Mollusca_gene_seq.fasta > Mollusca_gene_OrthoDB.fasta
# Retrieve seqs from each OG and save to separate fasta files
for file in ./SingCopyOG_14_genelist/*_genelist.txt
do
filename=`basename $file`
name=${filename%_*}
echo $name
cat $file | cdbyank Tmax_Mollusca.fasta.cidx > ./SingCopyOG_14_seqs/${name}.fasta
done
# Alignment, Trim, Phylogeny Inference
align each gene separately,
while read name;
do mafft ${name}.fasta > $name.output
trimal -in $name.output -out ${name}_trimmed.output -automated1
done < Mollusca_SingCopyOG_14_OG
./catfasta2phyml/catfasta2phyml.pl --concatenate --verbose --fasta *_trimmed.output > Tmax_Mollusca_SingCopyOG_14.fasta
echo "alignment done!"
# raxml
run1=all1
run2=all2
run=Tmax_Mollusca_SingCopyOG_14
raxmlHPC-PTHREADS -T 36 -p 12345 -N 20 -s Tmax_Mollusca_SingCopyOG_14.fasta -m PROTGAMMAWAG -n $run1
raxmlHPC-PTHREADS -T 36 -p 12345 -b 12345 -s Tmax_Mollusca_SingCopyOG_14.fasta -N 100 -m PROTGAMMAWAG -n $run2
raxmlHPC-PTHREADS -T 36 -f b -t RAxML_bestTree.${run1} -z RAxML_bootstrap.${run2} -m PROTGAMMAWAG -n $run
Pipeline demo with Cerastoderma edule genome
# # RepeatModeler
BuildDatabase -name cerEdu C_edule.fna
# Annotate Repeats
git clone https://github.com/darencard/GenomeAnnotation.git
./GenomeAnnotation/repclassifier -t 36 -d Mollusca -u cerEdu_families.prefix.fa.unknown \
-k cerEdu_families.prefix.fa.known -a cerEdu_families.prefix.fa.known \
-o round-1_RepbaseMollusca-Self
#986 unknowns after round 1
# round 2 with know library
./GenomeAnnotation/repclassifier -t 36 -u round-1_RepbaseMollusca-Self/round-1_RepbaseMollusca-Self.unknown \
-k round-1_RepbaseMollusca-Self/round-1_RepbaseMollusca-Self.known \
-a round-1_RepbaseMollusca-Self/round-1_RepbaseMollusca-Self.known -o round-2_Self
# 825 unknowns after round 2
# round 3 with know library
./GenomeAnnotation/repclassifier -t 36 -u round-2_Self/round-2_Self.unknown \
-k round-2_Self/round-2_Self.known \
-a round-2_Self/round-2_Self.known -o round-3_Self
# 765 unknowns after round 3
# round 4 with know library
./GenomeAnnotation/repclassifier -t 36 -u round-3_Self/round-3_Self.unknown \
-k round-3_Self/round-3_Self.known \
-a round-3_Self/round-3_Self.known -o round-4_Self
# 742 unknowns after round 4
# round 5 with know library
./GenomeAnnotation/repclassifier -t 36 -u round-4_Self/round-4_Self.unknown \
-k round-4_Self/round-4_Self.known \
-a round-4_Self/round-4_Self.known -o round-5_Self
# 736 unknowns after round 5
cat round-5_Self.unknown round-5_Self.known > C_edule_denovo_annot.fa
# round 1: annotate/mask simple repeats
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 01_simple_out -noint -xsmall C_edule.fna &
mv nohup.out logs/01_simplemask.log
# rename outputs for 01_simple_out
for file in *; do mv -v "$file" "${file/C_edule/cerEdu_simple_mask}"; done
mv cerEdu_simple_mask.fna.masked cerEdu_simple_mask.fna
# round 2: annotate/mask Tetrapoda elements sourced from Repbase using output from 1st round of RepeatMasker
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 02_mollusca_out -nolow -species Mollusca 01_simple_out/cerEdu_simple_mask.fna &
mv nohup.out logs/02_molluscaMask.log
# round 2:rename outputs
for file in *; do mv -v "$file" "${file/simple_mask/mollusca_mask}"; done
mv cerEdu_mollusca_mask.fna.masked cerEdu_mollusca_mask.masked.fasta
# round 3: annotate/mask known elements sourced from species-specific de novo repeat library using output froom 2nd round of RepeatMasker
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 03_known_out -nolow -lib round-5_Self.known \
02_mollusca_out/cerEdu_mollusca_mask.masked.fasta &
# round 3:rename outputs
for file in *; do mv -v "$file" "${file/mollusca_mask/known_mask}"; done
mv cerEdu_known_mask.masked.fasta.masked cerEdu_known_mask.masked.fasta
# round 4: annotate/mask unknown elements sourced from species-specific de novo repeat library using output from 3nd round of RepeatMasker
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 04_unknown_out -nolow -lib round-5_Self.unknown \
03_known_out/cerEdu_known_mask.masked.fasta &
# round 4:rename outputs
for file in *; do mv -v "$file" "${file/known_mask/unknown_mask}"; done
mv cerEdu_unknown_mask.masked.fasta.masked cerEdu_unknown_mask.masked.fasta
# create directory for full results
mkdir -p 05_full_out
# combine full RepeatMasker result files - .cat.gz
cat 01_simple_out/cerEdu_simple_mask.fna.cat.gz 02_mollusca_out/cerEdu_mollusca_mask.fna.cat.gz 03_known_out/cerEdu_known_mask.masked.fasta.cat.gz 04_unknown_out/cerEdu_unknown_mask.masked.fasta.cat.gz
cat 01_simple_out/cerEdu_simple_mask.fna.cat.gz 02_mollusca_out/cerEdu_mollusca_mask.fna.cat.gz 03_known_out/cerEdu_known_mask.masked.fasta.cat.gz 04_unknown_out/cerEdu_unknown_mask.masked.fasta.cat.gz > 05_full_out/cerEdu_full_mask.cat.gz
# combine RepeatMasker tabular files for all repeats - .out
cat 01_simple_out/cerEdu_simple_mask.fna.out \
<(cat 02_mollusca_out/cerEdu_mollusca_mask.fna.out | tail -n +4) \
<(cat 03_known_out/cerEdu_known_mask.masked.fasta.out | tail -n +4) \
<(cat 04_unknown_out/cerEdu_unknown_mask.masked.fasta.out | tail -n +4) \
> 05_full_out/cerEdu.full_mask.out
# combine RepeatMasker repeat alignments for all repeats - .align
cat 01_simple_out/cerEdu_simple_mask.fna.align \
02_mollusca_out/cerEdu_mollusca_mask.fna.align \
03_known_out/cerEdu_known_mask.masked.fasta.align \
04_unknown_out/cerEdu_unknown_mask.masked.fasta.align \
> 05_full_out/cerEdu.full_mask.align
# resummarize repeat compositions from combined analysis of all RepeatMasker rounds
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/ProcessRepeats -a -species Mollusca 05_full_out/cerEdu_full_mask.cat.gz &
mv nohup.out logs/05_fullmask.log
# Insertion time
nohup ../../Parsing-RepeatMasker-Outputs/parseRM.pl -i ../cerEdu_full_mask.align -l 120,5 -m 0.00109 -v&
# calculate gene density
cat Tmax_merged_longestisoform.gff | cut -f 1,4,5 > genes.bed
cut -d ' ' -f 3,6 chromosome.txt | tr ' ' '\t' > chromosome.genome
bedtools makewindows -g chromosome.genome -w 1000000 > chromosome.windows
bedtools coverage -a chromosome.windows -b genes.bed | cut -f 1-4 > genes_num.txt
# calculate repeat density
cat PO1429_Tridacna_maxima.RepeatMasked.gff | cut -f 1,4,5 > repeats.bed
bedtools coverage -a chromosome.windows -b repeats.bed | cut -f 1-4 > repeats_num.txt
# calculate GC content
bedtools nuc -fi $genome -bed chromosome.windows > gc_raw.txt
# Configuration file
karyotype = chromosome.txt
chromosomes_units=1000000
<ideogram>
<spacing>
default = 0.005r
</spacing>
radius = 0.75r
thickness = 25p
fill = yes
show_bands = yes
fill_bands = yes
stroke_color = dgrey
stroke_thickness = 2p
show_label = yes
label_font = default
label_radius = 1.05r
label_size = 30
label_parallel = yes
show_ticks = yes
show_tick_labels = yes
</ideogram>
show_ticks = yes
show_tick_labels = yes
<ticks>
radius = 1r
color = black
thickness = 2p
multiplier = 1e-6
format = %d
<tick>
spacing = 10u
size = 10p
show_label = yes
label_offset = 8p
label_size = 15p
format = %d
</tick>
<tick>
spacing = 5u
size = 5p
show_label = no
</tick>
</ticks>
<plots>
<plot>
type = heatmap
file = GCrate.txt
color = purd-9-seq
r1 = 0.85r
r0 = 0.80r
</plot>
<plot>
type = heatmap
file = repeats_num.txt
color = blues-9-seq
r1 = 0.91r
r0 = 0.86r
</plot>
<plot>
type = line
thickness = 1
max_gap = 1u
file = repeats_num.txt
color = black
r1 = 0.91r
r0 = 0.86r
</plot>
<plot>
type = heatmap
file = genes_num.txt
color = purples-9-seq
r1 = 0.99r
r0 = 0.92r
</plot>
<plot>
type = line
thickness = 1
max_gap = 1u
file = genes_num.txt
color = black
r1 = 0.99r
r0 = 0.92r
</plot>
</plots>
<image>
# Run the program
circos -conf circos.conf
Targeted Gene Finder demo with Paradox CDX gene
Find sequence: Nucula tumidula homeobox parahox cdx (Cdx) mRNA, complete cds
# base Environment
# makedb
makeblastdb -in C_edule.fna -dbtype nucl -parse_seqids -out C_edule
makeblastdb -in Tmax_ReAnn_dna.fasta -dbtype nucl -parse_seqids -out T_maxima_Annot
makeblastdb -in GCF_902652985.1_xPecMax1.1_cds_from_genomic.fna -dbtype nucl -parse_seqids -out PecMax
## tblastn
tblastx -query Paradox_cdx.fasta \
-db C_edule \
-outfmt 6 -evalue 1e-5 -num_threads 30 > Paradox_cdx_C_edule.outfmt6
tblastx -query Paradox_cdx.fasta \
-db T_maxima_Annot \
-outfmt 6 -evalue 1e-5 -num_threads 30 > Paradox_cdx_T_maxima_Annot.outfmt6
One tissue each (mantle, adductor muscle, go-nads, gills, byssus, visceral mass, and kidney) from this study
Adductor muscle 2 (TM_A_N02) is from Li et al. 2020.
Mantle mRNA available 29/53/65 day at control (29.2°C; ambient (condition N) and elevated (30.7°C) (condition E) temperature
Tissue | Sample Name | Original File Names | SRR number |
---|---|---|---|
Adductor Muscle | TM_A_N01 | Tmax_B_S4_R1_001_val_1.fq.gz, Tmax_B_S4_R2_001_val_2.fq.gz | NA |
Adductor Muscle | TM_A_N02 | SRR8217859_1.fastq,SRR8217859_2.fastq | SRR8217859 |
Adductor Muscle | TM_A_N03 | SRR12486110_1.fastq, SRR12486110_2.fastq | SRR12486110 |
Mantle | TM_M_N01 | SRR12486156_1.fastq, SRR12486156_2.fastq | SRR12486156 |
Mantle | TM_M_N02 | SRR12486155_1.fastq, SRR12486155_2.fastq | SRR12486155 |
Mantle | TM_M_N03 | SRR12486141_1.fastq, SRR12486141_2.fastq | SRR12486141 |
Mantle | TM_M_N04 | SRR12486109_1.fastq, SRR12486109_2.fastq | SRR12486109 |
Mantle | TM_M_E01 | SRR12486154_1.fastq, SRR12486154_2.fastq | SRR12486154 |
Mantle | TM_M_E02 | SRR12486147_1.fastq, SRR12486147_2.fastq | SRR12486147 |
Mantle | TM_M_E03 | SRR12486140_1.fastq, SRR12486140_2.fastq | SRR12486140 |
# Trim the reads following default setting in Trinity
trimmomatic PE -threads 20 TM_A_N01_1.fastq.gz TM_A_N01_2.fastq.gz \
TM_A_N01_1.trim.fq.gz TM_A_N01_1.untrim.fq.gz \
TM_A_N01_2.trim.fq.gz TM_A_N01_2.untrim.fq.gz \
ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/TruSeq3-PE.fa:2:30:10 SLIDINGWINDOW:4:5 LEADING:5 TRAILING:5 MINLEN:25
#Use a loop
#!/bin/bash
for file in $(ls *_1.fastq.gz)
do
echo $file
file="${file%%_1.*}"
echo $file
trimmomatic PE -threads 36 ${file}_1.fastq.gz ${file}_2.fastq.gz \
TrimmedReads/${file}_1.trim.fq.gz Untrimmed/${file}_1.untrim.fq.gz \
TrimmedReads/${file}_2.trim.fq.gz Untrimmed/${file}_2.untrim.fq.gz \
ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/TruSeq3-PE.fa:2:30:10 SLIDINGWINDOW:4:5 LEADING:5 TRAILING:5 MINLEN:25
done
# Prepare the sample file
for file in *_1.trim.fq.gz; do
prefix="${file%%_1.trim.fq.gz}"
path="/home/ruiqi/ruiqi_data/GiantClamGenome/Transcriptome_GiantClam/TrimmedReads/"
col1="${prefix:3:1}${prefix:5:1}"
col2="${prefix:0:8}"
col3="$path${file}"
col4="$path${prefix}_2.trim.fq.gz"
echo -e "$col1\t$col2\t$col3\t$col4"
done > TM_samples.txt
# env: trinity
# Use align_and_estimate_abundance.pl to quantify unigenes
nohup /home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/opt/trinity-2.8.5/util/align_and_estimate_abundance.pl \
--transcripts Tmax_ReAnn_dna.fasta \
--seqType fq \
--samples_file TM_samples.txt \
--est_method kallisto \
--prep_reference \
--kallisto_add_opts "-t 36" &
# get the abundace file list
find . -name "abundance.tsv" | sort | tee TM_abundance_files.list
# abuandance to matrix
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/bin/abundance_estimates_to_matrix.pl --est_method kallisto \
--gene_trans_map none \
--out_prefix TM_all \
--name_sample_by_basedir \
--quant_files TM_abundance_files.list
# Counting Numbers of Expressed Transcripts or Genes
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/opt/trinity-2.8.5/util/misc/count_matrix_features_given_MIN_TPM_threshold.pl \
TM_all.isoform.TPM.not_cross_norm | tee TM_all.isoform_matrix.TPM.not_cross_norm.counts_by_min_TPM
# DE
# Use TPM > 5 as cut off for DE analyses
#Retained 16119 / 46469 = 34.69% of total transcripts.
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/opt/trinity-2.8.5/util/filter_low_expr_transcripts.pl --matrix TM_all_afterQC.isoform.TPM.not_cross_norm --transcripts Tmax_ReAnn_dna.fasta --min_expr_any 5 > TPM5_Tmax_ReAnn_dna.fasta
# clean header
sed -i '/^>/ s/ .*//' TPM5_Tmax_ReAnn_dna.fasta
#get the gene list (TPM>5)
grep ">" TPM5_Tmax_ReAnn_dna.fasta | sed 's/>//' > TPM5_Tmax.geneID
#get the headers
head -1 TM_all_afterQC.isoform.counts.matrix >TPM5_TM_all.isoform.counts.matrix
# get the count matrix on the isoform list
grep -wFf TPM5_Tmax.geneID TM_all_afterQC.isoform.counts.matrix >> TPM5_TM_all.isoform.counts.matrix
## Step 1: Identifying DEs
# TPM can't be in the file name, rename it to
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/bin/run_DE_analysis.pl \
--matrix expr5_TM_all.isoform.counts.matrix \
--samples_file TM_DE_samples.txt \
--contrasts TM_contrasts.txt \
--output . \
--method DESeq2
## Step 2: Extracting and Clustering DEs
## please note that we are using the TMM.EXPR.matrix
# filtering it first
#get the headers
head -1 TM_all_afterQC.isoform.TMM.EXPR.matrix > expr5_TM_all_afterQC.isoform.TMM.EXPR.matrix
# get the count matrix on the isoform list
grep -wFf TPM5_Tmax.geneID TM_all_afterQC.isoform.TMM.EXPR.matrix >> expr5_TM_all_afterQC.isoform.TMM.EXPR.matrix
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/bin/analyze_diff_expr.pl \
--matrix expr5_TM_all_afterQC.isoform.TMM.EXPR.matrix \
--samples TM_DE_samples.txt \
--output TM_P${Pvalue}_C${C} \
--P 5e-2 \
--C 1
#${prefix}.sampleA_vs_sampleB.voom.DE_results.P0.001_C2.sampleA-UP.subset : the expression matrix subset for features up-regulated in sampleA
# ${prefix}.sampleA_vs_sampleB.voom.DE_results.P0.001_C2.sampleB-UP.subset : the expression matrix subset for features up-regulated in sampleB
# Enrichment
setwd("/home/ruiqi/ruiqi_data/GiantClamGenome/Transcriptome_GiantClam/GO_MWU")
goDatabase="go.obo"
source("gomwu.functions.R")
###########
# Tmax
###########
goAnnotations="expr5_Tmax_gene2go.tab"
# Input
go=c("MF", "CC", "BP")
input_files=c("Tmax_MN_vs_ME_lfc.csv","Tmax_MN_vs_AN_lfc.csv")
# Calculating stats
for (goDivision in go){
for (input in input_files){
gomwuStats(input, goDatabase, goAnnotations, goDivision,
perlPath="perl",
largest=0.1,
smallest=5,
clusterCutHeight=0.25,
)
}
}
library(tidyverse)
go=c("MF", "CC", "BP")
input_files=c("Tmax_MN_vs_AN_lfc.csv","Tmax_MN_vs_ME_lfc.csv")
# Calculating stats)
for (goDivision in go){
for (input in input_files){
mwu_data <- read.table(paste0("MWU_",goDivision,"_",input), sep = " ", header=TRUE) %>%
filter(p.adj<0.05)
mwu_data$change[mwu_data$delta.rank > 0] <- "Up"
mwu_data$change[mwu_data$delta.rank < 0] <- "Down"
mwu_up <- mwu_data %>% filter (change == "Up") %>% select(c("term", "name"))
write.table(mwu_up, file=paste0("./GOs_GO_MWU/GOs_UP_",substring(input, 1, 13), "_", goDivision, ".tsv"), quote = FALSE, sep="\t",row.names = FALSE, col.names = TRUE)
mwu_down <- mwu_data %>% filter (change == "Down") %>% select(c("term", "name"))
write.table(mwu_down, file=paste0("./GOs_GO_MWU/GOs_DOWN_",substring(input, 1, 13), "_", goDivision, ".tsv"), quote = FALSE, sep="\t",row.names = FALSE, col.names = TRUE)
}
}
# get all the microtubule/cilium/ciliary/cilia related GO terms in the down-regulated genes in the MN
for file in GOs_DOWN_Tmax*; do
echo $file
grep -E 'microtubule|cilium|ciliary|cilia' $file >> cilia_DOWN_MN.txt
done
# if there are multiple GO terms in one line, split into several lines
awk -F'\t' '{
split($1, terms, ";")
for (i in terms) {
print terms[i] "\t" $2
}
}' cilia_DOWN_MN.txt > cilia_DOWN_MN_1.txt
# Keep the unique GO terms
cat cilia_DOWN_MN_1.txt | sort -u > cilia_DOWN_MN_unique.txt
# get the GO terms
cut -f1 cilia_DOWN_MN_unique.txt > cilia_DOWN_MN_GO_unique.txt
# get the list of Down-Regulated gene in MN
cat expr5_TM_all.isoform.counts.matrix.MN_vs_AN.DESeq2.DE_results.P5e-2_C1.AN-UP.subset | sed '1d' | cut -f1 > MN_vs_AN_Down_genelist
cat expr5_TM_all.isoform.counts.matrix.MN_vs_ME.DESeq2.DE_results.P5e-2_C1.ME-UP.subset | sed '1d' | cut -f1 >> MN_vs_AN_Down_genelist
# get the unique genes
sort -u MN_vs_AN_Down_genelist > MN_vs_AN_Down_unique_genelist
### get the list of genes that annotated with the cilia GO terms, and down regulated in MN
grep -Fwf cilia_DOWN_MN_GO_unique.txt Tmax_ReAnn.clean.annotations | grep -Fwf TM_DE_subset/MN_vs_AN_Down_unique_genelist > cilia_DOWN_MN_genes.txt
cut -f1,6,22 cilia_DOWN_MN_genes.txt cilia_DOWN_MN_genes_summary.txt