T. maxima Genome Script

Li, R., Leiva, C., Lemer, S., Kirkendale, L., & Li, J. (n.d.). Photosymbiosis profoundly influenced animal genome architecture and gene evolution – Insights from the giant clam genome. Manuscript submitted for publication.

1. Gene Family Analyses

OrthoDB Pipeline

docker pull ezlabgva/orthologer:v3.0.0
docker run -u $(id -u) -v $(pwd)/odb:/odbwork ezlabgva/orthologer:v3.0.0 orthomapper -c run -p Tmax -f Tmax_ReAnn.fasta -n 6447


# CAFE
# Run with Universal lambda
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 1 -p -c 6 -o k1p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 2 -p -c 6 -o k2p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 3 -p -c 6 -o k3p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 4 -p -c 6 -o k4p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 5 -p -c 6 -o k5p_1lambda&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -k 6 -p -c 6 -o k6p_1lambda&

# Run with 2 seperate lambda for Bivalvia and Gastropoda
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 1 -p -c 6 -o k1p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 2 -p -c 6 -o k2p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 3 -p -c 6 -o k3p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 4 -p -c 6 -o k4p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 5 -p -c 6 -o k5p_2lambdas&
nohup cafe5 -i BivGas_GeneCount.tsv -t BivGas_ultrametric.tre -y Biv_Gas_separate_lambda.txt -k 6 -p -c 6 -o k6p_2lambdas&

# Results Mining
# Only viewing the trees with significant changes
echo $'#nexus\nbegin trees;'>Significant_trees.tre
grep "*" Gamma_asr.tre >>Significant_trees.tre
echo "end;">>Significant_trees.tre # 6794 trees

## Only viewing the trees significant changes in Tmax
echo $'#nexus\nbegin trees;'>Tmax_Significant_trees.tre
grep -F "<17>*" Gamma_asr.tre >>Tmax_Significant_trees.tre
echo "end;">>Tmax_Significant_trees.tre # 207 trees
# Get the gene family IDs that have significant changes in Tmax
grep -o "\w*at6447\w*" Tmax_Significant_trees.tre > Tmax_Significant_trees_IDs #207

# Get the gene family expaned in Tmax
cat Gamma_change.tab |cut -f1,19|grep "+[1-9]" >Tmax.expanded # 2783
# Get the gene family extracted in Tmax
cat Gamma_change.tab |cut -f1,19|grep "-" >Tmax.contracted # 3490

############ Significant Branch on the Tree ########################
#  Get the gene family SIGNIFICANTLY expaned in Tmax
grep -Fwf Tmax_Significant_trees_IDs Tmax.expanded | cut -f1 > Tmax.expanded.Sig.treebranch #122
#  Get the gene family SIGNIFICANTLY contracted in Tmax
grep -Fwf Tmax_Significant_trees_IDs Tmax.contracted | cut -f1 > Tmax.contracted.Sig.treebranch  #85

# Functional Enrichment
# Annotation of Tmax with emapper
nohup emapper.py -m diamond --output_dir $outputdir -o Tmax_ReAnn --cpu 40 -i $input --data_dir $db&
cat Tmax_ReAnn.emapper.annotations | sed '/^#/d' > Tmax_ReAnn.clean.annotations

#get the triMax most_expanded gene IDs
cat Tmax_most.expanded.OG2gene.tab | cut -f2 > Tmax_most.expanded_IDs

# assign 1 when most_expanded gene IDs are presented
# 114 genes with GO annotation from 2156 genes (while there are 13539 genes with GO Annotation in the total )
grep -Fwf Tmax_most.expanded_IDs triMax_gene2GO_IDs  > triMax_most_expanded_GOs_IDs

# Then run the GO_MWU script

OrthoFinder PipeLine

Data

Species Code	Species Name	Original File Name	New File Name
6454_0	Haliotis rufescens	GCF_003343065.1_H.ruf_v1.0_protein.faa.gz	halRuf.faa.gz
6500_0	Aplysia californica	GCF_000002075.1_AplCal3.0_protein.faa.gz	aplCal.faa.gz
6526_0	Biomphalaria glabrata	GCF_000457365.1_ASM45736v1_protein.faa.gz	bioGla.faa.gz
6565_0	Crassostrea virginica	GCF_002022765.2_C_virginica-3.0_protein.faa.gz	craVir.faa.gz
6573_0	Mizuhopecten yessoensis	GCF_002113885.1_ASM211388v2_protein.faa.gz	mizYes.faa.gz
6579_0	Pecten maximus	GCF_902652985.1_xPecMax1.1_protein.faa.gz	pecMax.faa.gz
6596_0	Mercenaria mercenaria	GCF_021730395.1_MADL_Memer_1_protein.faa.gz	merMer.faa.gz
29159_0	Crassostrea gigas	GCF_902806645.1_cgigas_uk_roslin_v1_protein.faa.gz	craGig.faa.gz
36100_0	Haliotis rubra	GCF_003918875.1_ASM391887v1_protein.faa.gz	halRub.faa.gz
37623_0	Ostrea edulis	GCF_947568905.1_xbOstEdul1.1_protein.faa.gz	ostEdu.faa.gz
37653_0	Octopus bimaculoides	GCF_001194135.2_ASM119413v2_protein.faa.gz	octBim.faa.gz
225164_0	Lottia gigantea	GCF_000327385.1_Helro1_protein.faa.gz	lotGig.faa.gz
400727_0	Pomacea canaliculata	GCF_003073045.1_ASM307304v1_protein.faa.gz	pomCan.faa.gz
1735272_0	Gigantopelta aegis	GCF_016097555.1_Gae_host_genome_protein.faa.gz	gigAeg.faa.gz
2607531_0	Octopus sinensis	GCF_006345805.1_ASM634580v1_protein.faa.gz	octSin.faa.gz

# run OrthoFinder
nohup orthofinder -f Separate_Genomes_NCBI &
# Find Orthogroup IDs specific to triMax and save them to a file
# /home/ruiqi/ruiqi_data/GiantClamGenome/OrthoDB/Separate_Genomes_NCBI/OrthoFinder/Results_Jan06/Orthogroups
awk 'NR > 1 && $NF == $15 {print $1}' Orthogroups.GeneCount.tsv > triMax_assigned_specific_Ortho_ID

#combine "unassigned genes from triMax
sed '1d' Orthogroups_UnassignedGenes.tsv | cut -f1,15 | grep "Tmax"  > triMax_Unassigned_OG2Gene
cat triMax_Unassigned_OG2Gene triMax_assigned_specific_OG2Gene > triMax_specific_OG2Gene.txt
sed -i 's/Tmax_Tmax/Tmax/g' triMax_specific_OG2Gene.txt

# Annotation and Enrichment
############################################################################################################
# Part 1 - add annotation to all Tmax specific gene families
############################################################################################################
OG2gene <- read.table(file="triMax_specific_OG2Gene.txt", sep = "\t")

colnames(OG2gene) <- c("familyID","Gene")
Annot <- read.delim(file = "Tmax_ReAnn.clean.annotations", sep = "\t")
colnames(Annot) <- c("Gene","seed_eggNOG_ortholog","seed_ortholog_evalue","seed_ortholog_score","best_tax_level","Preferred_name",
                     "GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","taxonomic scope","eggNOG OGs",
                     "best eggNOG OG","COG","eggNOG free text desc.")

Tmax_annot <- merge(x=OG2gene,y=Annot,
                        by="Gene", all.x=TRUE)
sorted_Tmax_annot <- Tmax_annot[order(Tmax_annot$familyID), ]

write.table(sorted_Tmax_annot, file = "Tmax_specific_OG_Annot.tsv", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
############################################################################################################
# Part 2 - plot bar graph for description for each gene family, mark the description that exceeds 50% in red
############################################################################################################

sorted_Tmax_annot<-read.delim(file = "triMax_assigned_specific_OG_Annot.tsv", sep = "\t", header = TRUE)
unique_og_values <- unique(sorted_Tmax_annot$familyID)
# Replace empty strings with NA in the "eggNOG free text desc." column
sorted_Tmax_annot$eggNOG.free.text.desc.[sorted_Tmax_annot$eggNOG.free.text.desc. == ""] <- NA

# Reform if the description is too long (more than 10 words, then only keep 5 and add ...)
# Function to truncate and add "..."
truncate_and_ellipsis <- function(text, max_words = 10) {
  words <- unlist(strsplit(text, " "))
  if (length(words) > max_words) {
    truncated_text <- paste(words[1:5], collapse = " ")
    return(paste(truncated_text, "..."))
  }
  return(text)
}

# Apply the function to the specified column
sorted_Tmax_annot$eggNOG.free.text.desc. <- sapply(
  sorted_Tmax_annot$eggNOG.free.text.desc.,
  truncate_and_ellipsis
)

# Plot functions
for (og_value in unique_og_values) {
  # Filter the dataframe for the specific familyID
  family_df <- sorted_Tmax_annot %>%
    filter(familyID == og_value)
  # Count the frequency of each description
  description_counts <- family_df %>%
    group_by(eggNOG.free.text.desc.) %>%
    summarize(count = n())
  # Calculate the percentage for each description
  description_counts <- description_counts %>%
    mutate(percentage = (count / sum(count)) * 100)
  # Create a horizontal bar graph with red highlights for descriptions > 50%
  gg <- ggplot(description_counts, aes(x = percentage, y = reorder(eggNOG.free.text.desc., -percentage), fill = percentage > 50)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = c("lightgrey", "lightcoral"), guide = FALSE) +
    labs(title = paste("Frequency of Descriptions for  Gene Family", og_value),
         x = "Percentage", y = "Description") +
    theme(axis.text.y = element_text(hjust = 1))
  # Save the plot with the name "[OG value]_box_plot.png"
  ggsave(paste0("./GeneFamily_description_bargraph/", og_value, "_description_bargraph.png"), gg, width = 16, height = 9, units = "in")
}

############################################################################################################
# Part 3 - summarize description for each gene family, use the second most frequent description if the most one is NA
############################################################################################################
sorted_Tmax_annot<-read.delim(file = "triMax_assigned_specific_OG_Annot.tsv", sep = "\t", header = TRUE)
sorted_Tmax_annot <- sorted_Tmax_annot %>% select(Gene,familyID,eggNOG.free.text.desc.)
# change space to NA
sorted_Tmax_annot <- sorted_Tmax_annot %>% mutate_all(na_if,"")

# Find the most common value in function desc.
# If the most common one is NA, use the second common one
# If all the values are NA, just use NA
sorted_Tmax_annot <- sorted_Tmax_annot %>%
  group_by(familyID) %>%
  mutate(most_common = {
    # Count the occurrences of each value, excluding NA
    counts <- table(na.omit(eggNOG.free.text.desc.))

    if (length(counts) == 0) {
      # If all values are NA, return NA
      NA
    } else {
      # Get the name of the value with maximum count
      names(sort(counts, decreasing = TRUE))[1]
    }
  }) %>%
  ungroup()

# Create Tmax_summary with familyID, Genes, and most_common
Tmax_summary <- sorted_Tmax_annot %>%
  group_by(familyID) %>%
  summarise(
    Genes = n(),  # Count of genes in each familyID
    most_common = first(most_common)  # The most common value from 'most_common' column
  )

# Calculate Frequency for each familyID independently
Tmax_summary <- Tmax_summary %>%
  rowwise() %>%
  mutate(Frequency = {
    common_value <- most_common
    total_genes <- Genes
    if (is.na(common_value)) {
      NA_real_  # If most_common is NA, set Frequency as NA
    } else {
      common_count <- sum(sorted_Tmax_annot$familyID == familyID & sorted_Tmax_annot$eggNOG.free.text.desc. == common_value, na.rm = TRUE)
      (common_count / total_genes) * 100
    }
  })
############################################################################################################
# Part 4 - summarize function of un-assigned genes
############################################################################################################
OG2gene <- read.table(file="triMax_Unassigned_OG2Gene", sep = "\t")

colnames(OG2gene) <- c("familyID","Gene")
Annot <- read.delim(file = "Tmax_ReAnn.clean.annotations", sep = "\t")
colnames(Annot) <- c("Gene","seed_eggNOG_ortholog","seed_ortholog_evalue","seed_ortholog_score","best_tax_level","Preferred_name",
                     "GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","taxonomic scope","eggNOG OGs",
                     "best eggNOG OG","COG","eggNOG free text desc.")

Tmax_unasnd_annot <- merge(x=OG2gene,y=Annot,
                    by="Gene", all.x=TRUE)
sorted_Tmax_unasnd_annot <- Tmax_unasnd_annot[order(Tmax_unasnd_annot$familyID), ]
sorted_Tmax_unasnd_annot <- sorted_Tmax_unasnd_annot %>% mutate_if(is.character, na_if, "")
#write.table(sorted_Tmax_unasnd_annot, file = "Tmax_unasnd_OG_Annot.tsv", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

################### summarize function of un-assigned genes
sorted_Tmax_unasnd_annot <-read.delim(file = "Tmax_unasnd_OG_Annot.tsv", sep = "\t", header = TRUE)

Tmax_unasnd_summary <- sorted_Tmax_unasnd_annot %>%
  group_by(eggNOG.free.text.desc.) %>%
  summarise(
    Count = n()
  ) %>%
  ungroup() %>%
  arrange(desc(Count))
# Non-TE and Non-NA
Tmax_unasnd_summary_nonTE_nonNA  <- Tmax_unasnd_summary %>%
  filter(!eggNOG.free.text.desc. %in% exclude_values, !is.na(eggNOG.free.text.desc.))

write.table(Tmax_unasnd_summary, file = "Tmax_UNassigned_Annot_summary.tsv", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

############################################################################################################
# Part 5 - GO MWU enrichment
############################################################################################################
input="triMax_specific_GOs.csv"
goAnnotations="triMax_gene2GO.txt"
goDatabase="go.obo"
source("gomwu.functions.R")
go=c("MF", "CC", "BP")
for (goDivision in go){
	gomwuStats(input, goDatabase, goAnnotations, goDivision,
               perlPath="perl",
               largest=0.1,
               smallest=5,
               clusterCutHeight=0.25,
    )
}

2. Phylogenomics

# grep each OG and save to gene Id to a file
while read line
 do
 grep -w $line Tmax_Mollusca_OG2gene.tab | cut -f2 > ./SingCopyOG_14_genelist/${line}_genelist.txt
 done < Mollusca_SingCopyOG_14_OG
# Retrieve sequences
# Retrieve mollusca specific genes
for file in odb11v_cleanheader.part*.fasta.cidx
do
  cat Mollusca_gene_ID | cdbyank $file > ${file}_Mollusca_gene_seq.fasta
done
cat *_Mollusca_gene_seq.fasta > Mollusca_gene_OrthoDB.fasta
# Retrieve seqs from each OG and save to separate fasta files
for file in ./SingCopyOG_14_genelist/*_genelist.txt
do
  filename=`basename $file`
  name=${filename%_*}
  echo $name
  cat $file | cdbyank Tmax_Mollusca.fasta.cidx > ./SingCopyOG_14_seqs/${name}.fasta
done
# Alignment, Trim, Phylogeny Inference
align each gene separately,
while read name;
do mafft ${name}.fasta > $name.output
 trimal -in $name.output -out ${name}_trimmed.output -automated1
done < Mollusca_SingCopyOG_14_OG

./catfasta2phyml/catfasta2phyml.pl --concatenate --verbose --fasta *_trimmed.output > Tmax_Mollusca_SingCopyOG_14.fasta
echo "alignment done!"

# raxml
run1=all1
run2=all2
run=Tmax_Mollusca_SingCopyOG_14
raxmlHPC-PTHREADS -T 36 -p 12345 -N 20 -s Tmax_Mollusca_SingCopyOG_14.fasta -m PROTGAMMAWAG -n $run1
raxmlHPC-PTHREADS -T 36 -p 12345 -b 12345 -s Tmax_Mollusca_SingCopyOG_14.fasta -N 100 -m PROTGAMMAWAG -n $run2
raxmlHPC-PTHREADS -T 36 -f b -t RAxML_bestTree.${run1}  -z RAxML_bootstrap.${run2} -m PROTGAMMAWAG -n $run

3. Repeat Annotation

Pipeline demo with Cerastoderma edule genome

# # RepeatModeler
BuildDatabase -name cerEdu C_edule.fna

# Annotate Repeats
git clone https://github.com/darencard/GenomeAnnotation.git

./GenomeAnnotation/repclassifier -t 36 -d Mollusca -u cerEdu_families.prefix.fa.unknown \
-k cerEdu_families.prefix.fa.known -a cerEdu_families.prefix.fa.known \
-o round-1_RepbaseMollusca-Self
#986 unknowns after round 1

# round 2 with know library

./GenomeAnnotation/repclassifier -t 36 -u round-1_RepbaseMollusca-Self/round-1_RepbaseMollusca-Self.unknown \
-k round-1_RepbaseMollusca-Self/round-1_RepbaseMollusca-Self.known \
-a round-1_RepbaseMollusca-Self/round-1_RepbaseMollusca-Self.known -o round-2_Self
# 825 unknowns after round 2

# round 3 with know library

./GenomeAnnotation/repclassifier -t 36 -u round-2_Self/round-2_Self.unknown \
-k round-2_Self/round-2_Self.known \
-a round-2_Self/round-2_Self.known -o round-3_Self
# 765 unknowns after round 3

# round 4 with know library

./GenomeAnnotation/repclassifier -t 36 -u round-3_Self/round-3_Self.unknown \
-k round-3_Self/round-3_Self.known \
-a round-3_Self/round-3_Self.known -o round-4_Self
# 742 unknowns after round 4

# round 5 with know library

./GenomeAnnotation/repclassifier -t 36 -u round-4_Self/round-4_Self.unknown \
-k round-4_Self/round-4_Self.known \
-a round-4_Self/round-4_Self.known -o round-5_Self
# 736 unknowns after round 5

cat round-5_Self.unknown round-5_Self.known > C_edule_denovo_annot.fa

# round 1: annotate/mask simple repeats
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 01_simple_out -noint -xsmall C_edule.fna &

mv nohup.out logs/01_simplemask.log

# rename outputs for 01_simple_out
for file in *; do mv -v "$file" "${file/C_edule/cerEdu_simple_mask}"; done
mv cerEdu_simple_mask.fna.masked cerEdu_simple_mask.fna


# round 2: annotate/mask Tetrapoda elements sourced from Repbase using output from 1st round of RepeatMasker
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 02_mollusca_out -nolow -species Mollusca 01_simple_out/cerEdu_simple_mask.fna &

mv nohup.out logs/02_molluscaMask.log

# round 2:rename outputs
for file in *; do mv -v "$file" "${file/simple_mask/mollusca_mask}"; done
mv cerEdu_mollusca_mask.fna.masked cerEdu_mollusca_mask.masked.fasta

# round 3: annotate/mask known elements sourced from species-specific de novo repeat library using output froom 2nd round of RepeatMasker
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 03_known_out -nolow -lib round-5_Self.known \
02_mollusca_out/cerEdu_mollusca_mask.masked.fasta &

# round 3:rename outputs
for file in *; do mv -v "$file" "${file/mollusca_mask/known_mask}"; done
mv cerEdu_known_mask.masked.fasta.masked cerEdu_known_mask.masked.fasta

# round 4: annotate/mask unknown elements sourced from species-specific de novo repeat library using output from 3nd round of RepeatMasker
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/RepeatMasker -pa 48 -a -e ncbi -dir 04_unknown_out -nolow -lib round-5_Self.unknown \
03_known_out/cerEdu_known_mask.masked.fasta &
# round 4:rename outputs
for file in *; do mv -v "$file" "${file/known_mask/unknown_mask}"; done
mv cerEdu_unknown_mask.masked.fasta.masked cerEdu_unknown_mask.masked.fasta

# create directory for full results
mkdir -p 05_full_out

# combine full RepeatMasker result files - .cat.gz
cat 01_simple_out/cerEdu_simple_mask.fna.cat.gz 02_mollusca_out/cerEdu_mollusca_mask.fna.cat.gz 03_known_out/cerEdu_known_mask.masked.fasta.cat.gz 04_unknown_out/cerEdu_unknown_mask.masked.fasta.cat.gz

cat 01_simple_out/cerEdu_simple_mask.fna.cat.gz 02_mollusca_out/cerEdu_mollusca_mask.fna.cat.gz 03_known_out/cerEdu_known_mask.masked.fasta.cat.gz 04_unknown_out/cerEdu_unknown_mask.masked.fasta.cat.gz > 05_full_out/cerEdu_full_mask.cat.gz

# combine RepeatMasker tabular files for all repeats - .out
cat 01_simple_out/cerEdu_simple_mask.fna.out \
<(cat 02_mollusca_out/cerEdu_mollusca_mask.fna.out | tail -n +4) \
<(cat 03_known_out/cerEdu_known_mask.masked.fasta.out | tail -n +4) \
<(cat 04_unknown_out/cerEdu_unknown_mask.masked.fasta.out | tail -n +4) \
> 05_full_out/cerEdu.full_mask.out


# combine RepeatMasker repeat alignments for all repeats - .align
cat 01_simple_out/cerEdu_simple_mask.fna.align \
02_mollusca_out/cerEdu_mollusca_mask.fna.align \
03_known_out/cerEdu_known_mask.masked.fasta.align \
04_unknown_out/cerEdu_unknown_mask.masked.fasta.align \
> 05_full_out/cerEdu.full_mask.align

# resummarize repeat compositions from combined analysis of all RepeatMasker rounds
nohup /home/ruiqi/ruiqi_data/2023GiantClam/RepeatMaskerInstall/RepeatMasker/ProcessRepeats -a -species Mollusca 05_full_out/cerEdu_full_mask.cat.gz &

mv nohup.out logs/05_fullmask.log

# Insertion time
nohup ../../Parsing-RepeatMasker-Outputs/parseRM.pl -i ../cerEdu_full_mask.align -l 120,5 -m 0.00109 -v&

4. Circos Plot

# calculate gene density
cat Tmax_merged_longestisoform.gff | cut -f 1,4,5 > genes.bed
cut -d ' ' -f 3,6 chromosome.txt | tr ' ' '\t' > chromosome.genome
bedtools makewindows -g chromosome.genome -w 1000000 > chromosome.windows
bedtools coverage -a chromosome.windows -b genes.bed | cut -f 1-4 > genes_num.txt

# calculate repeat density
cat PO1429_Tridacna_maxima.RepeatMasked.gff | cut -f 1,4,5 > repeats.bed
bedtools coverage -a chromosome.windows -b repeats.bed | cut -f 1-4 > repeats_num.txt

# calculate GC content
bedtools nuc -fi $genome -bed chromosome.windows > gc_raw.txt

# Configuration file
karyotype = chromosome.txt
chromosomes_units=1000000
<ideogram>

<spacing>
default = 0.005r
</spacing>

radius    = 0.75r
thickness = 25p
fill      = yes

show_bands = yes
fill_bands = yes

stroke_color     = dgrey
stroke_thickness = 2p

show_label       = yes
label_font       = default
label_radius     = 1.05r
label_size       = 30
label_parallel   = yes


show_ticks          = yes
show_tick_labels    = yes

</ideogram>

show_ticks          = yes
show_tick_labels    = yes

<ticks>
radius           = 1r
color            = black
thickness        = 2p
multiplier       = 1e-6
format           = %d

<tick>
spacing        = 10u
size           = 10p
show_label = yes
label_offset = 8p
label_size = 15p
format = %d
</tick>

<tick>
spacing        = 5u
size           = 5p
show_label = no
</tick>

</ticks>

<plots>

<plot>
type    = heatmap
file    = GCrate.txt
color   = purd-9-seq
r1      = 0.85r
r0      = 0.80r
</plot>

<plot>
type      = heatmap
file = repeats_num.txt
color   = blues-9-seq
r1      = 0.91r
r0      = 0.86r
</plot>
<plot>
type      = line
thickness = 1
max_gap = 1u
file = repeats_num.txt
color   = black
r1      = 0.91r
r0      = 0.86r
</plot>

<plot>
type = heatmap
file = genes_num.txt
color = purples-9-seq
r1   = 0.99r
r0   = 0.92r
</plot>

<plot>
type = line
thickness = 1
max_gap = 1u
file = genes_num.txt
color = black
r1   = 0.99r
r0   = 0.92r
</plot>


</plots>

<image>

# Run the program
circos -conf circos.conf

5. Targeted Gene Finder

Targeted Gene Finder demo with Paradox CDX gene

Find sequence: Nucula tumidula homeobox parahox cdx (Cdx) mRNA, complete cds

# base Environment
# makedb
makeblastdb -in C_edule.fna -dbtype nucl -parse_seqids -out C_edule
makeblastdb -in Tmax_ReAnn_dna.fasta -dbtype nucl -parse_seqids -out T_maxima_Annot
makeblastdb -in GCF_902652985.1_xPecMax1.1_cds_from_genomic.fna -dbtype nucl -parse_seqids -out PecMax

## tblastn
tblastx -query Paradox_cdx.fasta \
    -db C_edule \
    -outfmt 6 -evalue 1e-5 -num_threads 30 >  Paradox_cdx_C_edule.outfmt6

tblastx -query Paradox_cdx.fasta \
    -db T_maxima_Annot \
    -outfmt 6 -evalue 1e-5 -num_threads 30 > Paradox_cdx_T_maxima_Annot.outfmt6

6. Transcriptome

One tissue each (mantle, adductor muscle, go-nads, gills, byssus, visceral mass, and kidney) from this study

Adductor muscle 2 (TM_A_N02) is from Li et al. 2020.

Mantle mRNA available 29/53/65 day at control (29.2°C; ambient (condition N) and elevated (30.7°C) (condition E) temperature

Tissue	Sample Name	Original File Names	SRR number
Adductor Muscle	TM_A_N01	Tmax_B_S4_R1_001_val_1.fq.gz, Tmax_B_S4_R2_001_val_2.fq.gz	NA
Adductor Muscle	TM_A_N02	SRR8217859_1.fastq,SRR8217859_2.fastq	SRR8217859
Adductor Muscle	TM_A_N03	SRR12486110_1.fastq, SRR12486110_2.fastq	SRR12486110
Mantle	TM_M_N01	SRR12486156_1.fastq, SRR12486156_2.fastq	SRR12486156
Mantle	TM_M_N02	SRR12486155_1.fastq, SRR12486155_2.fastq	SRR12486155
Mantle	TM_M_N03	SRR12486141_1.fastq, SRR12486141_2.fastq	SRR12486141
Mantle	TM_M_N04	SRR12486109_1.fastq, SRR12486109_2.fastq	SRR12486109
Mantle	TM_M_E01	SRR12486154_1.fastq, SRR12486154_2.fastq	SRR12486154
Mantle	TM_M_E02	SRR12486147_1.fastq, SRR12486147_2.fastq	SRR12486147
Mantle	TM_M_E03	SRR12486140_1.fastq, SRR12486140_2.fastq	SRR12486140

# Trim the reads following default setting in Trinity

trimmomatic PE -threads 20 TM_A_N01_1.fastq.gz TM_A_N01_2.fastq.gz \
TM_A_N01_1.trim.fq.gz TM_A_N01_1.untrim.fq.gz \
TM_A_N01_2.trim.fq.gz TM_A_N01_2.untrim.fq.gz \
ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/TruSeq3-PE.fa:2:30:10 SLIDINGWINDOW:4:5 LEADING:5 TRAILING:5 MINLEN:25

#Use a loop
#!/bin/bash
for file in $(ls *_1.fastq.gz)
do
  echo $file
  file="${file%%_1.*}"
  echo $file
  trimmomatic PE -threads 36 ${file}_1.fastq.gz ${file}_2.fastq.gz \
  TrimmedReads/${file}_1.trim.fq.gz Untrimmed/${file}_1.untrim.fq.gz \
  TrimmedReads/${file}_2.trim.fq.gz Untrimmed/${file}_2.untrim.fq.gz \
  ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/TruSeq3-PE.fa:2:30:10 SLIDINGWINDOW:4:5 LEADING:5 TRAILING:5 MINLEN:25
done

3. QC and Quantification

# Prepare the sample file
for file in *_1.trim.fq.gz; do
    prefix="${file%%_1.trim.fq.gz}"
    path="/home/ruiqi/ruiqi_data/GiantClamGenome/Transcriptome_GiantClam/TrimmedReads/"
    col1="${prefix:3:1}${prefix:5:1}"
    col2="${prefix:0:8}"
    col3="$path${file}"
    col4="$path${prefix}_2.trim.fq.gz"
    echo -e "$col1\t$col2\t$col3\t$col4"
done > TM_samples.txt

# env: trinity
# Use align_and_estimate_abundance.pl to quantify unigenes
nohup /home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/opt/trinity-2.8.5/util/align_and_estimate_abundance.pl \
--transcripts Tmax_ReAnn_dna.fasta \
--seqType fq \
--samples_file TM_samples.txt \
--est_method kallisto \
--prep_reference \
--kallisto_add_opts "-t 36" &

# get the abundace file list
find . -name "abundance.tsv" | sort | tee TM_abundance_files.list

# abuandance to matrix
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/bin/abundance_estimates_to_matrix.pl --est_method kallisto \
--gene_trans_map none \
--out_prefix TM_all \
--name_sample_by_basedir \
--quant_files TM_abundance_files.list

# Counting Numbers of Expressed Transcripts or Genes
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/opt/trinity-2.8.5/util/misc/count_matrix_features_given_MIN_TPM_threshold.pl \
TM_all.isoform.TPM.not_cross_norm | tee TM_all.isoform_matrix.TPM.not_cross_norm.counts_by_min_TPM

# DE
# Use TPM > 5 as cut off for DE analyses
#Retained 16119 / 46469 = 34.69% of total transcripts.
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/opt/trinity-2.8.5/util/filter_low_expr_transcripts.pl --matrix TM_all_afterQC.isoform.TPM.not_cross_norm --transcripts Tmax_ReAnn_dna.fasta --min_expr_any 5 > TPM5_Tmax_ReAnn_dna.fasta
# clean header
sed -i '/^>/ s/ .*//' TPM5_Tmax_ReAnn_dna.fasta
#get the gene list (TPM>5)
grep ">" TPM5_Tmax_ReAnn_dna.fasta | sed 's/>//' > TPM5_Tmax.geneID
#get the headers
head -1 TM_all_afterQC.isoform.counts.matrix >TPM5_TM_all.isoform.counts.matrix
# get the count matrix on the isoform list
grep -wFf TPM5_Tmax.geneID TM_all_afterQC.isoform.counts.matrix >> TPM5_TM_all.isoform.counts.matrix

## Step 1: Identifying DEs
# TPM can't be in the file name, rename it to
/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/bin/run_DE_analysis.pl \
--matrix expr5_TM_all.isoform.counts.matrix \
--samples_file TM_DE_samples.txt \
--contrasts TM_contrasts.txt \
--output . \
--method DESeq2

## Step 2: Extracting and Clustering DEs
## please note that we are using the TMM.EXPR.matrix
# filtering it first
#get the headers
head -1 TM_all_afterQC.isoform.TMM.EXPR.matrix > expr5_TM_all_afterQC.isoform.TMM.EXPR.matrix
# get the count matrix on the isoform list
grep -wFf TPM5_Tmax.geneID TM_all_afterQC.isoform.TMM.EXPR.matrix >> expr5_TM_all_afterQC.isoform.TMM.EXPR.matrix

/home/ruiqi/miniconda3/pkgs/trinity-2.8.5-h8b12597_5/bin/analyze_diff_expr.pl \
--matrix expr5_TM_all_afterQC.isoform.TMM.EXPR.matrix \
--samples TM_DE_samples.txt \
--output TM_P${Pvalue}_C${C} \
--P 5e-2 \
--C 1
#${prefix}.sampleA_vs_sampleB.voom.DE_results.P0.001_C2.sampleA-UP.subset : the expression matrix subset for features up-regulated in sampleA
# ${prefix}.sampleA_vs_sampleB.voom.DE_results.P0.001_C2.sampleB-UP.subset : the expression matrix subset for features up-regulated in sampleB

# Enrichment

setwd("/home/ruiqi/ruiqi_data/GiantClamGenome/Transcriptome_GiantClam/GO_MWU")

goDatabase="go.obo"
source("gomwu.functions.R")
###########
# Tmax
###########
goAnnotations="expr5_Tmax_gene2go.tab"
# Input
go=c("MF", "CC", "BP")
input_files=c("Tmax_MN_vs_ME_lfc.csv","Tmax_MN_vs_AN_lfc.csv")
# Calculating stats
for (goDivision in go){
  for (input in input_files){
    gomwuStats(input, goDatabase, goAnnotations, goDivision,
               perlPath="perl",
               largest=0.1,
               smallest=5,
               clusterCutHeight=0.25,
    )
  }
}

library(tidyverse)
 go=c("MF", "CC", "BP")
 input_files=c("Tmax_MN_vs_AN_lfc.csv","Tmax_MN_vs_ME_lfc.csv")
 # Calculating stats)
 for (goDivision in go){
   for (input in input_files){
     mwu_data <- read.table(paste0("MWU_",goDivision,"_",input), sep = " ", header=TRUE) %>%
       filter(p.adj<0.05)
     mwu_data$change[mwu_data$delta.rank > 0] <- "Up"
     mwu_data$change[mwu_data$delta.rank < 0] <- "Down"
     mwu_up <- mwu_data %>% filter (change == "Up") %>% select(c("term", "name"))
     write.table(mwu_up, file=paste0("./GOs_GO_MWU/GOs_UP_",substring(input, 1, 13), "_", goDivision, ".tsv"), quote = FALSE, sep="\t",row.names = FALSE, col.names = TRUE)
     mwu_down <- mwu_data %>% filter (change == "Down") %>% select(c("term", "name"))
     write.table(mwu_down, file=paste0("./GOs_GO_MWU/GOs_DOWN_",substring(input, 1, 13), "_", goDivision, ".tsv"), quote = FALSE, sep="\t",row.names = FALSE, col.names = TRUE)
   }
 }

from GO_MWU to genes

# get all the microtubule/cilium/ciliary/cilia related GO terms in the down-regulated genes in the MN
for file in GOs_DOWN_Tmax*; do
  echo $file
  grep -E 'microtubule|cilium|ciliary|cilia' $file >> cilia_DOWN_MN.txt
done

# if there are multiple GO terms in one line, split into several lines
awk -F'\t' '{
  split($1, terms, ";")
  for (i in terms) {
    print terms[i] "\t" $2
  }
}' cilia_DOWN_MN.txt > cilia_DOWN_MN_1.txt
# Keep the unique GO terms
cat cilia_DOWN_MN_1.txt | sort -u > cilia_DOWN_MN_unique.txt
# get the GO terms
cut -f1 cilia_DOWN_MN_unique.txt > cilia_DOWN_MN_GO_unique.txt



# get the list of Down-Regulated gene in MN
cat expr5_TM_all.isoform.counts.matrix.MN_vs_AN.DESeq2.DE_results.P5e-2_C1.AN-UP.subset | sed '1d' | cut -f1 > MN_vs_AN_Down_genelist

cat expr5_TM_all.isoform.counts.matrix.MN_vs_ME.DESeq2.DE_results.P5e-2_C1.ME-UP.subset | sed '1d' | cut -f1 >> MN_vs_AN_Down_genelist
# get the unique genes
sort -u  MN_vs_AN_Down_genelist > MN_vs_AN_Down_unique_genelist


###  get the list of genes that annotated with the cilia GO terms, and down regulated in MN
grep -Fwf cilia_DOWN_MN_GO_unique.txt Tmax_ReAnn.clean.annotations | grep -Fwf TM_DE_subset/MN_vs_AN_Down_unique_genelist > cilia_DOWN_MN_genes.txt

cut -f1,6,22 cilia_DOWN_MN_genes.txt cilia_DOWN_MN_genes_summary.txt

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

T_maixima_Genome_script.md

T_maixima_Genome_script.md

T. maxima Genome Script

1. Gene Family Analyses

2. Phylogenomics

3. Repeat Annotation

4. Circos Plot

5. Targeted Gene Finder

6. Transcriptome

3. QC and Quantification

from GO_MWU to genes

Files

T_maixima_Genome_script.md

Latest commit

History

T_maixima_Genome_script.md

File metadata and controls

T. maxima Genome Script

1. Gene Family Analyses

2. Phylogenomics

3. Repeat Annotation

4. Circos Plot

5. Targeted Gene Finder

6. Transcriptome

3. QC and Quantification

from GO_MWU to genes