diff --git a/Ab_epitope/.gitignore b/Ab_epitope/.gitignore new file mode 100644 index 0000000..5d81013 --- /dev/null +++ b/Ab_epitope/.gitignore @@ -0,0 +1,9 @@ +OAS_download + +data/epitope_all.fasta + +result/ + +data/dataset + +data/raw_data/OAS_memory_paired.csv diff --git a/Ab_epitope/data_clean_for_LM.sh b/Ab_epitope/data_clean_for_LM.sh index 52b756e..01220b5 100755 --- a/Ab_epitope/data_clean_for_LM.sh +++ b/Ab_epitope/data_clean_for_LM.sh @@ -1,25 +1,40 @@ #!/bin/bash -python mBLM/script/merge_raw_data.py -python mBLM/script/df2fasta.py mBLM/result/memory_paired_Abs.csv mBLM/result/memory_paired_Abs.fasta - # clustering output_prefix=mBLM/result/cluster/ f=mBLM/result/memory_paired_Abs.fasta name=memory_paired_Abs + +echo "Checking if $output_prefix exists" +if [ ! -d "$output_prefix" ]; then + echo "Creating output directory" + mkdir -p "$output_prefix" +fi + +python mBLM/script/merge_raw_data.py +python mBLM/script/df2fasta.py mBLM/result/memory_paired_Abs.csv mBLM/result/memory_paired_Abs.fasta + + for x in 0.5 0.6 do - cd-hit -i $f -o $output_prefix$name$x -c $x -M 32000 -d 0 -T 32 -n 3 -aL 0.8 -s 0.95 -uS 0.2 -sc 1 -sf 1 + cd-hit -i $f -o $output_prefix$name$x -c $x -M 32000 -d 0 -T 32 -n 3 -aL 0.8 -s 0.95 -uS 0.2 -sc 1 -sf 1 done for x in 0.7 0.8 0.9 0.95 do - cd-hit -i $f -o $output_prefix$name$x -c $x -M 32000 -d 0 -T 32 -n 5 -aL 0.8 -s 0.95 -uS 0.2 -sc 1 -sf 1 + cd-hit -i $f -o $output_prefix$name$x -c $x -M 32000 -d 0 -T 32 -n 5 -aL 0.8 -s 0.95 -uS 0.2 -sc 1 -sf 1 done # add cluster id to dataset - python mBLM/script/add_clstr2df.py -i mBLM/result/memory_paired_Abs.csv -o mBLM/result/memory_paired_Abs_final.csv + +datasetdir=data/dataset +echo "Checking if $datasetdir exists" +if [ ! -d "$datasetdir" ]; then + echo "Creating output directory" + mkdir -p "$datasetdir" +fi + # split dataset python mBLM/script/split_dataset.py -i mBLM/result/memory_paired_Abs_final.csv -o data/dataset/memory_paired_Abs diff --git a/Ab_epitope/data_clean_for_epitope.sh b/Ab_epitope/data_clean_for_epitope.sh index 2b55088..1d7904b 100644 --- a/Ab_epitope/data_clean_for_epitope.sh +++ b/Ab_epitope/data_clean_for_epitope.sh @@ -10,9 +10,16 @@ fasta_prefix=./data/ output_prefix=./result/cluster/ f=epitope_all.fasta name=`basename $f ".fasta"` + +echo "Checking if $output_prefix exists" +if [ ! -d "$output_prefix" ]; then + echo "Creating output directory" + mkdir -p "$output_prefix" +fi + for x in 0.8 0.9 0.92 0.95 0.98 0.99 0.995 do - cd-hit -i $f -o $output_prefix$name$x$fasta_suffix -c $x -M 32000 -d 0 -T 8 -n 5 -aL 0.8 -s 0.95 -uS 0.2 -sc 1 -sf 1 + cd-hit -i $fasta_prefix$f -o $output_prefix$name$x$fasta_suffix -c $x -M 32000 -d 0 -T 8 -n 5 -aL 0.8 -s 0.95 -uS 0.2 -sc 1 -sf 1 done @@ -23,4 +30,4 @@ python ./script/add_clstr2df.py # split dataset python ./script/split_dataset.py # remove repeat fasta sequence -pytho script/rm_fas_repeats.py result/epitope_clean.fasta result/epitope_clean_v2.fasta \ No newline at end of file +python script/rm_fas_repeats.py result/epitope_clean.fasta result/epitope_clean_v2.fasta \ No newline at end of file