Skip to content

Commit

Permalink
homework4
Browse files Browse the repository at this point in the history
  • Loading branch information
Huangy57 committed Nov 10, 2023
1 parent 983fb29 commit 8ecb7bc
Showing 1 changed file with 190 additions and 10 deletions.
200 changes: 190 additions & 10 deletions homeworks/homework04/homework04.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,103 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'AACCGTGACCAGGAAG': 70, 'TTATCGTCTCCCATAT': 77, 'CATACCAGTCATCCCT': 28, 'ACTTACGTATAAGTCA': 53, 'GCTACTACTATACCAT': 119, 'GTTACCCACAGTCCGC': 62, 'CACCACACAAGGATGT': 41, 'TCATACATCACACTTA': 47, 'GTACCCTCCGTGAATC': 99, 'ACTCCACGCTACCACG': 31, 'GCACTCCTCAACCCTT': 48, 'CCGCTCCCTGCTGTCC': 43, 'AAACGTAGCGATAACT': 61, 'TCACGTCCCATATTAC': 9, 'CCCGACCCGACATTAA': 155, 'ACGAGAGGTCGACTCG': 60, 'TTCGACTTCCTAGTAC': 86, 'ACCCAGTCTAGCTAAC': 70, 'GGTCATACGCCTTCGC': 16, 'CTTAACCTTCCGACAA': 55, 'TGGGCAATAAATGTAG': 57, 'CCCTCATCCTGTGTCA': 54, 'AATTCCATTCAGGCTG': 35, 'TTCTAGCCTTATCTCC': 81, 'AGAATAATCTCAAACT': 62, 'AAACAAACAAGTCTGT': 50, 'AATACGAACATCTCGG': 49, 'CGAATCTGCGCAATCT': 60, 'GATTTCCGATCAGTCT': 124, 'AGAACGTCTTGATAGC': 30, 'GACCGGTGCTTCAACA': 76, 'TGCTGACAACACGTAA': 47, 'ATAGTCGGGCCTACCG': 21, 'TTCCCAATGGTACTCG': 28, 'CTGGCAAGAAAATTCC': 43, 'TTCGGTGAACGTACAC': 76, 'CTCCATCACTCGCCAA': 115, 'TCAACCCCACGCCGCT': 24, 'TAACTAGCGTTTTCAA': 1, 'AGTACTCATCGAGCAA': 31, 'AACCGTCACCTCAACC': 104, 'GAACGACCCTTAACTA': 18, 'TCCAGAAACTTCAATG': 71, 'ACACGATACCCTTGAG': 26, 'CCAGCCATCCCCTCTT': 76, 'ATTCACACTCCTGCCA': 22, 'TACTTATTAGCATGTC': 46, 'GGAAATTATACTGCCA': 10, 'GCCCCACAATAACCGC': 38, 'GCACGCGATCTGCGAC': 32, 'AGGCACTAACCCAGCA': 37, 'ACTTCATACCATTACA': 32, 'CCCACCTTCACACCTA': 62, 'ACGAACGTCAAAAAAA': 5, 'GTCGCATATGAAGACG': 40, 'ACAAAACCCCCCGACA': 27, 'TCCGTATATCCAAAAG': 13, 'GAGCGTCCCTTCTAAA': 34, 'TCGCTCAGTCGTCTTT': 38, 'GTTACCCACAGTCGGC': 1, 'GAAATCCCTAACGGGC': 17, 'ATTTACGGGCTAGTTG': 7, 'AGACCGTCAGGCACCA': 46, 'TTCCCTAGAGACTACC': 37, 'TGACAACCACGTCCCC': 29, 'ATACTCTAAAAAATGA': 56, 'CACAACACGTACGCAA': 39, 'TGCACACACACCACGT': 38, 'CCCAGATCCCACCCGC': 12, 'GAGTCACAACCCTGCT': 1, 'ACTGATCAAAGCGTGG': 34, 'ACCGACTCACAGTACT': 52, 'TCGGGCGACTTAAATG': 20, 'GGTTCCGTGTGTGGAC': 40, 'GATCTCCCAAGTTGCC': 28, 'CGAGGCGGAAGGCGTC': 18, 'CGGAACGCCATCACCG': 13, 'TGATTTACCTAACCCG': 47, 'CTGGGTCCCCCCCAAA': 25, 'AAATGAGACAGCCTAC': 45, 'GAGTCACAACCCGGCT': 63, 'TCCCCTCACGGATTCA': 43, 'GTCAAGCGTCTAACCA': 23, 'CGCGTTAAGATCACCA': 20, 'CCCAGACCGTGAGCCA': 25, 'CAGTGATTTATTTGCG': 30, 'CGTCATAGAGACGGTT': 68, 'TGCAATACCAATAACT': 32, 'CTTAGTATTTCAGTAT': 81, 'GCTGCACCAGCTCCTC': 14, 'GTCGCATATGAAGACT': 4, 'ACACCGGTTCCTCCCC': 19, 'CGATCCCGTCCAGACA': 47, 'ACGGATAATTACCGCC': 31, 'CTTAACCGTCCGACAC': 1, 'GGAACACTGTGAAGCA': 40, 'CGAATCTTCGCAATCT': 1, 'ATTACCCCCATTTAAG': 11, 'GACCTCTCAGATTAGA': 42, 'CCAATCCCCTCGAGTG': 24, 'GGTTTTCTAACACCGC': 8, 'GTGAACTTACAGCTCC': 28, 'CCCCTATCATACTCTC': 5, 'AGGGTAGTCGACTACT': 15, 'GCGAAATGCAATCCTC': 26, 'ATAGTACCGCTCGCAA': 1, 'TCTTAACAGCGTCAGC': 17, 'CCGTACCCAAGACCAT': 12, 'GCGTATAAGTACGCCG': 50, 'GGGTCCACGTCCCAAG': 6, 'CGGCCCCAACGCAATA': 23, 'TTATACCTTAGTCAGT': 40, 'GGGCGTCCCTTCTAAA': 1, 'CCCCCCACTACAATTA': 38, 'GAGCACATTTATTCGT': 1, 'ATTGCCCGGTGGCCCT': 39, 'ATTGCATAACTACAAC': 73, 'AATTCCATTCGGGCTG': 1, 'GACCCCTACTTAGTGG': 23, 'TGCCCTGAGCCTGCAC': 8, 'ACGGAAACCCAGGCCA': 36, 'TCTGACTAGCACATCG': 18, 'CATCATAGAGACGGTT': 1, 'ACTTCATACCTTTACA': 1, 'GCTGCACCGGCTCCTC': 1, 'AATAAGAACATCTCGG': 1, 'CCTTTGCTTCGTAATA': 15, 'AAGCTTTCGTCGCGCA': 4, 'TGTCTAAATGTCCACG': 2, 'AGAGTAGTCGACTACT': 1, 'CCTGCATTCTGGATGC': 3, 'CGACGCAGCACCCCAT': 15, 'GACAGAGCTAGCACAG': 18, 'CTTAACCGTCCGACAA': 1, 'CGGTGCTACCTTCTAT': 1, 'GACCGGGGCTTCAACA': 1, 'GTTTCATCCCCCTATC': 18, 'CGGAATCAACCCCCTC': 10, 'TTAGAAGAAATCAAAT': 7, 'CATCATTCTTACTAAA': 32, 'CCCTGACCTAACCACG': 1, 'TCAGTATATCCAAAAG': 2, 'AGTAAATTCCCGGACG': 5, 'CGCACAATACTTCGAC': 35, 'ACAAAACCCCCCGATA': 1, 'TGAGTTTGCAACGAAC': 6, 'TTCGGTGAACGAACAC': 1, 'TCAACCCCACGCCTCT': 1, 'GACCACATTTATTCGT': 45, 'ACATAAATTAAAGGAA': 7, 'AATACGAACATCTAGG': 2, 'TCCACCCACTCGCGAT': 3, 'GCCGCAAGGAACGCTT': 14, 'TCCTCTAGCATCCCCA': 15, 'TCCAGAAACTTGAATG': 1, 'TCGGACGGTATACTGC': 9, 'TAAAAACGTTATAACA': 27, 'CCACCGTGGCAGCCCG': 24, 'CACCGTAATGTCCTAC': 7, 'TCCACCCACTCGAGAT': 1, 'AGGCACTAACCAAGCA': 1, 'TCGGGGGACTTAAATG': 1, 'AAACTTAATAATTTCC': 1, 'GGGTATAAGTACGCCG': 2, 'ATACCCTCCGTGAATC': 1, 'AAAGGTAGCGATAACT': 1, 'TAATACATCACACTTA': 1, 'ACTGATCAAAGCGTGT': 1, 'CATCACGCCCGTTAGC': 18, 'ACTCGTTGTCCCATAA': 18, 'GGTGATACGCCTTCGC': 1, 'AGAATCTGCGCAATCT': 1, 'CCTTTGCTTCGTAACA': 1, 'AAACTTACTAATTTCC': 6, 'CGAATCTGCGCAAACT': 1, 'ATTTCATAACTACAAC': 2, 'AGAGGCGGAAGGCGTC': 1, 'TTGGCCTGCCCTAAAT': 2, 'CGGCCCCAACGCAATG': 1, 'CGTCGGCCTGTTCATA': 3, 'TGAATTACCTAACCCG': 1, 'CATCACGCCCCTTAGC': 1, 'CTCGACCCGAGGTTAA': 1, 'TTTGTCCCTTGAACCC': 1, 'ACGGATAATTACCGGC': 1, 'TGGAATACCAATAACT': 1, 'ATGGGTCCCCCCCAAA': 1, 'AATACGAGCATCTCGG': 1, 'CGTCATAGAGAAGGTT': 1, 'ACGAGAGGCCGACTCG': 1, 'TTACTCTAAAAAATGA': 1, 'CGAATCTGCGCAATCC': 1, 'TGGGCAATAAACGTAG': 1, 'TTATACCTTAGTCTGT': 1, 'TCTTAACAGCATCAGC': 1, 'TTCGACTTCCTAATAC': 1, 'TAATATATGGATATTC': 13, 'CACACCAGTCATCCCT': 1, 'GTTAGTATTTCAGTAT': 1, 'GTCTCCCATCATGTAG': 2, 'AACCGTCACCTCTACC': 1, 'ACCAAGTCTAGCTAAC': 1, 'AACTGTGACCAGGAAG': 1, 'ATAGTCGGGACTACCG': 1, 'GCGGCAGAGCTGCTTT': 6, 'AAATGAGACAGACTAC': 1, 'GACTTACGGCGTCATG': 2, 'GTACACTCCGTGAATC': 1, 'ACACTCCTCAACCCTT': 1, 'TGGGAAATAAATGTAG': 1, 'AAAAAACCCCCCGACA': 1, 'CATACCTTTCTGACTT': 3, 'CCACCGTGTCAGCCCG': 1, 'TTCGAATTCCTAGTAC': 1, 'GATCTCTCAGATTAGA': 1, 'GCCCCACAATAACCAC': 1, 'TGGGCAAGAAATGTAG': 1, 'CACAACACGTACGTAA': 1, 'GCACGCGATCTCCGAC': 1, 'AACCGTCACCTCAAAC': 1, 'AAACAAACAAGTGTGT': 1, 'TATCATTCTTACTAAA': 1, 'TGACAACCACATCCCC': 1, 'ATGGAAACCCAGGCCA': 1, 'ATTGAATAACTACAAC': 1, 'AGGGGCACGAGTCGCC': 2, 'TCAAGAAACTTCAATG': 1, 'CTGGGTCGCCCCCAAA': 1, 'ATCGACTCACAGTACT': 1, 'TCCGTATAACCAAAAG': 1, 'GGTCATACGCCTCCGC': 1, 'ACAGGCAGCTACGGGA': 3, 'AGAATAATCTCAGACT': 1, 'TTATCGTCTCCCATAC': 1, 'CTTAGTAGTTCAGTAT': 1, 'TTCGGTGAAAGTACAC': 1, 'GACCACATTTATCCGT': 1, 'CCGCTCCCTCCTGTCC': 1, 'AAACGTGACCAGGAAG': 1, 'GACACTCCAGTCGACA': 1, 'CCACCGTGGGAGCCCG': 1, 'ACCGACTGACAGTACT': 1, 'TGCTGACAACGCGTAA': 1, 'GCTGCACCATCTCCTC': 1, 'ACCAGACCGTGAGCCA': 1, 'GGAAATTATACTCCCA': 1, 'TGCAATACCAATAACG': 1, 'TATTTCCGATCAGTCT': 1, 'GATATAGCTAGCACAG': 1, 'GTTTCATACCCCTATC': 1, 'GTACCCTACGTGAATC': 1, 'GTGAACTTAAAGCTCC': 1, 'GTCCCATATGAAGACG': 1, 'AGTACTCATCGTGCAA': 1, 'CCCAGACCGTGAGACA': 1, 'CTCCTTCACTCGCCAA': 1, 'CGAATCTGCGCAATAT': 1, 'GCGAAATGCAATCCCC': 1, 'CCCACCTTCACAACTA': 1, 'AGCACACCCAGCTCGA': 1, 'AGACCGTCAGACACCA': 1, 'CGAGGCGGAAGGGGTC': 1, 'GTACCATCCGTGAATC': 1, 'CAAGCCATCCCCTCTT': 1, 'TTCCCTAGAGACTAAC': 1, 'TCTTAACATCGTCAGC': 1, 'CGAATCTGAGCAATCT': 1, 'ACCGACCAGACATTAA': 1, 'ATTCACACTACTGCCA': 1, 'CCGCTCCCTGCTGTAC': 1, 'TACGGTGAACGTACAC': 1, 'CCCTACCCGACATTAA': 1, 'TGACAAACACGTCCCC': 1, 'AGGCCCCAACGCAATA': 1, 'ATAGTCGAGCCTACCG': 1, 'ACCGACGCACAGTACT': 1, 'GCACGCGATTTGCGAC': 1, 'ATTGCATGACTACAAC': 1, 'AATACGAACATATCGG': 1, 'GCTACTACTATACCTT': 1}\n",
"{'ACTAAATAACTTAAGC': 62, 'CTACCCGTTTCCCAAC': 118, 'CGTCTTCCATCCCCAT': 133, 'GCGAGTCCACCAATTG': 93, 'ATATAAATAAGCTATG': 51, 'TCTTTTCTTATTGAGG': 73, 'ATAAACGTTCCCCAAG': 107, 'ACCCGTGAACCAAAGC': 1, 'AATCGTACATCCCCAC': 123, 'CCACCAGCCATACGGC': 93, 'TGGACTCCACACCCCG': 76, 'TCAAGAAGCCTTGGAG': 150, 'TTTCTCCCGCATCTCG': 105, 'AATCAATACGACCTGT': 74, 'TGACAATACCCTACTC': 46, 'TTTCTCCCCCATCTCG': 1, 'CATTCACTACTCCCAA': 92, 'TCACCAGTGATCCCTT': 65, 'ACCACCGCCACACCCT': 44, 'GCTGCATTCCTGTGGC': 72, 'GTTCAGGACACACTTA': 70, 'CCAGCCTCACACTAGC': 107, 'ACGCGTGAACCAAAGC': 102, 'ACCAGTTCTCCCCGGG': 152, 'TCACGGCCACGAGTAA': 65, 'TAACACTGATGGCAAC': 76, 'GCATGTTATAAACTCC': 45, 'TCACCCCGACCAGCAG': 33, 'TTCCACCTCACCTTCT': 95, 'GATAGTCGAACACATG': 85, 'TGATGTAATGACAGGC': 74, 'GCACATAGGGAGCCAA': 58, 'TCTGGCAGGCCTCGCT': 122, 'GAACCCTAGAATCCCC': 69, 'CTAATTCAATCCTACG': 99, 'CTTACCATATTACAAA': 105, 'TGTACAGCAAAAGTAG': 69, 'TGACATCGCGAGACGG': 72, 'CCATGTCTTGCTCGCC': 24, 'TCAAGCCTGCCACACG': 39, 'AAATTTCTTCCCACAC': 16, 'GCACTTACATATATGA': 69, 'GCAAGAAGCCTTGGAG': 1, 'TGACGATCCTCAAGAA': 141, 'ACGAAACTCATATAAA': 72, 'ACTGCCATCCCACACC': 49, 'CGCAGCCGGACTCACA': 70, 'TGACATCACGAGACGG': 1, 'GAACCCTAGAAACCCC': 2, 'CACACACCGTGACAGG': 88, 'CTCACTTATTGTTCTG': 48, 'CGTCTTCCATCCACAT': 1, 'CTCTCAGTTTACGAAA': 18, 'ACTCGATTACACGTGC': 83, 'GCATTTTATAAACTCC': 1, 'ACGCGTGAACCAAAGG': 1, 'TCACCCCGACCAGAAG': 1, 'TATCGTACATCCCCAC': 1, 'CGTCTTCCATCCCCAC': 9, 'ACCAGTGCTCCCCGGT': 1, 'GTTCAGTACACACTTA': 1, 'ACGCGTGAACAAAAGC': 2, 'ATAAACGTTCCCCAAT': 1, 'TCACCAGTTATCCCTT': 1, 'GTTCAGGAGACACTTA': 1, 'GCGAGTCAACCAATTG': 1, 'ACGAAAGTCATATAAA': 1, 'TTTCTCCCGCATCTCC': 1, 'GATAGGCGAACACATG': 1, 'TGTACAACAAAAGTAG': 1, 'AAGAAACTCATATAAA': 1, 'CGTCTTCCATCCGCAT': 1, 'ACCAGTTCTCCCCTGG': 2, 'TGTGCAGCAAAAGTAG': 1, 'GAACCCTAGAAGCCCC': 1, 'TCAAGAAGCCTTGGTG': 1, 'GTTCAGGATACACTTA': 1, 'ACACTCACGACCCAAA': 16, 'TGTACAGCAAGAGTAG': 1, 'TAACACTGATGGAAAC': 2, 'ACATGTCTTGCTCGCC': 1, 'CGTCTTCCATCCCAAT': 1, 'TAACACTTATGGCAAC': 1, 'ACCAGTTCTCCCCGGA': 2, 'ACTAAAGAACTTAAGC': 1, 'TGACATAGCGAGACGG': 1, 'AATCAATACCACCTGT': 1, 'ACCAGTTCTCCCAGGG': 1, 'ACGCGTGAACCAAATC': 1, 'ATATAAATAATCTATG': 1, 'ACCAGTGCTCCCCGGG': 1, 'TGACATCTCGAGACGG': 1, 'ACTCGATTACGCGTGC': 1, 'GATAGTTGAACACATG': 1, 'TGACTATCCTCAAGAA': 1, 'ACTCAATTACACGTGC': 1, 'TGACAATACCCTACCC': 1, 'TCTGGCAGGACTCGCT': 1, 'TCACGGCCACGATTAA': 1, 'AATAGTACATCCCCAC': 1, 'ATAAACGTTCCCCAAA': 1, 'GCTGCATTCCTGTGGA': 1, 'GGACTTACATATATGA': 1, 'CTTACCATATTAAAAA': 1, 'AATCAATACGACCTTT': 1, 'ATCACTTATTGTTCTG': 1, 'CGTCTTCCATCCTCAT': 1, 'GTTCACGACACACTTA': 1, 'TGACGAGCCTCAAGAA': 1, 'AATCGTACATCCCCGC': 1, 'GCATGTTATAAACTCA': 1, 'CACACACCTTGACAGG': 1, 'AATCGTACATCAACAC': 1, 'GCGACTCCACCAATTG': 1, 'CATTCACTAATCCCAA': 1, 'CTAATTCAATCATACG': 1, 'ATATAAAGAAGCTATG': 1, 'ACCACCGCCACACCCC': 1, 'GATAGTCGAGCACATG': 1, 'GCACATAGGGAGCAAA': 1, 'CATTCACTACTCCCAT': 1, 'TCACGTCCACGAGTAA': 1, 'TCTCCAGTGATCCCTT': 1, 'ACACTCACCACCCAAA': 1, 'TAGACTCCACACCCCG': 1, 'GCTGCATGCCTGTGGC': 1, 'GCAGCCTCACACTAGC': 1, 'ACTGCCATCCCACTCC': 1, 'ATATAAATAAGCTATA': 1, 'GTAAACGTTCCCCAAG': 1, 'TCAAGAAGGCTTGGAG': 1, 'CTTACCATATTACAAT': 1, 'TCAAGCCTGCCATACG': 1, 'CATTAACTACTCCCAA': 1, 'GCACATAAGGAGCCAA': 1}\n",
"\n",
"\n",
"There 3907 sequence map to NA\n",
"There 5245 sequence map to HA\n"
]
}
],
"source": [
"# your code here..."
"import re\n",
"import Bio.SeqIO\n",
"\n",
"# Read the sequencing reads\n",
"seqreads = list(Bio.SeqIO.parse('barcodes_R1.fastq', format = 'fastq'))\n",
"\n",
"# Create a list of sequence from seqread\n",
"seqreads_l = []\n",
"for seqrecord in seqreads: \n",
" sequence = str(seqrecord.seq)\n",
" seqreads_l.append(sequence)\n",
" \n",
"# Define a function that read the reverse_complement of the sequence\n",
"def reverse_complement(seq, unk_partner = 'N'):\n",
" base_partner = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}\n",
" rseq = ''\n",
" for a in seq:\n",
" if a in base_partner:\n",
" # look up the complementary base in the dictionary\n",
" pair = base_partner[a]\n",
" rseq = pair + rseq\n",
" else:\n",
" rseq = unk_partner + rseq\n",
" return rseq\n",
"\n",
"# Define each arguments: upstream, bclen, end of HA, and end of NA \n",
"upstream ='AGGCGGCCGC'\n",
"end_of_ha = 'CCGGATTTGCATATAATGATGCACCAT'\n",
"end_of_na = 'CACGATAGATAAATAATAGTGCACCAT'\n",
"bclen = 16\n",
"\n",
"def read_barcode_pattern(seqread, bclen, end_of_na, end_of_ha, upstream):\n",
" \n",
" # compile the barcode search pattern\n",
" barcode_pattern_na = re.compile(end_of_na + upstream + f\"(?P<barcode>[ATCG]{{{bclen}}})$\")\n",
" barcode_pattern_ha = re.compile(end_of_ha + upstream + f\"(?P<barcode>[ATCG]{{{bclen}}})$\") \n",
" # make an empty dictionary to count na\n",
" barcode_na_dic = {}\n",
" # make an empty dictionary to count ha \n",
" barcode_ha_dic = {}\n",
" for seq in seqread:\n",
" seq = seq.upper()\n",
" # get the reverse complement of the read\n",
" reverse = reverse_complement(seq)\n",
" # search for the barcode pattern \n",
" match_na = barcode_pattern_na.search(reverse)\n",
" match_ha = barcode_pattern_ha.search(reverse)\n",
"\n",
" if match_na: #check if there is a match for na pattern\n",
" barcode_na = match_na.group('barcode')\n",
" # add barcode in dictionary and count number of na pattern\n",
" if barcode_na in barcode_na_dic:\n",
" barcode_na_dic[barcode_na] += 1\n",
" else: \n",
" barcode_na_dic[barcode_na] = 1\n",
" elif match_ha: #check if there is a match for ha pattern\n",
" barcode_ha = match_ha.group('barcode')\n",
" # add barcode in dictionary and count number of ha pattern \n",
" if barcode_ha in barcode_ha_dic:\n",
" barcode_ha_dic[barcode_ha] += 1\n",
" else:\n",
" barcode_ha_dic[barcode_ha] = 1\n",
" \n",
" return barcode_na_dic, barcode_ha_dic\n",
"\n",
"barcode_na, barcode_ha = read_barcode_pattern(seqreads_l, bclen, end_of_na, end_of_ha, upstream) \n",
"print(barcode_ha)\n",
"print(barcode_na)\n",
"\n",
"def count_barcode (barcode_dic):\n",
" count = 0\n",
" for key, value in barcode_dic.items():\n",
" count += value \n",
" return count\n",
"\n",
"na_barcode_count = count_barcode(barcode_na)\n",
"ha_barcode_count = count_barcode(barcode_ha)\n",
"print('\\n')\n",
"print(f'There {na_barcode_count} sequence map to NA')\n",
"print(f'There {ha_barcode_count} sequence map to HA')"
]
},
{
Expand All @@ -81,11 +173,73 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are total 4122 NA sequence\n",
"There are total 5409 HA sequence\n",
"\n",
"\n",
"There 215 NA sequence that did not have a valid barcode\n",
"There 164 HA sequence that did not have a valid barcode\n"
]
}
],
"source": [
"def read_seq(seqread, bclen, end_of_na, end_of_ha, upstream):\n",
" # compile the seq search pattern\n",
" pattern_na = re.compile(end_of_na)\n",
" pattern_ha = re.compile(end_of_ha)\n",
" # counters for na and ha\n",
" na_count = 0\n",
" ha_count = 0\n",
"\n",
" for seq in seqread:\n",
" seq = seq.upper()\n",
" # get the reverse complement of the read\n",
" reverse = reverse_complement(seq)\n",
" # search for the patterns\n",
" match_na = pattern_na.search(reverse)\n",
" match_ha = pattern_ha.search(reverse)\n",
"\n",
" # check for matches and increment the respective counters\n",
" if match_na:\n",
" na_count += 1\n",
" elif match_ha:\n",
" ha_count += 1\n",
" \n",
" return na_count, ha_count\n",
"\n",
"na_count, ha_count = read_seq(seqreads_l, bclen, end_of_na, end_of_ha, upstream) \n",
"print(f'There are total {na_count} NA sequence')\n",
"print(f'There are total {ha_count} HA sequence')\n",
"print('\\n')\n",
"print(f'There {na_count - na_barcode_count} NA sequence that did not have a valid barcode')\n",
"print(f'There {ha_count - ha_barcode_count} HA sequence that did not have a valid barcode')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"164"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# your code here..."
"5409 - 5245"
]
},
{
Expand All @@ -100,11 +254,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"most frequence HA barcode is: ['CCCGACCCGACATTAA']\n",
"counts of the most freqeunt HA barcode is: 155\n",
"most frequent NA barcode is: ['ACCAGTTCTCCCCGGG']\n",
"counts of the most frequent HA barcode is: 152\n"
]
}
],
"source": [
"# your code here..."
"# HA barcode with the most counts\n",
"# maximum value in dictionary expression found in: https://datagy.io/python-get-dictionary-key-with-max-value/\n",
"mf_ha_barcode = [key for key, value in barcode_ha.items() if value == max(barcode_ha.values())]\n",
"print('most frequence HA barcode is:', mf_ha_barcode)\n",
"\n",
"# Number of most frequent HA barcode counts\n",
"mf_ha_barcode_count = max(barcode_ha.values())\n",
"print('counts of the most freqeunt HA barcode is:', mf_ha_barcode_count)\n",
"\n",
"# NA barcode with the most counts\n",
"mf_na_barcode = [key for key, value in barcode_na.items() if value == max(barcode_na.values())]\n",
"print('most frequent NA barcode is:', mf_na_barcode)\n",
"\n",
"# Number of most frequent NA barcode counts\n",
"mf_na_barcode_count = max(barcode_na.values())\n",
"print('counts of the most frequent HA barcode is:', mf_na_barcode_count)"
]
}
],
Expand All @@ -124,7 +304,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.9.0"
},
"toc": {
"nav_menu": {},
Expand Down

0 comments on commit 8ecb7bc

Please sign in to comment.